64 commits
bbb8610
rename script
bringlein Jul 16, 2025
d53287a
getting flashinfer installed in container
bringlein Jul 16, 2025
720dc2a
getting offline script to run
bringlein Jul 16, 2025
7e8c19c
triton dejavu for granite 4
bringlein Jul 17, 2025
0223476
adding cache to moe config script
bringlein Jul 18, 2025
75f3877
moving fmwork to third_party
bringlein Jul 18, 2025
da56c2e
benchmark moe fallback heuristics
bringlein Jul 18, 2025
fd8531c
implemented fused moe benchmark
bringlein Jul 18, 2025
dbddbb1
first tuning result
bringlein Jul 18, 2025
e324ca4
getting moe benchmark to run
bringlein Jul 18, 2025
b7161b7
tuning remaining parts...log lost due to restart
bringlein Jul 21, 2025
8c7acf4
benchmarking fused_moe
bringlein Jul 22, 2025
ba3ef68
setup ttft sweeps
bringlein Jul 23, 2025
2e129b8
first tuning night
bringlein Jul 24, 2025
b37a420
adding tune log for analysis
bringlein Jul 24, 2025
5a31ca4
testing for longer sequences
bringlein Jul 24, 2025
fd1f12e
adding cache for baseline
bringlein Jul 24, 2025
600f5d8
measuring full cudagraphs, adding tuned fp8 moe
bringlein Jul 28, 2025
408b462
itl tuning
bringlein Jul 28, 2025
8b0994f
improving benchmark latency script
bringlein Jul 29, 2025
023a5a5
adding simple heuristics to unit tests
bringlein Jul 29, 2025
322eae9
making path relative
bringlein Jul 29, 2025
bbce518
also looking at decode
bringlein Jul 29, 2025
cc9327d
Adding kernels with flexible tiles
bringlein Jul 29, 2025
ad50b39
updating dejavu version
bringlein Jul 29, 2025
80e9ea0
updating serving range script
bringlein Jul 29, 2025
0b64697
preparing cuda graph capture
bringlein Jul 30, 2025
a2f635e
some fixes
bringlein Jul 30, 2025
e31fc91
fixing flash_attn alternating
bringlein Jul 30, 2025
0dda12d
fix sweep script
bringlein Jul 31, 2025
dc8af9d
adding random range serve bench script
bringlein Jul 31, 2025
08a967e
making benchmark random range faster
bringlein Aug 4, 2025
ded193b
quantize g4 script
bringlein Aug 5, 2025
e5f50d5
getting fp8 dynamic to work
bringlein Aug 8, 2025
9f5e543
micro benchmark for vllm full cuda graph mode
bringlein Aug 8, 2025
af5befc
starting ws experiments
bringlein Aug 12, 2025
b2c94f1
another schema for fp8 tuning
bringlein Aug 12, 2025
b510a5e
first ws mb
bringlein Aug 13, 2025
0128071
fix tuning error
bringlein Aug 13, 2025
836257c
making it compatible with cuda graphs
bringlein Aug 13, 2025
c1df8c6
adding kernels with static launch grid
bringlein Aug 13, 2025
cce415a
further ws tuning for simple kernel; add tuning for new grid kernels
bringlein Aug 14, 2025
e221205
tuning grid w/o ws
bringlein Aug 16, 2025
19a73a8
grid tuning with ws, preparation
bringlein Aug 16, 2025
16d64d8
tuning forcing ws (partially failing)
bringlein Aug 16, 2025
2057fa4
allowing ws or not
bringlein Aug 16, 2025
ffe0346
tuning with wrong ws config
bringlein Aug 16, 2025
d757651
autotuning with the right bohb spaces
bringlein Aug 16, 2025
ebc7a41
preparing to run without bo
bringlein Aug 16, 2025
41120d7
switching to random search
bringlein Aug 17, 2025
adc8ccf
switching back to bo
bringlein Aug 18, 2025
7eb1410
another run without ws
bringlein Aug 18, 2025
64d6d33
preparing tuning on MI300
bringlein Aug 18, 2025
f0713ca
Merge branch 'ngl_granite4_cudagraph' of github.com:foundation-model-…
bringlein Aug 18, 2025
c68dd28
tuning 30min on H100
bringlein Aug 19, 2025
429ddd5
tuning 30min MI300
bringlein Aug 19, 2025
211e655
Merge branch 'ngl_granite4_cudagraph' of github.com:foundation-model-…
bringlein Aug 19, 2025
40f0dfb
baseline experiments
bringlein Aug 20, 2025
45056f5
last paper experiments
bringlein Sep 1, 2025
5be31b1
updating fmwork pointer
bringlein Sep 1, 2025
0eea195
some cleanup
bringlein Sep 3, 2025
b05e851
some cleanup
bringlein Sep 3, 2025
3713282
Merge branch 'ngl_pr_2025-09-03' of github.com:foundation-model-stack…
bringlein Sep 3, 2025
f3f623e
further cleanup
bringlein Sep 3, 2025
2 changes: 2 additions & 0 deletions .gitignore
@@ -12,3 +12,5 @@ ShareGPT_V3_unfiltered_cleaned_split.json

.vscode/settings.json

ibm-triton-lib/ibm_triton_lib.egg-info/

3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
[submodule "vllm"]
path = vllm
url = https://github.com/vllm-project/vllm.git
[submodule "third_party/fmwork"]
path = third_party/fmwork
url = git@github.com:bringlein/fmwork.git
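
Note that the new fmwork submodule points at an SSH remote, so checkouts need access to bringlein/fmwork. A minimal sketch of initializing it after pulling this branch (the clone URL is a placeholder):

# initialize the submodule added above (SSH remote requires access)
git submodule update --init third_party/fmwork
# or clone with submodules from the start (<repo-url> is a placeholder)
git clone --recurse-submodules <repo-url>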
76 changes: 68 additions & 8 deletions Dockerfile
@@ -3,6 +3,7 @@ ARG BASE_UBI_IMAGE_TAG=9.4
ARG PYTHON_VERSION=3.12
ARG MAX_JOBS=64
ARG PIP_VLLM_VERSION=0.8.1
# TODO add ARG CUDA_VERSION=12-8

ARG VLLM_SOURCE=pip
# or VLLM_SOURCE=custom
@@ -122,6 +123,31 @@ ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=/workspace/

# ## flashinfer Builder #################################################################
# FROM vllm-builder_custom AS flashinfer-builder
# ARG MAX_JOBS
#
# # # build deps?
# # RUN --mount=type=cache,target=/root/.cache/pip \
# # --mount=type=cache,target=/root/.cache/uv \
# # uv pip install ninja cmake wheel pybind11 setuptools
#
# WORKDIR /workspace/flashinfer
# RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git
#
# ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
# ENV FLASHINFER_ENABLE_SM90=1
# RUN --mount=type=cache,target=/root/.cache/pip \
# cd flashinfer \
# && export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} export FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} \
# && python -m flashinfer.aot \
# && python -m build --no-isolation --wheel
#
# # uv pip install \
# # --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
#
# RUN ls -al /workspace/flashinfer/flashinfer/dist

## Runtime #################################################################
FROM base AS runtime

@@ -227,20 +253,54 @@ RUN --mount=type=cache,target=/root/.cache/pip \
uv pip install pytest llnl-hatchet debugpy

# Install FlashInfer
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
# echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# RUN --mount=type=cache,target=/root/.cache/pip \
# . /etc/environment && \
# python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN --mount=type=cache,target=/root/.cache/pip \
# . /etc/environment && \
# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
# RUN --mount=type=cache,target=/root/.cache/pip \
# uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/ --no-deps
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.5/flashinfer_python-0.2.5+cu124torch2.6-cp38-abi3-linux_x86_64.whl#sha256=43d767b912c0c43a04be99595e0123eab9385fc72530a2874b5fb08e3145c0be
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install torch==2.7.0
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
# RUN mkdir /workspace/flashinfer_dist && ls -al /workspace/flashinfer_dist
# COPY --from=flashinfer-builder /workspace/*.whl /workspace/flashinfer_dist
# RUN --mount=type=cache,target=/root/.cache/pip \
# --mount=type=cache,target=/root/.cache/uv \
# uv pip install /workspace/flashinfer_dist/*.whl
# TODO: we need nvcc for flashinfer installation...custom build fails, see above
RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
RUN microdnf install -y \
cuda-nvcc-12-8 cuda-nvtx-12-8 cuda-libraries-devel-12-8 && \
microdnf clean all
ENV CUDA_HOME="/usr/local/cuda" \
PATH="${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
ENV FLASHINFER_ENABLE_SM90=1
RUN TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} uv pip install \
--no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"


RUN ln -s ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so.12 ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness && cd lm-evaluation-harness && uv pip install .

RUN git clone --depth 1 https://github.com/IBM/fmwork.git
# RUN git clone --depth 1 https://github.com/IBM/fmwork.git
# RUN git clone --depth 1 https://github.com/IBM/fmwork.git
COPY third_party/fmwork fmwork

ENV STORE_TEST_RESULT_PATH=/results

@@ -250,7 +310,7 @@ COPY vllm/tests tests
COPY ShareGPT_V3_unfiltered_cleaned_split.json ShareGPT_V3_unfiltered_cleaned_split.json

# Copy third-party kernels and insert into path
COPY third_party third_party
COPY third_party/kernels third_party
ENV PYTHONPATH /workspace

# see https://github.com/IBM/triton-dejavu?tab=readme-ov-file#environment-variables
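
As a quick sanity check that the nvcc packages and the FlashInfer source build above are usable in the runtime image, something like the following can be run inside the container (a sketch, not part of the diff; flashinfer.__version__ is assumed to be exposed, a plain import suffices otherwise):

# confirm the CUDA 12.8 toolchain from the cuda-rhel9 repo is on PATH
nvcc --version
# confirm the FlashInfer build imports against the installed torch
python3 -c "import flashinfer; print(flashinfer.__version__)"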
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)",
"total_bench_time_s": 4.903317928314209,
"evaluated_configs": 9,
"keys": [
"chunk_size",
"K",
"IS_CAUSAL"
],
"cache": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [
0.007391999941319227
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
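
For readers unfamiliar with triton-dejavu: each of the cache files added here maps a tuple of key values (the "keys" fields above, e.g. chunk_size, K, IS_CAUSAL, plus argument dtypes) to the best kernel config found during tuning. A sketch for inspecting one of these files (the file path is illustrative):

# print each cached key tuple and its tuned kernel config (path is illustrative)
jq -r '.cache | to_entries[] | "\(.key) => \(.value)"' dejavu_cache/cache.json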
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)",
"total_bench_time_s": 10756.567904472351,
"evaluated_configs": 2625,
"keys": [
"chunk_size",
"K",
"IS_CAUSAL"
],
"cache": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [
0.002230335958302021
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,25 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)",
"total_bench_time_s": 7.295067548751831,
"evaluated_configs": 7,
"keys": [
"chunk_size",
"nheads"
],
"cache": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [
0.007071999832987785
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,25 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)",
"total_bench_time_s": 7.361271619796753,
"evaluated_configs": 7,
"keys": [
"chunk_size",
"nheads"
],
"cache": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [
0.002133406000211835
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,31 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)",
"total_bench_time_s": 22.759257316589355,
"evaluated_configs": 11,
"keys": [
"chunk_size",
"hdim",
"dstate",
"IS_CAUSAL"
],
"cache": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 128, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None",
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [
0.014240000396966934
],
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": [
0.8048959970474243
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,27 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)",
"total_bench_time_s": 15278.822125434875,
"evaluated_configs": 2625,
"keys": [
"chunk_size",
"hdim",
"dstate",
"IS_CAUSAL"
],
"cache": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [
0.014237518422305584
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)",
"total_bench_time_s": 5.0212812423706055,
"evaluated_configs": 9,
"keys": [
"hdim",
"dstate",
"chunk_size"
],
"cache": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [
0.009247999638319016
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}
@@ -0,0 +1,26 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)",
"total_bench_time_s": 9348.028031349182,
"evaluated_configs": 2625,
"keys": [
"hdim",
"dstate",
"chunk_size"
],
"cache": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [
0.003924777265638113
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": true
}
}
@@ -0,0 +1,8 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)",
"total_bench_time_s": 0.0,
"evaluated_configs": 0,
"keys": null,
"cache": {},
"timings": {}
}
@@ -0,0 +1,30 @@
{
"signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)",
"total_bench_time_s": 17.040932178497314,
"evaluated_configs": 9,
"keys": [
"hdim",
"dstate",
"chunk_size"
],
"cache": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None",
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None"
},
"timings": {
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": [
0.009184000082314014
],
"('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": [
0.009184000082314014
]
},
"timings_data": {
"labels": [
"ms"
],
"rep_t_ms": 100,
"warmup_t_ms": 25,
"cuda_graphs": false
}
}