
[MoE] Cleanup MoE examples #1576

Open
wants to merge 28 commits into main
Changes from 10 commits

Commits (28)
b30eade
deepseekv3
kylesayrs Jun 19, 2025
a957f2f
remove dreg
kylesayrs Jun 19, 2025
2fd2a25
reformat example
kylesayrs Jun 19, 2025
b8b217c
wip: clean up moe examples
kylesayrs Jun 19, 2025
43bc91d
remove deepseek2.5 for now
kylesayrs Jun 19, 2025
7d8ed36
update readme
kylesayrs Jun 19, 2025
b7273a9
infer model device with optional override
kylesayrs Jun 19, 2025
afebe2e
handle nullable dataset_args
kylesayrs Jun 20, 2025
ab3aa3e
update docstrings, comments
kylesayrs Jun 20, 2025
e9e30c3
rename files, update examples tests
kylesayrs Jun 20, 2025
6bf5acb
rebase on main
kylesayrs Jun 20, 2025
e77a31b
clean examples
kylesayrs Jun 20, 2025
366ac25
revert examples changes
kylesayrs Jun 20, 2025
c44da34
revert extra examples
kylesayrs Jun 20, 2025
2db2789
revert examples changes
kylesayrs Jun 20, 2025
0dc2381
remove extra examples
kylesayrs Jun 20, 2025
b70aba7
revert examples tests changes
kylesayrs Jun 20, 2025
5e5657b
Revert "revert extra examples"
kylesayrs Jun 20, 2025
735c317
Merge branch 'kylesayrs/deepseek-v3' into kylesayrs/cleanup-moe-examples
kylesayrs Jun 20, 2025
4812350
clean up examples
kylesayrs Jun 20, 2025
626000d
merge with main src
kylesayrs Jun 26, 2025
45f6391
Merge remote-tracking branch 'origin' into kylesayrs/cleanup-moe-exam…
kylesayrs Jun 26, 2025
863377e
remove extra file
kylesayrs Jun 26, 2025
2f5de10
convert to fp8 examples
kylesayrs Jun 26, 2025
11d23fa
Merge remote-tracking branch 'origin' into kylesayrs/cleanup-moe-exam…
kylesayrs Jul 29, 2025
93f69f0
remove 25 3 deepseek examples
kylesayrs Jul 29, 2025
da3680f
add r1 test, which is skipped
kylesayrs Jul 29, 2025
de58207
fix readme
kylesayrs Jul 29, 2025
125 changes: 0 additions & 125 deletions examples/quantizing_moe/deepseek_moe_w4a16.py

This file was deleted.

8 changes: 0 additions & 8 deletions examples/quantizing_moe/deepseek_recipe_w4a16.yaml

This file was deleted.

@@ -12,18 +12,17 @@
# previous version or upgrading to a version where this bug is fixed

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
MODEL_ID = "deepseek-ai/DeepSeek-V2.5"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it's recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


@@ -57,16 +56,12 @@ def tokenize(sample):

ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for INT8 W8A8 quantization
# Configure the quantization algorithm to run.
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W8A8",
        ignore=["lm_head", "re:.*mlp.gate$"],
    ),
]
recipe = GPTQModifier(
    targets="Linear", scheme="W4A16", ignore=["lm_head", "re:.*mlp.gate$"]
)

oneshot(
    model=model,
@@ -82,12 +77,10 @@ def tokenize(sample):
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")
else:
    print(
@@ -96,6 +89,6 @@ def tokenize(sample):
    )

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
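
For a quick smoke test of the saved checkpoint, a minimal sketch like the following could serve it with vLLM (not part of this diff; it assumes a vLLM build that supports compressed-tensors W4A16 checkpoints):

# Sketch: reload the compressed-tensors checkpoint written above and run one prompt.
# SAVE_DIR refers to the directory produced by model.save_pretrained(...).
from vllm import LLM, SamplingParams

llm = LLM(model=SAVE_DIR, trust_remote_code=True)
outputs = llm.generate(["I love quantization because"], SamplingParams(max_tokens=50))
print(outputs[0].outputs[0].text)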
@@ -1,28 +1,23 @@
import torch
from datasets import load_dataset
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
# Please consider either downgrading your transformers version to a
# previous version or upgrading to a version where this bug is fixed

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, torch_dtype="auto", trust_remote_code=True
MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it's recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


@@ -56,16 +51,17 @@ def tokenize(sample):

ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for FP8 W8A8 quantization
# Configure the quantization algorithm to run.
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = [
    QuantizationModifier(
        targets="Linear",
        scheme="FP8",
        ignore=["lm_head", "re:.*mlp.gate$"],
    ),
]
recipe = QuantizationModifier(
    scheme="W4A16",
    targets="Linear",
    ignore=[
        "lm_head",
        "re:.*block_sparse_moe.gate",  # does not quantize well
    ],
)

oneshot(
    model=model,
@@ -76,24 +72,15 @@ def tokenize(sample):
    trust_remote_code_model=True,
)

# Confirm generations of the quantized model look sane.
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
else:
print(
"WARNING: cannot perform sample generation of "
"deepseek models with transformers >= 4.48"
)
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
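
The new ignore pattern targets Mixtral's router gates rather than the DeepSeek-style mlp.gate modules. A small, hypothetical helper like this (not part of the diff) can be run before oneshot to confirm which modules the pattern keeps at full precision; module names assume the stock Mixtral architecture:

# Sketch: list the router gate modules that the ignore pattern is meant to match.
import re

gate_pattern = re.compile(r".*block_sparse_moe\.gate")
for name, _ in model.named_modules():
    if gate_pattern.fullmatch(name):
        print(name)  # e.g. "model.layers.0.block_sparse_moe.gate"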
58 changes: 0 additions & 58 deletions examples/quantizing_moe/mixtral_moe_w8a8_fp8.py

This file was deleted.

@@ -73,12 +73,13 @@ def tokenize(sample):
# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-quantized.w4a16"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
8 changes: 8 additions & 0 deletions src/llmcompressor/args/dataset_arguments.py
@@ -171,6 +171,7 @@ class DatasetArguments(CustomDatasetArguments):
"will execute code present on the Hub on your local machine."
},
)
# --- pipeline arguments --- #
pipeline: Optional[str] = field(
default="independent",
metadata={
@@ -196,3 +197,10 @@ class DatasetArguments(CustomDatasetArguments):
"definition"
},
)
model_input_device: Optional[str] = field(
default=None,
metadata={
"help": "Device to put model inputs on for calibration. "
"If none is specified, the model input device is inferred from the model"
},
)
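
The new model_input_device field gives an explicit override for where calibration inputs are placed; when left as None the device is inferred from the model. A hedged usage sketch, assuming dataset arguments can be passed straight through oneshot as keyword arguments like in the examples above:

# Sketch: place calibration inputs on an explicit device instead of inferring it
# from the model. model_input_device is the DatasetArguments field added here.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    model_input_device="cuda:0",  # default None -> infer from the model
)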