Gemlite generate.py fix (#2372)

mobicham · web-flow · commit 7a846d5578a0 · 2025-06-24T12:29:21.000-07:00
* fix get_plain() with FMA mode

* update

* fix in_features/out_features meta-data mismatch

* update gemlite slice test

* add packing_bitwidth support

* add packing_bitwidth support and cleanup

* update default gemlite layout

* cleanup

* fix symmetric use-case and relax _same_meta_data

* _copy() meta data

* fix (4,) in autoquant

* Add dynamic mode in gemlite layout

* mode explanation

Signed-off-by: mobicham <hicham@mobiuslabs.com>

* use weights_only instead of static

* generate fix

Signed-off-by: mobicham <hicham@mobiuslabs.com>

* remove set_packing_bitwidth

---------

Signed-off-by: mobicham <hicham@mobiuslabs.com>
diff --git a/torchao/_models/llama/benchmarks.sh b/torchao/_models/llama/benchmarks.sh
@@ -97,19 +97,13 @@ python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --co
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization sparse-marlin --write_result benchmark_results.txt --prefill_size 8000 --precision float16 --sparsity semi-structured
 
 # gemlite benchmarks
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-64  --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-64  --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-None  --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-None  --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-8-None  --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-8-None  --write_result benchmark_results.txt
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-4-64-wo  --write_result benchmark_results.txt
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-4-128-wo  --write_result benchmark_results.txt
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-None-dq  --write_result benchmark_results.txt
 
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-64  --write_result benchmark_results.txt  --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-64  --write_result benchmark_results.txt  --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-4-None  --write_result benchmark_results.txt  --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-4-None  --write_result benchmark_results.txt --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-8-None  --write_result benchmark_results.txt --batch_size 32
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-32-8-None  --write_result benchmark_results.txt --batch_size 32
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-4-64-wo  --write_result benchmark_results.txt --batch_size 32
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-4-128-wo  --write_result benchmark_results.txt --batch_size 32
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --precision float16 --quantization gemlite-8-None-dq  --write_result benchmark_results.txt --batch_size 32
 
 # 2:4 sparse model
 export MODEL_REPO=nm-testing/SparseLlama-3-8B-pruned_50.2of4
diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py
@@ -244,7 +244,7 @@ def encode_tokens(tokenizer, string, bos=True, device=default_device):
 
 def _load_model(checkpoint_path, device, precision):
     checkpoint = torch.load(
-        str(checkpoint_path), mmap=True, weights_only=True, map_location=device
+        str(checkpoint_path), mmap=True, weights_only=True, map_location="cpu"
     )
     if "model" in checkpoint and "stories" in str(checkpoint_path):
         checkpoint = checkpoint["model"]
@@ -366,34 +366,24 @@ def ffn_or_attn_only(mod, fqn):
             import os
             import pwd
 
-            from gemlite.core import GemLiteLinearTriton
+            import gemlite
+
+            gemlite.set_autotune("max")
+            config_file = f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
 
             _quant_args = quantization.split("-")
-            bit_width = int(_quant_args[-2])
-            group_size = None if _quant_args[-1] == "None" else int(_quant_args[-1])
-            try:
-                packing_bitwidth = int(_quant_args[-3])
-            except:
-                # if only 2 inputs found, use default value
-                packing_bitwidth = 32
+            bit_width = int(_quant_args[1])
+            group_size = None if _quant_args[2] == "None" else int(_quant_args[2])
+            mode = "dynamic" if _quant_args[3] == "dq" else "weight_only"
 
             quantize_(
                 model,
-                gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth),
+                gemlite_uintx_weight_only(
+                    bit_width=bit_width, group_size=group_size, mode=mode
+                ),
             )
 
-            # try to load gemlite kernel config
-            try:
-                GemLiteLinearTriton.load_config(
-                    f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-                )
-                print(
-                    f"loaded gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-                )
-            except:
-                print(
-                    f"unable to load gemlite kernel cache /tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-                )
+            gemlite.load_config(config_file)
 
             print("running gemlite warmup")
             generate(
@@ -405,9 +395,8 @@ def ffn_or_attn_only(mod, fqn):
                 temperature=temperature,
                 top_k=top_k,
             )
-            GemLiteLinearTriton.cache_config(
-                f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-            )
+            gemlite.cache_config(config_file)
+
         if "int8wo" in quantization:
             quantize_(model, int8_weight_only())
         if "int8dq" in quantization: