from sparseml.transformers import SparseAutoModelForCausalLM, SparseAutoTokenizer, oneshot
from copy import deepcopy
import torch

model_name = "Qwen/Qwen1.5-MoE-A2.7B"

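# Load the stock MoE model in float16 on the first GPU.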
model = SparseAutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda:0",
    torch_dtype=torch.float16,
)
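# Keep an unmodified copy so we can compare generations before and after quantization.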
og_model = deepcopy(model)
tokenizer = SparseAutoTokenizer.from_pretrained(model_name)

dataset = "open-platypus"
recipe = "tests/sparseml/transformers/compression/recipes/new_quant_full.yaml"

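# Apply the quantization recipe in one shot, calibrating on 8 samples from the dataset.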
oneshot(
    model=model,
    dataset=dataset,
    overwrite_output_dir=True,
    output_dir="./output_one_shot",
    recipe=recipe,
    num_calibration_samples=8,
)

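# Generate from the original (unquantized) copy for reference.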
prompt = "Why did the transformer cross the road?"
prompt_tokenized = tokenizer(prompt, return_tensors="pt").to(model.device)
print('----')
print(f"Output from the original model: {tokenizer.decode(og_model.generate(**prompt_tokenized, max_length=50)[0])}")
print('----')
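# Generate from the in-memory quantized model, using the tokenizer saved alongside it.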
tokenizer = SparseAutoTokenizer.from_pretrained("./output_one_shot")
prompt_tokenized = tokenizer(prompt, return_tensors="pt").to(model.device)
print(f"Output from the quantized model: {tokenizer.decode(model.generate(**prompt_tokenized, max_length=50)[0])}")
print('----')
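# Reload the quantized checkpoint from disk onto a second GPU to check that it loads and generates.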
model = SparseAutoModelForCausalLM.from_pretrained("./output_one_shot", device_map="cuda:1", torch_dtype=torch.float16)
print(f"Output from the quantized model (reloaded): {tokenizer.decode(model.generate(**prompt_tokenized.to(model.device), max_length=50)[0])}")
print('----')