6 | 6 | from transformers import AutoConfig, AutoTokenizer
7 | 7 | from transformers.generation import GenerationConfig
8 | 8 | import intel_extension_for_pytorch as ipex
| 9 | +from intel_extension_for_transformers.llm.utils.generation import _beam_search, _greedy_search
9 | 10 | from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
10 | 11 | from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_str2torch
11 | 12 | from transformers.utils import check_min_version

36 | 37 | # ============Benchmark configs==============
37 | 38 | parser.add_argument("--benchmark", action="store_true")
38 | 39 | parser.add_argument("--do_profiling", action="store_true")
| 40 | +parser.add_argument("--disable_optimize_transformers", action="store_true")
39 | 41 | parser.add_argument("--profile_token_latency", action="store_true")
40 | 42 | parser.add_argument("--iters", default=10, type=int, help="num iter")
41 | 43 | parser.add_argument("--num_warmup", default=3, type=int, help="num warmup")

49 | 51 |                     help="tasks list for accuracy validation")
50 | 52 | # ============WeightOnlyQuant configs===============
51 | 53 | parser.add_argument("--woq", action="store_true")
52 | | -parser.add_argument("--woq_algo", default="RTN", choices=['RTN'],
| 54 | +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'GPTQ'],
53 | 55 |                     help="Weight-only parameter.")
54 | 56 | parser.add_argument("--woq_dtype", type=str, default="int4_fullrange",
55 | 57 |                     choices=["int4_fullrange"])

58 | 60 | parser.add_argument("--woq_enable_mse_search", action="store_true")
59 | 61 | parser.add_argument("--device", default="xpu")
60 | 62 | parser.add_argument("--compute_dtype", default="fp16")
| 63 | +parser.add_argument(
| 64 | +    "--gptq_percdamp",
| 65 | +    type=float,
| 66 | +    default=0.01,
| 67 | +    help="Percent of the average Hessian diagonal to use for dampening.",
| 68 | +)
| 69 | +parser.add_argument(
| 70 | +    "--gptq_block_size",
| 71 | +    type=int,
| 72 | +    default=128,
| 73 | +    help="Block size. sub weight matrix size to run GPTQ.",
| 74 | +)
| 75 | +parser.add_argument(
| 76 | +    "--gptq_nsamples", type=int, default=128, help="Number of calibration data samples."
| 77 | +)
| 78 | +parser.add_argument(
| 79 | +    "--gptq_use_max_length",
| 80 | +    action="store_true",
| 81 | +    help="Set all sequence length to be same length of args.gptq_pad_max_length",
| 82 | +)
| 83 | +parser.add_argument(
| 84 | +    "--gptq_pad_max_length",
| 85 | +    type=int,
| 86 | +    default=2048,
| 87 | +    help="Calibration dataset sequence max length, this should align with your model config",
| 88 | +)
61 | 89 | # ============BitsAndBytes configs==============
62 | 90 | parser.add_argument("--bitsandbytes", action="store_true")
63 | 91 | parser.add_argument("--load_in_4bit", type=bool, default=False)

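The flags above only declare the GPTQ knobs; they take effect together with the existing --woq switch. A sketch of one plausible end-to-end invocation follows. The script name (run_generation_gpu_woq.py), the model id, and the flag values are illustrative assumptions, not something this diff specifies.

import shlex
import subprocess

# Hypothetical command line exercising the new flags; requires an ITREX XPU environment.
cmd = (
    "python run_generation_gpu_woq.py "   # assumed script name
    "--model Qwen/Qwen-7B-Chat "          # placeholder model id
    "--woq --woq_algo GPTQ "
    "--gptq_percdamp 0.01 --gptq_block_size 128 --gptq_nsamples 128 --gptq_pad_max_length 2048 "
    "--benchmark --profile_token_latency"
)
subprocess.run(shlex.split(cmd), check=True)
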
77 | 105 |     trust_remote_code=args.trust_remote_code,
78 | 106 |     revision=args.revision,
79 | 107 | )
80 | | -generation_config = GenerationConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
81 | | -generation_config.do_sample = False
| 108 | +
82 | 109 | user_model = None
83 | 110 |
84 | 111 | # tokenizer

90 | 117 |
91 | 118 | quantization_config = None
92 | 119 | if args.woq:
93 | | -    quantization_config = WeightOnlyQuantConfig(
94 | | -        compute_dtype=args.compute_dtype, weight_dtype=args.woq_dtype,
95 | | -        group_size=args.woq_group_size, scale_dtype=args.compute_dtype
96 | | -    ) #default is A16W4G16
| 120 | +    if args.woq_algo == "GPTQ":
| 121 | +        algorithm_args = {
| 122 | +            "act_order": False,
| 123 | +            "percdamp": args.gptq_percdamp,
| 124 | +            "block_size": args.gptq_block_size,
| 125 | +            "nsamples": args.gptq_nsamples,
| 126 | +            "use_max_length": args.gptq_use_max_length,
| 127 | +            "pad_max_length": args.gptq_pad_max_length,
| 128 | +        }
| 129 | +        quantization_config = WeightOnlyQuantConfig(
| 130 | +            compute_dtype=args.compute_dtype,
| 131 | +            scale_dtype=args.compute_dtype,
| 132 | +            weight_dtype=args.woq_dtype,
| 133 | +            scheme=args.woq_scheme,
| 134 | +            group_size=args.woq_group_size,
| 135 | +            algorithm=args.woq_algo,
| 136 | +            tokenizer=tokenizer,
| 137 | +            algorithm_args=algorithm_args,
| 138 | +        )
| 139 | +    else:
| 140 | +        quantization_config = WeightOnlyQuantConfig(
| 141 | +            compute_dtype=args.compute_dtype, weight_dtype=args.woq_dtype,
| 142 | +            group_size=args.woq_group_size, scale_dtype=args.compute_dtype
| 143 | +        ) #default is A16W4G16
97 | 144 |
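Condensed, the new branch wires the --gptq_* flags into WeightOnlyQuantConfig and hands the tokenizer over for GPTQ calibration. A minimal standalone sketch of that path, with literal values standing in for the parsed args; the model id, scheme, and group size are illustrative assumptions, while the keyword arguments mirror the diff.

import torch
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

model_name = "Qwen/Qwen-7B-Chat"  # placeholder model id
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# GPTQ-specific knobs, normally filled from the --gptq_* flags.
algorithm_args = {
    "act_order": False,
    "percdamp": 0.01,
    "block_size": 128,
    "nsamples": 128,
    "use_max_length": False,
    "pad_max_length": 2048,
}
quantization_config = WeightOnlyQuantConfig(
    compute_dtype="fp16",
    scale_dtype="fp16",
    weight_dtype="int4_fullrange",
    scheme="sym",          # assumed --woq_scheme value
    group_size=32,         # assumed --woq_group_size value
    algorithm="GPTQ",
    tokenizer=tokenizer,   # used to tokenize the GPTQ calibration samples
    algorithm_args=algorithm_args,
)
user_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="xpu",
    quantization_config=quantization_config,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    use_neural_speed=False,
)
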
98 | 145 | # get model
99 | 146 | if quantization_config is not None:
100 | 147 |     user_model = AutoModelForCausalLM.from_pretrained(args.model,
101 | 148 |                                                       device_map=args.device,
102 | 149 |                                                       quantization_config=quantization_config,
103 | 150 |                                                       trust_remote_code=args.trust_remote_code,
104 | | -                                                  fp16=True,
| 151 | +                                                  torch_dtype=torch.float16,
105 | 152 |                                                       use_neural_speed=False
106 | 153 |                                                       )
107 | 154 | elif args.load_in_4bit or args.load_in_8bit:

117 | 164 | tokenizer.save_pretrained(args.output_dir)
118 | 165 |
119 | 166 | if args.benchmark:
120 | | -    prompt = "它完成了,并提交了。你可以在Android和网络上玩美味生存。在网络上玩是有效的,但你必须模拟多次触摸才能移动桌子."
| 167 | +    if config.model_type == "qwen":
| 168 | +        prompt = "它完成了,并提交了。你可以在Android和网络上玩美味生存。在网络上玩是有效的,但你必须模拟多次触摸才能移动桌子."
| 169 | +    else:
| 170 | +        prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."
121 | 171 |
122 | 172 |     input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
123 | 173 |     print("---- Prompt size:", input_size)
124 | 174 |
125 | 175 |     user_model = AutoModelForCausalLM.from_pretrained(
126 | 176 |         args.model, trust_remote_code=args.trust_remote_code, device_map=args.device, torch_dtype=torch_dtype) \
127 | 177 |         if user_model is None else user_model
128 | | -    user_model = ipex.optimize_transformers(
129 | | -        user_model.eval(), device=args.device, inplace=True, woq=True, dtype=torch_dtype)
| 178 | +    user_model = user_model.to(memory_format=torch.channels_last)
| 179 | +    if not args.disable_optimize_transformers:
| 180 | +        print("Optimize with IPEX...")
| 181 | +        user_model = ipex.optimize_transformers(
| 182 | +            user_model.eval(), device=args.device, inplace=True, woq=(hasattr(user_model, "quantization_config")), dtype=torch_dtype)
| 183 | +    else:
| 184 | +        print("Disabled optimization with IPEX...")
130 | 185 |     # start
131 | 186 |     num_iter = args.iters
132 | 187 |     num_warmup = args.num_warmup

136 | 191 |
137 | 192 |     generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=args.num_beams)
138 | 193 |     if args.profile_token_latency:
139 | | -        generate_kwargs["token_latency"] = True
| 194 | +        ipex.transformers.optimize.convert_function(user_model, "greedy_search", _greedy_search)
| 195 | +        if args.disable_optimize_transformers:
| 196 | +            ipex.transformers.optimize.convert_function(user_model, "beam_search", _beam_search)
| 197 | +        user_model.config.token_latency = True
140 | 198 |
141 | 199 |     total_time = 0.0
142 | 200 |     total_list = []

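The timing loop that fills total_time and total_list sits in the lines elided between these two hunks. A hedged sketch of how such a loop typically consumes the new token-latency hook: with user_model.config.token_latency set and the patched _greedy_search installed, generate() is expected to return the token ids together with a per-token latency list. The loop body and variable names below are assumptions for illustration and reuse names (tokenizer, args, generate_kwargs, num_iter, num_warmup) from the surrounding script; none of this is part of the diff.

import time
from itertools import chain

import numpy as np

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(args.device)
for i in range(num_iter):
    tic = time.time()
    output = user_model.generate(input_ids, max_new_tokens=args.max_new_tokens, **generate_kwargs)
    toc = time.time()
    gen_ids = output[0] if args.profile_token_latency else output  # latencies ride along in output[1]
    if i >= num_warmup:
        total_time += toc - tic
        if args.profile_token_latency:
            total_list.append(output[1])

if args.profile_token_latency:
    first_token_latency = np.mean([t[0] for t in total_list])
    next_token_latency = np.mean(list(chain(*[t[1:] for t in total_list])))
    print("First token latency: %.4f s" % first_token_latency)
    print("Average next-token latency: %.4f s" % next_token_latency)
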
205 | 263 |     user_model = AutoModelForCausalLM.from_pretrained(
206 | 264 |         args.model, trust_remote_code=args.trust_remote_code, device_map=args.device, torch_dtype=torch_dtype) \
207 | 265 |         if user_model is None else user_model
208 | | -    user_model = ipex.optimize_transformers(
209 | | -        user_model.eval(), device=args.device, inplace=True, woq=True, dtype=torch_dtype)
| 266 | +    if not args.disable_optimize_transformers:
| 267 | +        print("Optimize with IPEX...")
| 268 | +        user_model = ipex.optimize_transformers(
| 269 | +            user_model.eval(), device=args.device, inplace=True, woq=(hasattr(user_model, "quantization_config")), dtype=torch_dtype)
| 270 | +    else:
| 271 | +        print("Disabled optimization with IPEX...")
210 | 272 |     results = evaluate(
211 | 273 |         model="hf-causal",
212 | 274 |         model_args='pretrained='+args.model+',tokenizer=' + args.model + \
213 | | -            ',dtype=float32, trust_remote_code=' + str(args.trust_remote_code),
| 275 | +            ',dtype=float32,trust_remote_code=' + str(args.trust_remote_code),
214 | 276 |         user_model=user_model,
215 | 277 |         batch_size=args.batch_size,
216 | 278 |         tasks=args.tasks,
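
The one-character change on line 275 (dropping the space before trust_remote_code) matters, presumably because the evaluation harness splits model_args on commas without stripping whitespace, so the stray space would end up inside the option key. A simplified stand-in for that parsing, illustrating the failure mode; this is not the harness's actual code.

def parse_model_args(s):
    # simplified "key=value,key=value" parsing, as such harnesses commonly do it
    return dict(item.split("=", 1) for item in s.split(","))

print(parse_model_args("pretrained=m,dtype=float32, trust_remote_code=True"))
# {'pretrained': 'm', 'dtype': 'float32', ' trust_remote_code': 'True'}  <- key keeps a leading space
print(parse_model_args("pretrained=m,dtype=float32,trust_remote_code=True"))
# {'pretrained': 'm', 'dtype': 'float32', 'trust_remote_code': 'True'}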