This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 8569d7b

Dev/ipex woq (#1225)
Co-authored-by: VincyZhang <wenxin.zhang@intel.com>
1 parent e879faa commit 8569d7b

13 files changed, +1181 −72 lines changed

.github/workflows/script/formatScan/pylint.sh

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ else
 fi
 # install packages
 pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
-pip install accelerate nlpaug nltk schema optimum-intel==1.11.0 optimum==1.13.3 peft==0.6.2
+pip install accelerate nlpaug nltk schema optimum-intel optimum peft
+pip install --upgrade --force-reinstall transformers
 
 echo "[DEBUG] list pipdeptree..."
 pip install pipdeptree

.github/workflows/script/unitTest/coverage/.neural-chat-coveragerc

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ omit =
     */intel_extension_for_transformers/llm/amp/**
     */intel_extension_for_transformers/llm/evaluation/**
     */intel_extension_for_transformers/llm/quantization/**
+    */intel_extension_for_transformers/llm/utils/generation/**
     */intel_extension_for_transformers/llm/library/**
     */intel_extension_for_transformers/llm/operator/**
     */intel_extension_for_transformers/llm/runtime/**

examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py

Lines changed: 77 additions & 15 deletions
@@ -6,6 +6,7 @@
 from transformers import AutoConfig, AutoTokenizer
 from transformers.generation import GenerationConfig
 import intel_extension_for_pytorch as ipex
+from intel_extension_for_transformers.llm.utils.generation import _beam_search, _greedy_search
 from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
 from intel_extension_for_transformers.llm.quantization.utils import convert_dtype_str2torch
 from transformers.utils import check_min_version
@@ -36,6 +37,7 @@
 # ============Benchmark configs==============
 parser.add_argument("--benchmark", action="store_true")
 parser.add_argument("--do_profiling", action="store_true")
+parser.add_argument("--disable_optimize_transformers", action="store_true")
 parser.add_argument("--profile_token_latency", action="store_true")
 parser.add_argument("--iters", default=10, type=int, help="num iter")
 parser.add_argument("--num_warmup", default=3, type=int, help="num warmup")
@@ -49,7 +51,7 @@
                     help="tasks list for accuracy validation")
 # ============WeightOnlyQuant configs===============
 parser.add_argument("--woq", action="store_true")
-parser.add_argument("--woq_algo", default="RTN", choices=['RTN'],
+parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'GPTQ'],
                     help="Weight-only parameter.")
 parser.add_argument("--woq_dtype", type=str, default="int4_fullrange",
                     choices=["int4_fullrange"])
@@ -58,6 +60,32 @@
 parser.add_argument("--woq_enable_mse_search", action="store_true")
 parser.add_argument("--device", default="xpu")
 parser.add_argument("--compute_dtype", default="fp16")
+parser.add_argument(
+    "--gptq_percdamp",
+    type=float,
+    default=0.01,
+    help="Percent of the average Hessian diagonal to use for dampening.",
+)
+parser.add_argument(
+    "--gptq_block_size",
+    type=int,
+    default=128,
+    help="Block size. sub weight matrix size to run GPTQ.",
+)
+parser.add_argument(
+    "--gptq_nsamples", type=int, default=128, help="Number of calibration data samples."
+)
+parser.add_argument(
+    "--gptq_use_max_length",
+    action="store_true",
+    help="Set all sequence length to be same length of args.gptq_pad_max_length",
+)
+parser.add_argument(
+    "--gptq_pad_max_length",
+    type=int,
+    default=2048,
+    help="Calibration dataset sequence max length, this should align with your model config",
+)
 # ============BitsAndBytes configs==============
 parser.add_argument("--bitsandbytes", action="store_true")
 parser.add_argument("--load_in_4bit", type=bool, default=False)
@@ -77,8 +105,7 @@
     trust_remote_code=args.trust_remote_code,
     revision=args.revision,
 )
-generation_config = GenerationConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
-generation_config.do_sample = False
+
 user_model = None
 
 # tokenizer
@@ -90,18 +117,38 @@
 
 quantization_config = None
 if args.woq:
-    quantization_config = WeightOnlyQuantConfig(
-        compute_dtype=args.compute_dtype, weight_dtype=args.woq_dtype,
-        group_size=args.woq_group_size, scale_dtype=args.compute_dtype
-    ) #default is A16W4G16
+    if args.woq_algo == "GPTQ":
+        algorithm_args = {
+            "act_order": False,
+            "percdamp": args.gptq_percdamp,
+            "block_size": args.gptq_block_size,
+            "nsamples": args.gptq_nsamples,
+            "use_max_length": args.gptq_use_max_length,
+            "pad_max_length": args.gptq_pad_max_length,
+        }
+        quantization_config = WeightOnlyQuantConfig(
+            compute_dtype=args.compute_dtype,
+            scale_dtype=args.compute_dtype,
+            weight_dtype=args.woq_dtype,
+            scheme=args.woq_scheme,
+            group_size=args.woq_group_size,
+            algorithm=args.woq_algo,
+            tokenizer=tokenizer,
+            algorithm_args=algorithm_args,
+        )
+    else:
+        quantization_config = WeightOnlyQuantConfig(
+            compute_dtype=args.compute_dtype, weight_dtype=args.woq_dtype,
+            group_size=args.woq_group_size, scale_dtype=args.compute_dtype
+        ) #default is A16W4G16
 
 # get model
 if quantization_config is not None:
     user_model = AutoModelForCausalLM.from_pretrained(args.model,
                                                       device_map=args.device,
                                                       quantization_config=quantization_config,
                                                       trust_remote_code=args.trust_remote_code,
-                                                      fp16=True,
+                                                      torch_dtype=torch.float16,
                                                       use_neural_speed=False
                                                       )
 elif args.load_in_4bit or args.load_in_8bit:
@@ -117,16 +164,24 @@
 tokenizer.save_pretrained(args.output_dir)
 
 if args.benchmark:
-    prompt = "它完成了,并提交了。你可以在Android和网络上玩美味生存。在网络上玩是有效的,但你必须模拟多次触摸才能移动桌子."
+    if config.model_type == "qwen":
+        prompt = "它完成了,并提交了。你可以在Android和网络上玩美味生存。在网络上玩是有效的,但你必须模拟多次触摸才能移动桌子."
+    else:
+        prompt = "Once upon a time, there existed a little girl, who liked to have adventures. She wanted to go to places and meet new people, and have fun."
 
     input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1)
     print("---- Prompt size:", input_size)
 
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model, trust_remote_code=args.trust_remote_code, device_map=args.device, torch_dtype=torch_dtype) \
         if user_model is None else user_model
-    user_model = ipex.optimize_transformers(
-        user_model.eval(), device=args.device, inplace=True, woq=True, dtype=torch_dtype)
+    user_model = user_model.to(memory_format=torch.channels_last)
+    if not args.disable_optimize_transformers:
+        print("Optimize with IPEX...")
+        user_model = ipex.optimize_transformers(
+            user_model.eval(), device=args.device, inplace=True, woq=(hasattr(user_model, "quantization_config")), dtype=torch_dtype)
+    else:
+        print("Disabled optimization with IPEX...")
     # start
     num_iter = args.iters
     num_warmup = args.num_warmup
@@ -136,7 +191,10 @@
 
     generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=args.num_beams)
     if args.profile_token_latency:
-        generate_kwargs["token_latency"] = True
+        ipex.transformers.optimize.convert_function(user_model, "greedy_search", _greedy_search)
+        if args.disable_optimize_transformers:
+            ipex.transformers.optimize.convert_function(user_model, "beam_search", _beam_search)
+        user_model.config.token_latency = True
 
     total_time = 0.0
     total_list = []
@@ -205,12 +263,16 @@
     user_model = AutoModelForCausalLM.from_pretrained(
         args.model, trust_remote_code=args.trust_remote_code, device_map=args.device, torch_dtype=torch_dtype) \
         if user_model is None else user_model
-    user_model = ipex.optimize_transformers(
-        user_model.eval(), device=args.device, inplace=True, woq=True, dtype=torch_dtype)
+    if not args.disable_optimize_transformers:
+        print("Optimize with IPEX...")
+        user_model = ipex.optimize_transformers(
+            user_model.eval(), device=args.device, inplace=True, woq=(hasattr(user_model, "quantization_config")), dtype=torch_dtype)
+    else:
+        print("Disabled optimization with IPEX...")
     results = evaluate(
         model="hf-causal",
         model_args='pretrained='+args.model+',tokenizer=' + args.model + \
-            ',dtype=float32, trust_remote_code=' + str(args.trust_remote_code),
+            ',dtype=float32,trust_remote_code=' + str(args.trust_remote_code),
         user_model=user_model,
         batch_size=args.batch_size,
         tasks=args.tasks,

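For reference, the GPTQ weight-only flow that this diff adds can be condensed into the following sketch. The algorithm_args values are the new flag defaults; the model id, scheme, and group size are placeholders, since the real script takes them from argparse.

    import torch
    from transformers import AutoTokenizer
    from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig

    model_id = "Qwen/Qwen-7B"  # placeholder; the script uses args.model
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # GPTQ needs calibration data, so the tokenizer is handed to the config.
    quantization_config = WeightOnlyQuantConfig(
        compute_dtype="fp16",
        scale_dtype="fp16",
        weight_dtype="int4_fullrange",
        scheme="sym",        # placeholder for args.woq_scheme
        group_size=32,       # placeholder for args.woq_group_size
        algorithm="GPTQ",
        tokenizer=tokenizer,
        algorithm_args={
            "act_order": False,
            "percdamp": 0.01,
            "block_size": 128,
            "nsamples": 128,
            "use_max_length": False,
            "pad_max_length": 2048,
        },
    )

    user_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="xpu",
        quantization_config=quantization_config,
        trust_remote_code=True,
        torch_dtype=torch.float16,  # replaces the previous fp16=True argument
        use_neural_speed=False,
    )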
intel_extension_for_transformers/llm/quantization/utils.py

Lines changed: 14 additions & 5 deletions
@@ -22,9 +22,9 @@
 import os
 from accelerate import init_empty_weights
 from datasets import load_dataset
-from intel_extension_for_transformers.transformers.utils.utility import LazyImport
 from neural_compressor import quantization
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.utils.utility import LazyImport
 from neural_compressor.config import PostTrainingQuantConfig
 from ...utils.utils import is_ipex_available
 from transformers import AutoTokenizer
@@ -349,6 +349,13 @@ def default_calib_func(model):
         if config.algorithm in ["TEQ", "RTN", "GPTQ"]:
             calib_func = None
 
+        orig_dtype = torch.float32
+        for param in model.parameters():
+            orig_dtype = param.dtype
+            if orig_dtype != torch.float32:
+                model.to(dtype=torch.float32)
+            break
+
         inc_model = quantization.fit(model,
                                      conf,
                                      calib_func=calib_func,
@@ -363,7 +370,6 @@ def default_calib_func(model):
                                   None,
                                   config,
                                   device=device)
-        return q_model.to("xpu")
     else:
         if config.algorithm == "GPTQ":
             inc_model = inc_model.export_compressed_model(use_optimum_format=True)
@@ -381,9 +387,12 @@ def default_calib_func(model):
             }
 
             setattr(config, "gptq_quantize_config", quantize_config)
-            return replace_linear(inc_model, None, None, config, device=device)
-
-        return replace_linear(inc_model.model, None, None, config, device=device)
+            q_model = replace_linear(inc_model, None, None, config, device=device)
+        else:
+            q_model = replace_linear(inc_model.model, None, None, config, device=device)
+        if orig_dtype != torch.float32:
+            q_model.to(dtype=orig_dtype)
+        return q_model.to(device)
 
 def convert_dtype_str2torch(str_dtype):
     if str_dtype == "int8":
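The dtype handling added above follows a save/upcast/restore pattern: remember the dtype of the model's first parameter, cast the model to float32 before quantization, and cast the quantized model back afterwards. A minimal, self-contained sketch of that pattern; quantize_fn is a hypothetical stand-in for the quantization.fit plus replace_linear pipeline.

    import torch
    from torch import nn

    def quantize_with_dtype_roundtrip(model: nn.Module, quantize_fn):
        orig_dtype = torch.float32
        for param in model.parameters():
            orig_dtype = param.dtype              # dtype of the first parameter
            if orig_dtype != torch.float32:
                model.to(dtype=torch.float32)     # quantizer expects fp32 weights
            break
        q_model = quantize_fn(model)
        if orig_dtype != torch.float32:
            q_model.to(dtype=orig_dtype)          # hand back the caller's dtype
        return q_model

    # toy usage: identity "quantization" on an fp16 linear layer
    toy = nn.Linear(4, 4).to(dtype=torch.float16)
    q = quantize_with_dtype_roundtrip(toy, lambda m: m)
    print(next(q.parameters()).dtype)  # torch.float16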
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .beam_search import _beam_search
+from .greedy_search import _greedy_search
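The two helpers exported by this new module are patched into the generation loop when --profile_token_latency is set, as in the run_generation_gpu_woq.py changes above. A small sketch of that wiring; the wrapper function name is hypothetical, since the script does this inline.

    import intel_extension_for_pytorch as ipex
    from intel_extension_for_transformers.llm.utils.generation import _beam_search, _greedy_search

    def enable_token_latency_profiling(user_model, optimize_transformers_disabled: bool):
        # Replace the HF search loops with the versions exported from this module.
        ipex.transformers.optimize.convert_function(user_model, "greedy_search", _greedy_search)
        if optimize_transformers_disabled:
            ipex.transformers.optimize.convert_function(user_model, "beam_search", _beam_search)
        # The patched loops read this config flag to report per-token timings.
        user_model.config.token_latency = True
        return user_model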
