Commit 578162a: [CPP Graph] add opt cpp graph and chat application (#133)
1 parent: 1d2b4f0
19 files changed: +941 lines, -13 lines
intel_extension_for_transformers/llm/runtime/graph/README.md (1 addition, 1 deletion)

@@ -24,6 +24,7 @@ We support the following models:
 |[MPT-7B](https://huggingface.co/mosaicml/mpt-7b), [MPT-30B](https://huggingface.co/mosaicml/mpt-30b)|||
 |[Falcon-7B](https://huggingface.co/tiiuae/falcon-7b), [Falcon-40B](https://huggingface.co/tiiuae/falcon-40b)|||
 |[BLOOM-7B](https://huggingface.co/bigscience/bloomz-7b1)|||
+|[OPT-125m](https://huggingface.co/facebook/opt-125m), [OPT-350m](https://huggingface.co/facebook/opt-350m), [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b), [OPT-13B](https://huggingface.co/facebook/opt-13b)|||

 ### Code generation models
 | model name | INT8 | INT4|
@@ -45,7 +46,6 @@ ninja
 ### 2. Convert LLM
 LLM Runtime assumes the same model format as [llama.cpp](https://github.com/ggerganov/llama.cpp) and [ggml](https://github.com/ggerganov/ggml). You can also convert the model by following the steps below:
-
 ```bash
 # download fp32 model (e.g., LLAMA2) from Hugging Face
 git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf

intel_extension_for_transformers/llm/runtime/graph/application/CMakeLists.txt (2 additions, 0 deletions)

@@ -59,6 +59,7 @@ compile_quant(quant_dolly quant_model.cpp dolly gptneox)
 compile_quant(quant_llama quant_model.cpp llama llama)
 compile_quant(quant_mpt quant_model.cpp mpt mpt)
 compile_quant(quant_starcoder quant_model.cpp starcoder starcoder)
+compile_quant(quant_opt quant_model.cpp opt opt)
 compile_quant(quant_bloom quant_model.cpp bloom bloom)

 # all models running
@@ -80,4 +81,5 @@ compile_run(run_dolly main_run.cpp dolly gptneox)
 compile_run(run_llama main_run.cpp llama llama)
 compile_run(run_mpt main_run.cpp mpt mpt)
 compile_run(run_starcoder main_run.cpp starcoder starcoder)
+compile_run(run_opt main_run.cpp opt opt)
 compile_run(run_bloom main_run.cpp bloom bloom)

intel_extension_for_transformers/llm/runtime/graph/models/CMakeLists.txt (2 additions, 1 deletion)

@@ -19,4 +19,5 @@ add_subdirectory(mpt)
 add_subdirectory(gptneox)
 add_subdirectory(starcoder)
 add_subdirectory(falcon)
-add_subdirectory(bloom)
+add_subdirectory(opt)
+add_subdirectory(bloom)

intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_files.h (14 additions, 1 deletion)

@@ -211,9 +211,13 @@ struct model_file_loader {
     file.read_raw(&hparams.alibi_bias_max, sizeof(float));
     file.read_raw(&hparams.clip_qkv, sizeof(float));
     hparams.par_res = file.read_u32();
+    hparams.word_embed_proj_dim = file.read_u32();
+    hparams.do_layer_norm_before = bool(file.read_u32());
   }
   void read_vocab() {
     vocab.id_to_token.resize(hparams.n_vocab);
+    file.read_raw(&vocab.bos_token_id, sizeof(model_vocab::id));
+    file.read_raw(&vocab.eos_token_id, sizeof(model_vocab::id));

     for (uint32_t i = 0; i < hparams.n_vocab; i++) {
       uint32_t len = file.read_u32();
@@ -230,6 +234,7 @@ struct model_file_loader {
       tok_score.tok = std::move(word);
       tok_score.score = score;
     }
+
   }
   void read_tensor_metadata(size_t file_idx, model_load_tensors_map& tensors_map) {
     while (file.tell() < file.size) {
@@ -316,12 +321,16 @@ struct model_file_saver {
     file.write_raw(&hparams.alibi_bias_max, sizeof(float));
     file.write_raw(&hparams.clip_qkv, sizeof(float));
     file.write_u32(hparams.par_res);
+    file.write_u32(hparams.word_embed_proj_dim);
+    file.write_u32(static_cast<int>(hparams.do_layer_norm_before));
   }
   void write_vocab() {
     if (any_file_loader->file_version == MODEL_FILE_VERSION_NE) {
       fprintf(stderr, "model.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
     }
     uint32_t n_vocab = any_file_loader->hparams.n_vocab;
+    file.write_raw(&(any_file_loader->vocab.bos_token_id), sizeof(model_vocab::id));
+    file.write_raw(&(any_file_loader->vocab.eos_token_id), sizeof(model_vocab::id));
     for (uint32_t i = 0; i < n_vocab; i++) {
       const auto& token_score = any_file_loader->vocab.id_to_token.at(i);
       file.write_u32((uint32_t)token_score.tok.size());
@@ -410,7 +419,11 @@ struct model_model_loader {
     if (it == tensors_map.name_to_idx.end()) {
       it = tensors_map.name_to_idx.find("transformer.word_embeddings.weight");
       if (it == tensors_map.name_to_idx.end()) {
-        throw std::string("missing tok_embeddings.weight");
+        it = tensors_map.name_to_idx.find("model.decoder.embed_tokens.weight");
+        if (it != tensors_map.name_to_idx.end()) return 1;  // hacky solution for OPT loading
+        if (it == tensors_map.name_to_idx.end()) {
+          throw std::string("missing tok_embeddings.weight");
+        }
       }
     }
   }
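Taken together, the loader and saver changes extend the serialized model format: two OPT-specific hyperparameters are appended to the hparams block, the vocabulary's BOS/EOS ids are written just before the token list, and the embedding fallback now also accepts OPT's Hugging Face tensor name, model.decoder.embed_tokens.weight. Since the format carries no field tags, reader and writer must stay in lockstep. Below is a minimal standalone sketch of the same fixed-layout read/write pattern; OptExtras, write_extras, and read_extras are hypothetical names, and the four fields are grouped into one block here for brevity, unlike the real format:

```cpp
#include <cstdint>
#include <fstream>
#include <iostream>

// Hypothetical container for the new fields (grouped for brevity; the real
// format keeps the hparams fields and the vocab ids in separate blocks).
struct OptExtras {
  uint32_t word_embed_proj_dim;   // OPT projects embeddings when this != n_embd
  uint32_t do_layer_norm_before;  // stored as a u32, used as a bool
  int32_t bos_token_id;           // written just before the vocab entries
  int32_t eos_token_id;
};

// Fixed-width, order-sensitive writes: the reader must mirror this exactly.
void write_extras(std::ofstream& out, const OptExtras& e) {
  out.write(reinterpret_cast<const char*>(&e.word_embed_proj_dim), sizeof e.word_embed_proj_dim);
  out.write(reinterpret_cast<const char*>(&e.do_layer_norm_before), sizeof e.do_layer_norm_before);
  out.write(reinterpret_cast<const char*>(&e.bos_token_id), sizeof e.bos_token_id);
  out.write(reinterpret_cast<const char*>(&e.eos_token_id), sizeof e.eos_token_id);
}

OptExtras read_extras(std::ifstream& in) {
  OptExtras e{};
  in.read(reinterpret_cast<char*>(&e.word_embed_proj_dim), sizeof e.word_embed_proj_dim);
  in.read(reinterpret_cast<char*>(&e.do_layer_norm_before), sizeof e.do_layer_norm_before);
  in.read(reinterpret_cast<char*>(&e.bos_token_id), sizeof e.bos_token_id);
  in.read(reinterpret_cast<char*>(&e.eos_token_id), sizeof e.eos_token_id);
  return e;
}

int main() {
  { std::ofstream out("extras.bin", std::ios::binary); write_extras(out, {512, 1, 2, 2}); }  // demo values
  std::ifstream in("extras.bin", std::ios::binary);
  OptExtras e = read_extras(in);
  std::cout << e.word_embed_proj_dim << " " << e.bos_token_id << "\n";  // 512 2
}
```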

intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_types.h (15 additions, 9 deletions)

@@ -42,9 +42,9 @@
 #include "models/model_utils/util.h"

 #define MODEL_MAX_NORM 4
-#define MODEL_MAX_ATTN 4
+#define MODEL_MAX_ATTN 8
 #define MODEL_MAX_FFN 6
-#define MODEL_MAX_OTHERS 6
+#define MODEL_MAX_OTHERS 7

 #define MODEL_USE_SCRATCH
 #define MODEL_MAX_SCRATCH_BUFFERS 16
@@ -64,8 +64,10 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-enum model_archs { MODEL_UNKNOWN, MODEL_LLAMA, MODEL_GPTJ, MODEL_MPT, MODEL_GPTNEOX, MODEL_STARCODER, MODEL_FALCON,
-                   MODEL_BLOOM };
+
+enum model_archs { MODEL_UNKNOWN, MODEL_LLAMA, MODEL_GPTJ, MODEL_MPT, MODEL_GPTNEOX, MODEL_STARCODER, MODEL_FALCON,
+                   MODEL_OPT, MODEL_BLOOM };
+

 static const size_t MB = 1024 * 1024;

@@ -101,10 +103,12 @@ struct model_hparams {
   uint32_t n_layer = 32;
   uint32_t n_rot = 64;
   enum ne_ftype ftype = NE_FTYPE_MOSTLY_F16;
-  int32_t max_seq_len = 0;   // for mpt
-  float alibi_bias_max = 0;  // for mpt
-  float clip_qkv = 0;        // for mpt
-  int32_t par_res = 1;       // for neox: 1 = true, 0 = false
+  int32_t max_seq_len = 0;            // for mpt
+  float alibi_bias_max = 0;           // for mpt
+  float clip_qkv = 0;                 // for mpt
+  int32_t par_res = 1;                // for neox: 1 = true, 0 = false
+  uint32_t word_embed_proj_dim = 0;   // for opt
+  bool do_layer_norm_before = false;  // for opt

   bool operator!=(const model_hparams& other) const {
     return static_cast<bool>(memcmp(this, &other, sizeof(model_hparams)));
@@ -186,6 +190,8 @@ struct model_vocab {

   std::unordered_map<token, id> token_to_id;
   std::vector<token_score> id_to_token;
+  id bos_token_id = -1;  // defaults to -1 until read from the model file
+  id eos_token_id = -1;  // defaults to -1 until read from the model file
 };

 // reference: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
@@ -350,7 +356,7 @@ class model_name_to_arch {
   model_name_to_arch() {}
   // update this table when a new cpp model is added
   std::unordered_map<std::string, model_archs> name2arch_ = {
-      {"unknown", MODEL_UNKNOWN}, {"llama", MODEL_LLAMA}, {"gptj", MODEL_GPTJ}, {"mpt", MODEL_MPT},
+      {"unknown", MODEL_UNKNOWN}, {"llama", MODEL_LLAMA}, {"gptj", MODEL_GPTJ}, {"mpt", MODEL_MPT}, {"opt", MODEL_OPT},
       {"gptneox", MODEL_GPTNEOX}, {"dolly", MODEL_GPTNEOX}, {"starcoder", MODEL_STARCODER}, {"falcon", MODEL_FALCON},
       {"bloom", MODEL_BLOOM},
   };
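The name2arch_ table is what routes a model-name string such as "opt" to the new MODEL_OPT enum value at load time. A trimmed, self-contained sketch of that dispatch pattern (the enum and table are abbreviated, and find_arch is a hypothetical stand-in for model_name_to_arch's lookup):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

enum model_archs { MODEL_UNKNOWN, MODEL_LLAMA, MODEL_OPT, MODEL_BLOOM };

// Mirrors the name2arch_ idea: unknown names fall back to MODEL_UNKNOWN.
model_archs find_arch(const std::string& name) {
  static const std::unordered_map<std::string, model_archs> name2arch = {
      {"llama", MODEL_LLAMA}, {"opt", MODEL_OPT}, {"bloom", MODEL_BLOOM}};
  auto it = name2arch.find(name);
  return it == name2arch.end() ? MODEL_UNKNOWN : it->second;
}

int main() {
  std::cout << (find_arch("opt") == MODEL_OPT) << "\n";  // prints 1
}
```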

intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp (1 addition, 1 deletion)

@@ -286,7 +286,7 @@ static std::vector<model_vocab::id> model_tokenize(const model_vocab& vocab, con
   }

   if (bos) {
-    output.push_back(model_token_bos());
+    output.push_back(vocab.bos_token_id);
   }

   tokenizer.tokenize(text, output);
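This one-line change makes the BOS token model-dependent: instead of the hard-coded model_token_bos() constant, tokenization uses the id loaded from the model file, which matters for OPT since its Hugging Face config uses id 2 for both BOS and EOS. A minimal sketch of the pattern (vocab_t and tokenize are simplified stand-ins, not the real model_vocab/model_tokenize signatures):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

using id = int32_t;

// Simplified stand-in for model_vocab: only the field this sketch needs.
struct vocab_t {
  id bos_token_id = -1;  // -1 until the loader fills it in from the file
};

// The BOS id comes from the loaded vocab rather than a per-build constant,
// so the same tokenize path serves e.g. LLaMA (BOS = 1) and OPT (BOS = 2).
std::vector<id> tokenize(const vocab_t& vocab, const std::string& text, bool bos) {
  std::vector<id> output;
  if (bos) output.push_back(vocab.bos_token_id);
  // ... the real code appends subword ids for `text` here ...
  return output;
}

int main() {
  vocab_t opt_vocab{2};
  std::cout << tokenize(opt_vocab, "hello", /*bos=*/true)[0] << "\n";  // prints 2
}
```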

intel_extension_for_transformers/llm/runtime/graph/models/mpt/mpt.h (3 additions, 0 deletions)

@@ -21,12 +21,15 @@
 enum mpt_model {
   MPT_UNKNOWN,
   MPT_7B,
+  MPT_30B,
 };

 static const model_scratch mpt_mem_req(int n_layers) {
   switch (n_layers) {
     case 32:
       return {2048ull * MB, 2048ull * MB, 4096ull * MB, 3072ull * MB};
+    case 48:
+      return {4096ull * MB, 4096ull * MB, 8192ull * MB, 6144ull * MB};
     // TODO(hengyu): add more variants besides 6B
     default:
       MODEL_ASSERT(false);
intel_extension_for_transformers/llm/runtime/graph/models/opt/CMakeLists.txt (new file, 19 additions)

@@ -0,0 +1,19 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(TARGET opt)
+add_library_w_warning(${TARGET} opt.cpp opt_utils.cpp ${MODEL_UTILS_SOURCE})
+target_compile_features(${TARGET} PUBLIC cxx_std_11)  # don't bump
+set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(${TARGET} PUBLIC ne_layers ${LLAMA_EXTRA_LIBS} jblas::jblas)