This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Init arch Xe2 #298

Open
wants to merge 34 commits into base: xetla
Changes from 1 commit

Commits (34)
536e03e
save
sunjiweiswift May 17, 2024
5949084
save(some error with kslicing)
sunjiweiswift May 21, 2024
0669b12
fix kslicing bug
sunjiweiswift May 22, 2024
aafe774
save(g128 MTL 270Gflops bug on g32)
sunjiweiswift May 24, 2024
1b9a443
add Specialized for FPU
sunjiweiswift May 24, 2024
194ca35
support int scale col_major(with opt 10% perf when g = 32)
sunjiweiswift May 27, 2024
2bc4877
support int4x8 for int32 weight
sunjiweiswift May 27, 2024
8b9df8b
Update include/experimental/group/gemm/compute_policy.hpp
sunjiweiswift May 28, 2024
e8d3fbb
Update include/experimental/group/gemm/compute_policy.hpp
sunjiweiswift May 28, 2024
b0621df
save(perf bug with int4x8 load)
sunjiweiswift May 28, 2024
56be57a
save
sunjiweiswift May 29, 2024
2b37173
add first token UT
sunjiweiswift May 30, 2024
f973aa2
opt mma code
sunjiweiswift May 30, 2024
0f36c04
opt perf for int4x8
sunjiweiswift May 30, 2024
d9902d8
support load one fp16 data
sunjiweiswift May 31, 2024
30b8e95
support zero_pt
sunjiweiswift May 31, 2024
885995f
support ASYM and SYM
sunjiweiswift Jun 3, 2024
7e99e68
save
sunjiweiswift Jun 4, 2024
150f7d3
ut improve
sunjiweiswift Jun 6, 2024
ddbac97
support sg_n > 1
sunjiweiswift Jun 6, 2024
d2aff4b
add #pragma unroll
sunjiweiswift Jun 7, 2024
97c2481
support HF zero pt layout K x N, compress int4 along N dimensions
sunjiweiswift Jun 7, 2024
f19c86f
save
sunjiweiswift Jun 11, 2024
897f5d5
sg_m =4 for first token
sunjiweiswift Jun 14, 2024
e7f2716
Extract dequant func
sunjiweiswift Jun 14, 2024
0ebd890
update row_major for origin PVC/ARC template
sunjiweiswift Jun 17, 2024
b2dfad5
save(fix HPC 2D load)
sunjiweiswift Jun 17, 2024
8817f54
fix XEHPC 2D load
sunjiweiswift Jun 17, 2024
957c5a4
fix compile for all UT
sunjiweiswift Jun 17, 2024
5456fc0
sync ipex 20240618
DDEle Jun 19, 2024
9185409
opt PVC arch
sunjiweiswift Jun 19, 2024
93c8ad1
fix group_qkv
sunjiweiswift Jun 19, 2024
8f0abc4
fix group_qkv
sunjiweiswift Jun 20, 2024
dc7d812
init arch Xe2
airMeng Jun 20, 2024
55 changes: 48 additions & 7 deletions include/common/core/arch_config.hpp
@@ -31,9 +31,8 @@ struct load_store_attr_t {
static constexpr bool has_hw_block_2d = false;
};

template <>
struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc> {
/// HW limitation checks https://gfxspecs.intel.com/Predator/Home/Index/55490
Comment on lines -34 to -36 (Contributor):
There are multiple places with the gfxspecs link. I think they are helpful for internal developers. If they violate any company policies, they should be removed all at once in a separate PR.

template <msg_type message_type, gpu_arch arg_tag>
struct xe_plus_load_store_attr_t {
static constexpr bool has_hw_block_2d = true;
static constexpr uint32_t max_load_height_in_elem = 32;
static constexpr uint32_t max_load_width_in_bytes = 64;
@@ -55,10 +54,9 @@ struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc> {

template <msg_type message_type, gpu_arch arg_tag>
struct client_load_store_attr_base_t {
/// HW limitation checks https://gfxspecs.intel.com/Predator/Home/Index/55490
static constexpr bool has_hw_block_2d = false;
static constexpr uint32_t max_load_height_in_elem = 32;
static constexpr uint32_t max_load_width_in_bytes = 64;
static constexpr uint32_t max_load_height_in_elem = 0;
static constexpr uint32_t max_load_width_in_bytes = 0;
static constexpr uint32_t max_trans_load_width_in_bytes = 32;
static constexpr uint32_t max_vnni_load_width_in_elems = 16;
static constexpr uint32_t min_vnni_load_height_in_bytes = 4;
@@ -87,6 +85,18 @@ struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeLpg>
msg_type::block_2d,
gpu_arch::XeLpg> {};

template <>
struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc>
: public xe_plus_load_store_attr_base_t<
msg_type::block_2d,
gpu_arch::XeHpc> {};

template <>
struct load_store_attr_t<msg_type::block_2d, gpu_arch::Xe2>
: public xe_plus_load_store_attr_base_t<
msg_type::block_2d,
gpu_arch::Xe2> {};

template <gpu_arch arch_tag>
inline constexpr bool arch_has_2d_load_store =
load_store_attr_t<msg_type::block_2d, arch_tag>::has_hw_block_2d;
@@ -105,6 +115,13 @@ struct load_store_attr_t<msg_type::block_1d, gpu_arch::XeHpc> {
static constexpr uint32_t max_prefetch_vec_len = 64;
};

template <>
struct load_store_attr_t<msg_type::block_1d, gpu_arch::Xe2> {
static constexpr uint32_t max_load_vec_len = 512;
static constexpr uint32_t max_store_vec_len = 512;
static constexpr uint32_t max_prefetch_vec_len = 64;
};

struct dpas_attr_base_t {
static constexpr bool has_xmx = true;
static constexpr uint32_t systolic_depth = 8;
@@ -129,6 +146,11 @@ struct dpas_attr_t<gpu_arch::XeHpg> : public dpas_attr_base_t {
static constexpr uint32_t n_fixed_limit = 8;
};

template <>
struct dpas_attr_t<gpu_arch::Xe2> : public dpas_attr_t<gpu_arch::XeHpc> {
static constexpr uint32_t systolic_depth = 4;
};

template <gpu_arch arch_tag>
inline constexpr bool arch_has_xmx = dpas_attr_t<arch_tag>::has_xmx;

@@ -162,6 +184,10 @@ template <>
struct register_bytes_t<gpu_arch::XeLpg> {
static constexpr uint32_t reg_in_bytes = 32;
};
template <>
struct register_bytes_t<gpu_arch::Xe2> {
static constexpr uint32_t reg_in_bytes = 64;
};

template <grf_mode grf_num_mode, gpu_arch arch_tag>
struct register_attr_t {
@@ -236,10 +262,25 @@ struct arch_attr_t<gpu_arch::XeLpg> {

using dpas_attr = dpas_attr_t<gpu_arch::XeLpg>;

static constexpr uint32_t max_wg_num = 64;
static constexpr uint32_t max_wg_num = 16;
static constexpr uint32_t local_mem_size = 64 * 1024;
};

template <>
struct arch_attr_t<gpu_arch::Xe2> {
template <msg_type message_type = msg_type::block_2d>
using load_store_attr = load_store_attr_t<message_type, gpu_arch::Xe2>;

template <grf_mode grf_num_mode = grf_mode::double_grf>
using register_attr = register_attr_t<grf_num_mode, gpu_arch::Xe2>;

using dpas_attr = dpas_attr_t<gpu_arch::Xe2>;

static constexpr uint32_t max_wg_num = 16;
static constexpr uint32_t local_mem_size = 128 * 1024;
};


/// @} xetla_core_arch_config

} // namespace gpu::xetla
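
For reviewers, a minimal compile-time sanity sketch of the Xe2 entries added above. Trait names and values are taken from this diff; the include path and standalone usage are assumptions.

// Illustrative only: compile-time checks against the new Xe2 entries.
#include "common/core/arch_config.hpp"   // assumed include path

using namespace gpu::xetla;

// Workgroup budget and shared local memory chosen for Xe2.
static_assert(arch_attr_t<gpu_arch::Xe2>::max_wg_num == 16);
static_assert(arch_attr_t<gpu_arch::Xe2>::local_mem_size == 128 * 1024);

// Xe2 keeps XMX but halves the systolic depth relative to XeHpc.
static_assert(arch_has_xmx<gpu_arch::Xe2>);
static_assert(dpas_attr_t<gpu_arch::Xe2>::systolic_depth == 4);

// The block_2d specialization for Xe2 reports HW 2D block load/store,
// so the block_2d payload path further down is selected for it.
static_assert(arch_has_2d_load_store<gpu_arch::Xe2>);

// Xe2 uses 64-byte GRF registers, matching XeHpc rather than the client parts.
static_assert(register_bytes_t<gpu_arch::Xe2>::reg_in_bytes == 64);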
2 changes: 1 addition & 1 deletion include/common/core/common_types.hpp
@@ -21,7 +21,7 @@
#include <cstdint>

namespace gpu::xetla {
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2, Xe2 = 3 };

enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };

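A small illustrative check (not part of the diff) of how the new enumerator interacts with ordering-based guards; it assumes the header can be included standalone.

// Illustrative only. Xe2 = 3 compares greater than every pre-existing
// enumerator, so guards such as `arch_tag <= gpu_arch::XeHpg` still exclude
// it; the payload specializations later in this PR therefore switch to the
// arch_has_2d_load_store trait so that Xe2 takes the 2D-block path.
#include "common/core/common_types.hpp"   // assumed include path
using gpu::xetla::gpu_arch;

static_assert(gpu_arch::Xe2 > gpu_arch::XeHpc);
static_assert(!(gpu_arch::Xe2 <= gpu_arch::XeHpg));
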
15 changes: 7 additions & 8 deletions include/group/gemm/compute_policy.hpp
@@ -118,16 +118,15 @@ struct compute_policy_default_fpu<
static constexpr int sync_freq = perf_tuning_knob::sync_freq;
static constexpr int k_stride = perf_tuning_knob::k_stride;

static constexpr uint32_t block_size_y_a =
arch_tag_ == gpu_arch::XeLpg ? 8 : 16;
static constexpr uint32_t block_bytes_x_a = 32;
static constexpr uint32_t block_size_y_a = 16;
using mma_attr = mma_attr_t<arch_tag_, block_size_y_a>;
static constexpr uint32_t block_bytes_x_a = mma_attr::mma_k_in_bytes;
static constexpr uint32_t block_size_x_a =
block_bytes_x_a / sizeof(dtype_mma_a);
static constexpr uint32_t block_bytes_x_b =
arch_attr_t<arch_tag>::template register_attr<>::reg_in_bytes;
static constexpr uint32_t block_size_x_b =
block_bytes_x_b / sizeof(dtype_mma_b);
static constexpr uint32_t block_size_y_b = block_size_x_a;
static constexpr uint32_t block_size_x_b = mma_attr::mma_n_in_elem;
static constexpr uint32_t block_bytes_y_b = mma_attr::mma_k_in_bytes;
static constexpr uint32_t block_size_y_b =
block_bytes_y_b / sizeof(dtype_mma_b);
};

/// @} xetla_gemm
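A worked example of the new block-size derivation above, using assumed numbers; the real values come from mma_attr_t<arch_tag_, block_size_y_a> and differ per architecture and data type.

#include <cstdint>

// Illustrative only: assumed mma_attr values for an fp16/bf16 GEMM.
constexpr uint32_t mma_k_in_bytes = 32;   // assumed
constexpr uint32_t mma_n_in_elem = 16;    // assumed
constexpr uint32_t sizeof_dtype_mma = 2;  // fp16/bf16 element size in bytes

constexpr uint32_t block_size_y_a = 16;
constexpr uint32_t block_bytes_x_a = mma_k_in_bytes;                     // 32 bytes
constexpr uint32_t block_size_x_a = block_bytes_x_a / sizeof_dtype_mma;  // 16 elements
constexpr uint32_t block_size_x_b = mma_n_in_elem;                       // 16 elements
constexpr uint32_t block_bytes_y_b = mma_k_in_bytes;                     // 32 bytes
constexpr uint32_t block_size_y_b = block_bytes_y_b / sizeof_dtype_mma;  // 16 elements

// The K extents of the A and B blocks now agree by construction.
static_assert(block_size_x_a == block_size_y_b);
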
6 changes: 3 additions & 3 deletions include/subgroup/tile/impl/payload_xe.hpp
@@ -1101,7 +1101,7 @@ struct mem_payload_t<
tile_desc_,
msg_type::block_2d,
arch_tag_,
std::enable_if_t<(arch_tag_ <= gpu_arch::XeHpg)>> {
std::enable_if_t<(arch_has_2d_load_store<arch_tag_>)>> {
using dtype = native_type_t<dtype_>;
using mem_desc_t =
mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
@@ -1652,7 +1652,7 @@ struct prefetch_payload_t<
num_coop_sg_,
arch_tag_,
std::enable_if_t<
arch_tag_ <= gpu_arch::XeHpg &&
arch_has_2d_load_store<arch_tag_> &&
((block_size_y_ != 1 && mem_layout_ == mem_layout::row_major) ||
(block_size_x_ != 1 && mem_layout_ == mem_layout::col_major))>> {
using dtype = native_type_t<dtype_>;
@@ -2305,4 +2305,4 @@ struct prefetch_payload_t<
__XETLA_API void update_tdesc([[maybe_unused]] int offset) {}
};

} // namespace gpu::xetla::subgroup
} // namespace gpu::xetla::subgroup
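
The gating change above can be summarized with a stripped-down sketch; the traits and payload below are simplified stand-ins, not the real XeTLA types.

#include <cstdint>
#include <type_traits>

// Simplified stand-ins for the traits in include/common/core/arch_config.hpp.
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2, Xe2 = 3 };

template <gpu_arch arch_tag>
struct load_store_attr {
  static constexpr bool has_hw_block_2d = false;
};
template <>
struct load_store_attr<gpu_arch::XeHpc> {
  static constexpr bool has_hw_block_2d = true;
};
template <>
struct load_store_attr<gpu_arch::Xe2> {
  static constexpr bool has_hw_block_2d = true;
};

template <gpu_arch arch_tag>
inline constexpr bool arch_has_2d_load_store =
    load_store_attr<arch_tag>::has_hw_block_2d;

// Primary template: generic (non-2D-block) payload path.
template <gpu_arch arch_tag, typename = void>
struct mem_payload {
  static constexpr bool uses_block_2d = false;
};

// Partial specialization chosen for any arch whose trait reports HW 2D block
// load/store, instead of hard-coding a comparison against one enumerator.
template <gpu_arch arch_tag>
struct mem_payload<arch_tag, std::enable_if_t<arch_has_2d_load_store<arch_tag>>> {
  static constexpr bool uses_block_2d = true;
};

static_assert(mem_payload<gpu_arch::Xe2>::uses_block_2d);
static_assert(!mem_payload<gpu_arch::XeLpg>::uses_block_2d);

New architectures then opt into the 2D-block payload by providing a load_store_attr_t specialization with has_hw_block_2d = true, which is what the Xe2 additions in arch_config.hpp do.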
2 changes: 1 addition & 1 deletion include/subgroup/tile/impl/prefetch_xe.hpp
@@ -195,4 +195,4 @@ __XETLA_API
typename std::enable_if_t<detail::check_prefetch_type<payload_t>::is_local>
tile_prefetch([[maybe_unused]] payload_t& payload) {}

} // namespace gpu::xetla::subgroup
} // namespace gpu::xetla::subgroup