Skip to content

8351623: VectorAPI: Add SVE implementation of subword gather load operation #26236

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions src/hotspot/cpu/aarch64/aarch64.ad
Original file line number Diff line number Diff line change
Expand Up @@ -2401,6 +2401,20 @@ uint Matcher::vector_ideal_reg(int len) {
return 0;
}

// Vector ideal reg size corresponding to the specified len in bytes.
// Maps the ideal register class chosen for 'len' back to a concrete
// register size: VecD = 8 bytes, VecX = 16 bytes, and the scalable
// VecA covers the full MaxVectorSize.
uint Matcher::vector_ideal_reg_size(int len) {
  // Fix: give the assert a diagnostic message instead of an empty string,
  // per HotSpot convention, so debug-build failures are self-explanatory.
  assert(MaxVectorSize >= len, "vector length must not exceed MaxVectorSize");
  uint ideal_reg = vector_ideal_reg(len);
  switch (ideal_reg) {
    case Op_VecD: return 8;
    case Op_VecX: return 16;
    case Op_VecA: return MaxVectorSize;  // scalable (SVE) register
    default:
      ShouldNotReachHere();
      return 0;
  }
}

MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
assert(Matcher::is_generic_vector(generic_opnd), "not generic");
switch (ideal_reg) {
Expand Down Expand Up @@ -2647,12 +2661,13 @@ bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
// into registers?
bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {

// Loads and stores with indirect memory input (e.g., volatile loads and
// stores) do not subsume the input into complex addressing expressions. If
// the addressing expression is input to at least one such load or store, do
// not clone the addressing expression. Query needs_acquiring_load and
// needs_releasing_store as a proxy for indirect memory input, as it is not
// possible to directly query for indirect memory input at this stage.
// Loads and stores with indirect memory input (e.g., volatile loads/stores,
// and vector gather_loads/scatter_stores) do not subsume the input into
// complex addressing expressions. If the addressing expression is input
// to at least one such load or store, do not clone the addressing expression.
// Query needs_acquiring_load and needs_releasing_store as a proxy for
// indirect memory input, as it is not possible to directly query for indirect
// memory input at this stage.
for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) {
Node* n = m->fast_out(i);
if (n->is_Load() && needs_acquiring_load(n)) {
Expand All @@ -2661,6 +2676,13 @@ bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack,
if (n->is_Store() && needs_releasing_store(n)) {
return false;
}

if (n->is_LoadVectorGather() ||
n->is_StoreVectorScatter() ||
n->is_LoadVectorGatherMasked() ||
n->is_StoreVectorScatterMasked()) {
return false;
}
}

if (clone_base_plus_offset_address(m, mstack, address_visited)) {
Expand Down
172 changes: 165 additions & 7 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -168,22 +168,21 @@ source %{
case Op_MaskAll:
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_LoadVectorGather:
case Op_LoadVectorGatherMasked:
case Op_StoreVectorMasked:
case Op_StoreVectorScatter:
case Op_StoreVectorScatterMasked:
case Op_PopulateIndex:
case Op_CompressM:
case Op_CompressV:
// Temporarily disable vector mask widen support for NEON,
// because we do not have the use case now.
case Op_VectorMaskWiden:
if (UseSVE == 0) {
return false;
}
break;
case Op_LoadVectorGather:
case Op_LoadVectorGatherMasked:
if (UseSVE == 0 || is_subword_type(bt)) {
return false;
}
break;
case Op_MulAddVS2VI:
if (length_in_bytes != 16) {
return false;
Expand Down Expand Up @@ -358,6 +357,12 @@ source %{
return false;
}

// SVE requires vector indices for gather-load/scatter-store operations
// on all data types.
// Note: 'bt' is intentionally unused -- on SVE the vector-index
// requirement does not depend on the element type.
bool Matcher::gather_scatter_needs_vector_index(BasicType bt) {
  return UseSVE > 0;
}

// Assert that the given node is not a variable shift.
bool assert_not_var_shift(const Node* n) {
assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift");
Expand Down Expand Up @@ -5274,6 +5279,35 @@ instruct extractD(vRegD dst, vReg src, immI idx) %{
ins_pipe(pipe_slow);
%}

// ---------------------------- Vector Slice ------------------------

// Slice a pair of vectors: dst receives a contiguous run of elements from
// the concatenation src1:src2, starting at element 'index'. On NEON this
// maps to a single EXT, which extracts bytes from the register pair at a
// byte offset.
instruct vslice_neon(vReg dst, vReg src1, vReg src2, immI index) %{
  predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst (VectorSlice (Binary src1 src2) index));
  format %{ "vslice_neon $dst, $src1, $src2, $index" %}
  ins_encode %{
    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
    // Convert the element index into the byte offset EXT expects.
    uint scale = type2aelembytes(Matcher::vector_element_basic_type(this));
    __ ext($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
           $src1$$FloatRegister, $src2$$FloatRegister,
           ((uint)$index$$constant * scale));
  %}
  ins_pipe(pipe_slow);
%}

// SVE variant of vector slice. SVE EXT is destructive (the first source
// operand is also the destination), which is why the rule ties the
// destination and first source together as 'dst_src1'.
instruct vslice_sve(vReg dst_src1, vReg src2, immI index) %{
  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)));
  match(Set dst_src1 (VectorSlice (Binary dst_src1 src2) index));
  format %{ "vslice_sve $dst_src1, $dst_src1, $src2, $index" %}
  ins_encode %{
    assert(UseSVE > 0, "must be sve");
    // Convert the element index into the byte offset EXT expects.
    uint scale = type2aelembytes(Matcher::vector_element_basic_type(this));
    __ sve_ext($dst_src1$$FloatRegister, $src2$$FloatRegister,
               ((uint)$index$$constant * scale));
  %}
  ins_pipe(pipe_slow);
%}

// ------------------------------ Vector mask load/store -----------------------

// vector load mask
Expand Down Expand Up @@ -5937,6 +5971,32 @@ instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{
ins_pipe(pipe_slow);
%}

// Vector mask widen to twice size
//
// Unpack elements from the lowest or highest half of the source
// predicate and place in elements of twice their size within the
// destination predicate.

// Widen the lower half of the source predicate: each element of the low
// half of src is unpacked into an element of twice its size in dst
// (SVE PUNPKLO).
instruct vmaskwiden_lo_sve(pReg dst, pReg src) %{
  predicate(UseSVE > 0 && n->as_VectorMaskWiden()->is_lo());
  match(Set dst (VectorMaskWiden src));
  format %{ "vmaskwiden_lo_sve $dst, $src" %}
  ins_encode %{
    __ sve_punpklo($dst$$PRegister, $src$$PRegister);
  %}
  ins_pipe(pipe_slow);
%}

instruct vmaskwiden_hi_sve(pReg dst, pReg src) %{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can both the hi and lo widen rules be combined into a single one as the arguments are the same? or would it make it less understandable?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main problem is that we cannot get the flag of __is_lo easily from the relative machnode as far as I know.

predicate(UseSVE > 0 && !n->as_VectorMaskWiden()->is_lo());
match(Set dst (VectorMaskWiden src));
format %{ "vmaskwiden_hi_sve $dst, $src" %}
ins_encode %{
__ sve_punpkhi($dst$$PRegister, $src$$PRegister);
%}
ins_pipe(pipe_slow);
%}

// vector mask reinterpret

instruct vmask_reinterpret_same_esize(pReg dst_src) %{
Expand Down Expand Up @@ -6670,6 +6730,55 @@ instruct rearrange(vReg dst, vReg src, vReg shuffle) %{

// ------------------------------ Vector Load Gather ---------------------------

// Gather load of subword (byte/short) elements when the index vector
// (input 3 of LoadVectorGather) is at most 128 bits. The SVE gather
// loads each element zero-extended into a 32-bit lane; the values are
// then narrowed back to their subword size with NEON XTN -- once for
// shorts (S -> H), twice for bytes (S -> H -> B).
// dst is TEMP_DEF because it is clobbered by the intermediate wide form
// before the final narrow completes.
instruct gather_load_subword_le128(vReg dst, indirect mem, vReg idx) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 &&
            Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) <= 16);
  match(Set dst (LoadVectorGather mem idx));
  effect(TEMP_DEF dst);
  format %{ "gather_load_subword_le128 $dst, $mem, $idx\t# vector (sve)" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (bt == T_BYTE) {
      __ sve_ld1b_gather($dst$$FloatRegister, ptrue,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow the gathered 32-bit lanes down to bytes: S -> H -> B.
      __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S);
      __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H);
    } else {
      assert(bt == T_SHORT, "unsupported type");
      __ sve_ld1h_gather($dst$$FloatRegister, ptrue,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow the gathered 32-bit lanes down to shorts: S -> H.
      __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Gather load of subword (byte/short) elements when the index vector
// (input 3 of LoadVectorGather) is wider than 128 bits. Elements are
// gathered zero-extended into 32-bit lanes, then narrowed with SVE UZP1
// against a zeroed temporary: UZP1 packs the even-indexed subword lanes
// of (dst, vtmp), so with vtmp == 0 the data ends up compacted into the
// low part of dst with zeros above.
instruct gather_load_subword_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 &&
            Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) > 16);
  match(Set dst (LoadVectorGather mem idx));
  effect(TEMP_DEF dst, TEMP vtmp);
  format %{ "gather_load_subword_gt128 $dst, $mem, $idx\t# vector (sve). KILL $vtmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    // Zero the temporary so the unused uzp1 lanes come out as zero.
    __ sve_dup($vtmp$$FloatRegister, __ S, 0);
    if (bt == T_BYTE) {
      __ sve_ld1b_gather($dst$$FloatRegister, ptrue,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow S -> H -> B by packing even lanes with the zeroed vtmp.
      __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister);
      __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister);
    } else {
      assert(bt == T_SHORT, "unsupported type");
      __ sve_ld1h_gather($dst$$FloatRegister, ptrue,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow S -> H by packing even lanes with the zeroed vtmp.
      __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{
predicate(UseSVE > 0 &&
type2aelembytes(Matcher::vector_element_basic_type(n)) == 4);
Expand All @@ -6680,7 +6789,7 @@ instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
__ sve_ld1w_gather($dst$$FloatRegister, ptrue,
as_Register($mem$$base), $idx$$FloatRegister);
%}
%}
ins_pipe(pipe_slow);
%}

Expand All @@ -6700,6 +6809,55 @@ instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{
ins_pipe(pipe_slow);
%}

// Masked gather load of subword (byte/short) elements when the index
// vector is at most 128 bits. Same narrowing scheme as the unmasked
// le128 rule (gather into 32-bit lanes, then XTN down), except the
// gather is governed by the predicate register pg instead of ptrue.
// Note: in(3) of LoadVectorGatherMasked is the (Binary idx pg) node, so
// in(3)->in(1) is the index vector.
instruct gather_load_subword_masked_le128(vReg dst, indirect mem, vReg idx, pRegGov pg) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 &&
            Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) <= 16);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx pg)));
  effect(TEMP_DEF dst);
  format %{ "gather_load_subword_masked_le128 $dst, $pg, $mem, $idx\t# vector (sve)" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    if (bt == T_BYTE) {
      __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow the gathered 32-bit lanes down to bytes: S -> H -> B.
      __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S);
      __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H);
    } else {
      assert(bt == T_SHORT, "unsupported type");
      __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow the gathered 32-bit lanes down to shorts: S -> H.
      __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S);
    }
  %}
  ins_pipe(pipe_slow);
%}

// Masked gather load of subword (byte/short) elements when the index
// vector is wider than 128 bits. Same narrowing scheme as the unmasked
// gt128 rule (gather into 32-bit lanes, then UZP1 against a zeroed
// temporary), except the gather is governed by the predicate register pg.
// Note: in(3) of LoadVectorGatherMasked is the (Binary idx pg) node, so
// in(3)->in(1) is the index vector.
instruct gather_load_subword_masked_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp, pRegGov pg) %{
  predicate(UseSVE > 0 &&
            type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 &&
            Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) > 16);
  match(Set dst (LoadVectorGatherMasked mem (Binary idx pg)));
  effect(TEMP_DEF dst, TEMP vtmp);
  format %{ "gather_load_subword_masked_gt128 $dst, $pg, $mem, $idx\t# vector (sve). KILL $vtmp" %}
  ins_encode %{
    BasicType bt = Matcher::vector_element_basic_type(this);
    // Zero the temporary so the unused uzp1 lanes come out as zero.
    __ sve_dup($vtmp$$FloatRegister, __ S, 0);
    if (bt == T_BYTE) {
      __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow S -> H -> B by packing even lanes with the zeroed vtmp.
      __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister);
      __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister);
    } else {
      assert(bt == T_SHORT, "unsupported type");
      __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister,
                         as_Register($mem$$base), $idx$$FloatRegister);
      // Narrow S -> H by packing even lanes with the zeroed vtmp.
      __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister);
    }
  %}
  ins_pipe(pipe_slow);
%}

instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{
predicate(UseSVE > 0 &&
type2aelembytes(Matcher::vector_element_basic_type(n)) == 4);
Expand Down
Loading