diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 456712cf8da70..fcc24bbb985eb 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -2401,6 +2401,20 @@ uint Matcher::vector_ideal_reg(int len) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int len) { + assert(MaxVectorSize >= len, ""); + uint ideal_reg = vector_ideal_reg(len); + switch (ideal_reg) { + case Op_VecD: return 8; + case Op_VecX: return 16; + case Op_VecA: return MaxVectorSize; + default: + ShouldNotReachHere(); + return 0; + } +} + MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) { assert(Matcher::is_generic_vector(generic_opnd), "not generic"); switch (ideal_reg) { @@ -2647,12 +2661,13 @@ bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { // into registers? bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) { - // Loads and stores with indirect memory input (e.g., volatile loads and - // stores) do not subsume the input into complex addressing expressions. If - // the addressing expression is input to at least one such load or store, do - // not clone the addressing expression. Query needs_acquiring_load and - // needs_releasing_store as a proxy for indirect memory input, as it is not - // possible to directly query for indirect memory input at this stage. + // Loads and stores with indirect memory input (e.g., volatile loads/stores, + // and vector gather_loads/scatter_stores) do not subsume the input into + // complex addressing expressions. If the addressing expression is input + // to at least one such load or store, do not clone the addressing expression. 
+ // Query needs_acquiring_load and needs_releasing_store as a proxy for + // indirect memory input, as it is not possible to directly query for indirect + // memory input at this stage. for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) { Node* n = m->fast_out(i); if (n->is_Load() && needs_acquiring_load(n)) { @@ -2661,6 +2676,13 @@ bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, if (n->is_Store() && needs_releasing_store(n)) { return false; } + + if (n->is_LoadVectorGather() || + n->is_StoreVectorScatter() || + n->is_LoadVectorGatherMasked() || + n->is_StoreVectorScatterMasked()) { + return false; + } } if (clone_base_plus_offset_address(m, mstack, address_visited)) { diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index b4e6d79347f1f..86fcd7530f685 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -168,22 +168,21 @@ source %{ case Op_MaskAll: case Op_VectorMaskGen: case Op_LoadVectorMasked: + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: case Op_StoreVectorMasked: case Op_StoreVectorScatter: case Op_StoreVectorScatterMasked: case Op_PopulateIndex: case Op_CompressM: case Op_CompressV: + // Temporarily disable vector mask widen support for NEON, + // because we do not have the use case now. 
+ case Op_VectorMaskWiden: if (UseSVE == 0) { return false; } break; - case Op_LoadVectorGather: - case Op_LoadVectorGatherMasked: - if (UseSVE == 0 || is_subword_type(bt)) { - return false; - } - break; case Op_MulAddVS2VI: if (length_in_bytes != 16) { return false; @@ -5274,6 +5273,35 @@ instruct extractD(vRegD dst, vReg src, immI idx) %{ ins_pipe(pipe_slow); %} +// ---------------------------- Vector Slice ------------------------ + +instruct vslice_neon(vReg dst, vReg src1, vReg src2, immI index) %{ + predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (VectorSlice (Binary src1 src2) index)); + format %{ "vslice_neon $dst, $src1, $src2, $index" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ ext($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B, + $src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + +instruct vslice_sve(vReg dst_src1, vReg src2, immI index) %{ + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (VectorSlice (Binary dst_src1 src2) index)); + format %{ "vslice_sve $dst_src1, $dst_src1, $src2, $index" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ sve_ext($dst_src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector mask load/store ----------------------- // vector load mask @@ -5937,6 +5965,32 @@ instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{ ins_pipe(pipe_slow); %} +// Vector mask widen to twice size +// +// Unpack elements from the lowest or highest half of the source +// predicate and place in elements of twice their size within the +// destination predicate. 
+ +instruct vmaskwiden_lo_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_lo_sve $dst, $src" %} + ins_encode %{ + __ sve_punpklo($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vmaskwiden_hi_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && !n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_hi_sve $dst, $src" %} + ins_encode %{ + __ sve_punpkhi($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + // vector mask reinterpret instruct vmask_reinterpret_same_esize(pReg dst_src) %{ @@ -6670,6 +6724,55 @@ instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ // ------------------------------ Vector Load Gather --------------------------- +instruct gather_load_subword_le128(vReg dst, indirect mem, vReg idx) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) <= 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_le128 $dst, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + 
Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) > 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_gt128 $dst, $mem, $idx\t# vector (sve). KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); @@ -6680,7 +6783,7 @@ instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_ld1w_gather($dst$$FloatRegister, ptrue, as_Register($mem$$base), $idx$$FloatRegister); - %} + %} ins_pipe(pipe_slow); %} @@ -6700,6 +6803,55 @@ instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{ ins_pipe(pipe_slow); %} +instruct gather_load_subword_masked_le128(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) <= 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_masked_le128 $dst, $pg, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == 
T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_masked_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) > 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_masked_gt128 $dst, $pg, $mem, $idx\t# vector (sve). KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 
cc07e0e407615..234a9504dbfbc 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -158,22 +158,21 @@ source %{ case Op_MaskAll: case Op_VectorMaskGen: case Op_LoadVectorMasked: + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: case Op_StoreVectorMasked: case Op_StoreVectorScatter: case Op_StoreVectorScatterMasked: case Op_PopulateIndex: case Op_CompressM: case Op_CompressV: + // Temporarily disable vector mask widen support for NEON, + // because we do not have the use case now. + case Op_VectorMaskWiden: if (UseSVE == 0) { return false; } break; - case Op_LoadVectorGather: - case Op_LoadVectorGatherMasked: - if (UseSVE == 0 || is_subword_type(bt)) { - return false; - } - break; case Op_MulAddVS2VI: if (length_in_bytes != 16) { return false; @@ -3419,6 +3418,35 @@ EXTRACT_FP(F, fmovs, 4, S, 2) // DOUBLE EXTRACT_FP(D, fmovd, 2, D, 3) +// ---------------------------- Vector Slice ------------------------ + +instruct vslice_neon(vReg dst, vReg src1, vReg src2, immI index) %{ + predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (VectorSlice (Binary src1 src2) index)); + format %{ "vslice_neon $dst, $src1, $src2, $index" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ ext($dst$$FloatRegister, length_in_bytes == 16 ? 
__ T16B : __ T8B, + $src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + +instruct vslice_sve(vReg dst_src1, vReg src2, immI index) %{ + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (VectorSlice (Binary dst_src1 src2) index)); + format %{ "vslice_sve $dst_src1, $dst_src1, $src2, $index" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ sve_ext($dst_src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector mask load/store ----------------------- // vector load mask @@ -3991,6 +4019,32 @@ instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{ ins_pipe(pipe_slow); %} +// Vector mask widen to twice size +// +// Unpack elements from the lowest or highest half of the source +// predicate and place in elements of twice their size within the +// destination predicate. 
+ +instruct vmaskwiden_lo_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_lo_sve $dst, $src" %} + ins_encode %{ + __ sve_punpklo($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vmaskwiden_hi_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && !n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_hi_sve $dst, $src" %} + ins_encode %{ + __ sve_punpkhi($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + // vector mask reinterpret instruct vmask_reinterpret_same_esize(pReg dst_src) %{ @@ -4680,6 +4734,55 @@ instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ // ------------------------------ Vector Load Gather --------------------------- +instruct gather_load_subword_le128(vReg dst, indirect mem, vReg idx) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) <= 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_le128 $dst, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + 
Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) > 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_gt128 $dst, $mem, $idx\t# vector (sve). KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); @@ -4690,7 +4793,7 @@ instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_ld1w_gather($dst$$FloatRegister, ptrue, as_Register($mem$$base), $idx$$FloatRegister); - %} + %} ins_pipe(pipe_slow); %} @@ -4710,6 +4813,55 @@ instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{ ins_pipe(pipe_slow); %} +instruct gather_load_subword_masked_le128(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) <= 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_masked_le128 $dst, $pg, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == 
T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_masked_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) > 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_masked_gt128 $dst, $pg, $mem, $idx\t# vector (sve). KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 
2e35763aa4326..f86514869604d 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -3660,6 +3660,10 @@ template f(op1, 31, 25), f(type, 24, 23), f(op2, 22, 21), rf(Zm, 16); \ f(op3, 15, 13), pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); \ } + // SVE 8-bit gather load bytes (scalar plus 32-bit unscaled offsets) + INSN(sve_ld1b_gather, 0b1000010, 0b00, 0b00, 0b010); + // SVE 16-bit gather load halfwords (scalar plus 32-bit scaled offsets) + INSN(sve_ld1h_gather, 0b1000010, 0b01, 0b01, 0b010); // SVE 32-bit gather load words (scalar plus 32-bit scaled offsets) INSN(sve_ld1w_gather, 0b1000010, 0b10, 0b01, 0b010); // SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets) diff --git a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp index 0fbc2ef141e8b..acb3cf678d30a 100644 --- a/src/hotspot/cpu/aarch64/matcher_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/matcher_aarch64.hpp @@ -164,6 +164,12 @@ // Implements a variant of EncodeISOArrayNode that encode ASCII only static const bool supports_encode_ascii_array = true; + // SVE requires vector indices for gather-load/scatter-store operations + // on all data types. 
+ static bool gather_scatter_needs_vector_index(BasicType bt) { + return has_predicated_vectors(); + } + // An all-set mask is used for the alltrue vector test with SVE static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) { return is_predicate && is_alltrue; diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index 4a0b557968caa..904f4815a11f8 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1037,6 +1037,12 @@ uint Matcher::vector_ideal_reg(int size) { return 0; } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize >= size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/arm/matcher_arm.hpp b/src/hotspot/cpu/arm/matcher_arm.hpp index 66fe8ac330eb5..2642e1d440532 100644 --- a/src/hotspot/cpu/arm/matcher_arm.hpp +++ b/src/hotspot/cpu/arm/matcher_arm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -157,6 +157,11 @@ // Implements a variant of EncodeISOArrayNode that encode ASCII only static const bool supports_encode_ascii_array = false; + // Return true if vector gather-load/scatter-store needs vector index as input. 
+ static bool gather_scatter_needs_vector_index(BasicType bt) { + return false; + } + // Some architecture needs a helper to check for alltrue vector static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) { return false; diff --git a/src/hotspot/cpu/ppc/matcher_ppc.hpp b/src/hotspot/cpu/ppc/matcher_ppc.hpp index aad41fb7b1cf0..31a02c2f95a8d 100644 --- a/src/hotspot/cpu/ppc/matcher_ppc.hpp +++ b/src/hotspot/cpu/ppc/matcher_ppc.hpp @@ -167,6 +167,11 @@ // Implements a variant of EncodeISOArrayNode that encode ASCII only static const bool supports_encode_ascii_array = true; + // Return true if vector gather-load/scatter-store needs vector index as input. + static bool gather_scatter_needs_vector_index(BasicType bt) { + return !is_subword_type(bt); + } + // Some architecture needs a helper to check for alltrue vector static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) { return false; diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index bf43ecaba79f2..1088816772d81 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2328,6 +2328,12 @@ uint Matcher::vector_ideal_reg(int size) { } } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize == size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/riscv/matcher_riscv.hpp b/src/hotspot/cpu/riscv/matcher_riscv.hpp index 1b490a07f92a6..7c379a2874dfc 100644 --- a/src/hotspot/cpu/riscv/matcher_riscv.hpp +++ b/src/hotspot/cpu/riscv/matcher_riscv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. 
* Copyright (c) 2021, 2022, Huawei Technologies Co., Ltd. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -163,6 +163,11 @@ // Implements a variant of EncodeISOArrayNode that encode ASCII only static const bool supports_encode_ascii_array = true; + // Return true if vector gather-load/scatter-store needs vector index as input. + static bool gather_scatter_needs_vector_index(BasicType bt) { + return !is_subword_type(bt); + } + // Some architecture needs a helper to check for alltrue vector static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) { return false; diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index 9adcc1d9c5a9f..4a6b84d6e7974 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. // Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -2014,6 +2014,12 @@ uint Matcher::vector_ideal_reg(int len) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int len) { + assert(MaxVectorSize >= len, ""); + return MaxVectorSize; +} + int Matcher::scalable_vector_reg_size(const BasicType bt) { return Matcher::max_vector_size(bt); } diff --git a/src/hotspot/cpu/s390/matcher_s390.hpp b/src/hotspot/cpu/s390/matcher_s390.hpp index e4c277c63a8b9..8780623226701 100644 --- a/src/hotspot/cpu/s390/matcher_s390.hpp +++ b/src/hotspot/cpu/s390/matcher_s390.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. 
* Copyright (c) 2017, 2024 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -160,6 +160,11 @@ // Implements a variant of EncodeISOArrayNode that encode ASCII only static const bool supports_encode_ascii_array = true; + // Return true if vector gather-load/scatter-store needs vector index as input. + static bool gather_scatter_needs_vector_index(BasicType bt) { + return !is_subword_type(bt); + } + // Some architecture needs a helper to check for alltrue vector static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) { return false; diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index c32064be86d87..476b5be0facc9 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2017, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2017, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2017, 2024 SAP SE. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -1847,6 +1847,12 @@ uint Matcher::vector_ideal_reg(int size) { } } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize == size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/x86/matcher_x86.hpp b/src/hotspot/cpu/x86/matcher_x86.hpp index 41486c244b247..5aac2e0677440 100644 --- a/src/hotspot/cpu/x86/matcher_x86.hpp +++ b/src/hotspot/cpu/x86/matcher_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -155,6 +155,11 @@ // Implements a variant of EncodeISOArrayNode that encode ASCII only static const bool supports_encode_ascii_array = true; + // Return true if vector gather-load/scatter-store needs vector index as input. + static bool gather_scatter_needs_vector_index(BasicType bt) { + return !is_subword_type(bt); + } + // Without predicated input, an all-one vector is needed for the alltrue vector test static constexpr bool vectortest_needs_second_argument(bool is_alltrue, bool is_predicate) { return is_alltrue && !is_predicate; diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 933be1667c2ba..4bc89f1fb65ce 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -2247,6 +2247,12 @@ uint Matcher::vector_ideal_reg(int size) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize >= size, ""); + return size; +} + // Check for shift by small constant as well static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) { if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() && diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index b938d5b75608d..466b9f8d7cb4e 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4360,7 +4360,7 @@ bool MatchRule::is_vector() const { "RoundDoubleModeV","RotateLeftV" , "RotateRightV", "LoadVector","StoreVector", "LoadVectorGather", "StoreVectorScatter", "LoadVectorGatherMasked", "StoreVectorScatterMasked", "SelectFromTwoVector", "VectorTest", "VectorLoadMask", "VectorStoreMask", "VectorBlend", "VectorInsert", - "VectorRearrange", "VectorLoadShuffle", "VectorLoadConst", + "VectorRearrange", "VectorLoadShuffle", "VectorLoadConst", "VectorSlice", "VectorCastB2X", "VectorCastS2X", 
"VectorCastI2X", "VectorCastL2X", "VectorCastF2X", "VectorCastD2X", "VectorCastF2HF", "VectorCastHF2F", "VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", @@ -4368,7 +4368,7 @@ bool MatchRule::is_vector() const { "FmaVD", "FmaVF", "FmaVHF", "PopCountVI", "PopCountVL", "PopulateIndex", "VectorLongToMask", "CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV", // Next are vector mask ops. - "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", + "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", "VectorMaskWiden", "RoundVF", "RoundVD", // Next are not supported currently. "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index bc259eed2d101..2c898b7503258 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -513,6 +513,7 @@ macro(VectorUnbox) macro(VectorMaskWrapper) macro(VectorMaskCmp) macro(VectorMaskCast) +macro(VectorMaskWiden) macro(VectorTest) macro(VectorBlend) macro(VectorRearrange) @@ -535,6 +536,7 @@ macro(VectorUCastS2X) macro(VectorUCastI2X) macro(VectorizedHashCode) macro(VectorInsert) +macro(VectorSlice) macro(MaskAll) macro(AndVMask) macro(OrVMask) diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index fbc6007d4e195..6d3fe40d45639 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -417,6 +417,8 @@ class LibraryCallKit : public GraphKit { bool inline_vector_select_from_two_vectors(); Node* gen_call_to_vector_math(int vector_api_op_id, BasicType bt, int num_elem, Node* opd1, Node* opd2); + Node* gen_gather_load_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, Node* indexes3, const TypeVect* vector_type); + Node* gen_gather_load_masked_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, Node* indexes3, Node* mask, 
const TypeVect* vector_type); enum VectorMaskUseType { VecMaskUseLoad = 1 << 0, diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 5cb56019bc144..ba0354458bcb9 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2440,6 +2440,7 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { n->del_req(4); break; } + case Op_VectorSlice: case Op_SelectFromTwoVector: case Op_LoopLimit: { Node* pair1 = new BinaryNode(n->in(1), n->in(2)); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 2ee2ded17b6ad..0f5636573fea6 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -360,6 +360,8 @@ class Matcher : public PhaseTransform { // Vector ideal reg static uint vector_ideal_reg(int len); + // Vector ideal reg size + static uint vector_ideal_reg_size(int len); // Vector length static uint vector_length(const Node* n); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index 2bbb10879f595..c7afa8b320f6a 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -192,6 +192,7 @@ class StoreVectorScatterNode; class StoreVectorScatterMaskedNode; class VerifyVectorAlignmentNode; class VectorMaskCmpNode; +class VectorMaskWidenNode; class VectorUnboxNode; class VectorSet; class VectorReinterpretNode; @@ -748,6 +749,7 @@ class Node { DEFINE_CLASS_ID(NegV, Vector, 8) DEFINE_CLASS_ID(SaturatingVector, Vector, 9) DEFINE_CLASS_ID(MulVL, Vector, 10) + DEFINE_CLASS_ID(VectorMaskWiden, Vector, 11) DEFINE_CLASS_ID(Con, Type, 8) DEFINE_CLASS_ID(ConI, Con, 0) 
DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9) @@ -1009,6 +1011,7 @@ class Node { DEFINE_CLASS_QUERY(Type) DEFINE_CLASS_QUERY(Vector) DEFINE_CLASS_QUERY(VectorMaskCmp) + DEFINE_CLASS_QUERY(VectorMaskWiden) DEFINE_CLASS_QUERY(VectorUnbox) DEFINE_CLASS_QUERY(VectorReinterpret) DEFINE_CLASS_QUERY(CompressV) diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index 97c5dbe03ef34..af5891d1c9e72 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1173,6 +1173,149 @@ bool LibraryCallKit::inline_vector_mem_masked_operation(bool is_store) { return true; } +Node* LibraryCallKit::gen_gather_load_subword(Node* addr, Node* indexes0, Node* indexes1, Node* indexes2, + Node* indexes3, const TypeVect* vt) { + BasicType elem_bt = vt->element_basic_type(); + uint elem_num = vt->length(); + const TypeVect* index_vt = indexes0->bottom_type()->isa_vect(); + const TypePtr* addr_type = gvn().type(addr)->isa_ptr(); + Node* addr_mem = memory(addr); + + // The first gather-load. + Node* vgather = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vt, indexes0)); + + uint index_elem_num = index_vt != nullptr ? index_vt->length() : 0; + uint vector_reg_size = Matcher::vector_ideal_reg_size(vt->length_in_bytes()); + uint max_elem_num = vector_reg_size / type2aelembytes(elem_bt); + + // The second gather-load. + if (indexes1 != nullptr) { + assert(index_vt != nullptr, "indexes0 must be a vector"); + assert(Type::equals(indexes1->bottom_type(), index_vt), "invalid vector type for indexes1"); + Node* vgather1 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vt, indexes1)); + // Merge the second gather with the first gather result. 
+ Node* idx = gvn().makecon(TypeInt::make(max_elem_num - index_elem_num)); + Node* vslice = gvn().transform(new VectorSliceNode(vgather1, vgather1, idx)); + vgather = gvn().transform(new OrVNode(vgather, vslice, vt)); + } + + // The third and fourth gather-loads on byte vector. + if (indexes2 != nullptr) { + assert(elem_bt == T_BYTE, "only byte vector needs more than 2 times of gather-load"); + assert(indexes3 != nullptr, "indexes3 must be non-null"); + assert(Type::equals(indexes2->bottom_type(), index_vt), "invalid vector type for indexes2"); + assert(Type::equals(indexes3->bottom_type(), index_vt), "invalid vector type for indexes3"); + Node* vgather2 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vt, indexes2)); + // Merge the third gather with previous results. + Node* idx = gvn().makecon(TypeInt::make(max_elem_num - 2 * index_elem_num)); + Node* vslice = gvn().transform(new VectorSliceNode(vgather2, vgather2, idx)); + vgather = gvn().transform(new OrVNode(vgather, vslice, vt)); + + Node* vgather3 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vt, indexes3)); + // Merge the fourth gather with previous results. + idx = gvn().makecon(TypeInt::make(max_elem_num - 3 * index_elem_num)); + vslice = gvn().transform(new VectorSliceNode(vgather3, vgather3, idx)); + vgather = gvn().transform(new OrVNode(vgather, vslice, vt)); + } + return vgather; +} + +Node* LibraryCallKit::gen_gather_load_masked_subword(Node* addr, Node* indexes0, Node* indexes1, + Node* indexes2, Node* indexes3, Node* mask, + const TypeVect* vt) { + BasicType elem_bt = vt->element_basic_type(); + const TypeVect* index_vt = indexes0->bottom_type()->isa_vect(); + const TypePtr* addr_type = gvn().type(addr)->isa_ptr(); + Node* addr_mem = memory(addr); + + // Case for architectures (e.g. X86_64) where the subword vector gather does not accept a vector + // index as input. The given mask needs to be kept as it is. 
+ if (index_vt == nullptr) { + return gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vt, indexes0, mask)); + } + + // Otherwise, the given mask may need to be split and used by multiple masked vector gather-loads + // respectively. Additionally, each part of the mask needs to be converted to int type because the + // vector gather-load instruction works on the same type as the vector indices. + // + // For example, if it is gather loading a short vector and it needs two times of gather-load, the + // lower half of the given mask is used by the first gather-load, and the higher half of the given + // mask is used by the second gather-load. It will be more complicated for a byte vector, where the + // mask may be split into 4 parts. Here is an example: + // + // mask = [1010 0101 1111 1100] + // ---- ---- ---- ---- + // | | | |------ gather_mask0 + // | | |----------- gather_mask1 + // | |---------------- gather_mask2 + // |--------------------- gather_mask3 + // + // Define the mask type of gather mask as the same as the vector index. + uint index_elem_num = index_vt->length(); + const TypeVect* mask_vt = TypeVect::makemask(T_INT, index_elem_num); + Node* gather_mask_short = nullptr; + Node* gather_mask = nullptr; + + // The first masked vector gather-load with vector index. + // + // Generate a new vector mask by widening the lower half of the given mask to int type. For byte + // vector, it may be the lowest 1/4 part of the given mask. 
+ if (elem_bt == T_BYTE) { + const TypeVect* mask_vt_short = TypeVect::makemask(T_SHORT, MaxVectorSize / type2aelembytes(T_SHORT)); + gather_mask_short = gvn().transform(new VectorMaskWidenNode(mask, mask_vt_short, /* is_lo */ true)); + } else { + gather_mask_short = mask; + } + gather_mask = gvn().transform(new VectorMaskWidenNode(gather_mask_short, mask_vt, /* is_lo */ true)); + Node* vgather = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vt, indexes0, gather_mask)); + + // The second masked vector gather with vector index. + uint vector_reg_size = Matcher::vector_ideal_reg_size(vt->length_in_bytes()); + uint max_elem_num = vector_reg_size / type2aelembytes(elem_bt); + if (indexes1 != nullptr) { + assert(index_vt != nullptr, "indexes0 must be a vector"); + assert(Type::equals(indexes1->bottom_type(), index_vt), "invalid vector type for indexes1"); + + // Generate a new vector mask by widening the higher half of the given mask to int type. For byte + // vector, it may be the 2/4 part of the mask starting from the lowest bit. + gather_mask = gvn().transform(new VectorMaskWidenNode(gather_mask_short, mask_vt, /* is_lo */ false)); + Node* vgather1 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vt, indexes1, gather_mask)); + // Merge the second gather with the first gather result. + Node* idx = gvn().makecon(TypeInt::make(max_elem_num - index_elem_num)); + Node* slice = gvn().transform(new VectorSliceNode(vgather1, vgather1, idx)); + vgather = gvn().transform(new OrVNode(vgather, slice, vt)); + } + + // The third and fourth masked vector gathers for byte vector. 
+ if (indexes2 != nullptr) { + assert(elem_bt == T_BYTE, "only byte vector needs more than 2 times of gather load"); + assert(indexes3 != nullptr, "indexes3 must be non-null"); + assert(Type::equals(indexes2->bottom_type(), index_vt), "invalid vector type for indexes2"); + assert(Type::equals(indexes3->bottom_type(), index_vt), "invalid vector type for indexes3"); + + // The third masked vector gather with vector index. The new vector mask is widened from the 3/4 + // part of the input mask. + const TypeVect* mask_vt_short = TypeVect::makemask(T_SHORT, MaxVectorSize / type2aelembytes(T_SHORT)); + gather_mask_short = gvn().transform(new VectorMaskWidenNode(mask, mask_vt_short, /* is_lo */ false)); + gather_mask = gvn().transform(new VectorMaskWidenNode(gather_mask_short, mask_vt, /* is_lo */ true)); + Node* vgather2 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vt, indexes2, gather_mask)); + // Merge the third gather with previous results. + Node* idx = gvn().makecon(TypeInt::make(max_elem_num - 2 * index_elem_num)); + Node* slice = gvn().transform(new VectorSliceNode(vgather2, vgather2, idx)); + vgather = gvn().transform(new OrVNode(vgather, slice, vt)); + + // The fourth masked vector gather with vector index. The new vector mask is widened from the 4/4 + // part of the input mask. + gather_mask = gvn().transform(new VectorMaskWidenNode(gather_mask_short, mask_vt, /* is_lo */ false)); + Node* vgather3 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vt, indexes3, gather_mask)); + // Merge the fourth gather with previous results. 
+ idx = gvn().makecon(TypeInt::make(max_elem_num - 3 * index_elem_num)); + slice = gvn().transform(new VectorSliceNode(vgather3, vgather3, idx)); + vgather = gvn().transform(new OrVNode(vgather, slice, vt)); + } + return vgather; +} + // // , @@ -1273,13 +1416,32 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } } - // Check that the vector holding indices is supported by architecture - // For sub-word gathers expander receive index array. - if (!is_subword_type(elem_bt) && !arch_supports_vector(Op_LoadVector, idx_num_elem, T_INT, VecMaskNotUsed)) { - log_if_needed(" ** not supported: arity=%d op=%s/loadindex vlen=%d etype=int is_masked_op=%d", - is_scatter, is_scatter ? "scatter" : "gather", - idx_num_elem, is_masked_op ? 1 : 0); - return false; // not supported + bool needs_vector_index = Matcher::gather_scatter_needs_vector_index(elem_bt); + if (needs_vector_index) { + // Check that the vector holding indices is supported by architecture + if (!arch_supports_vector(Op_LoadVector, idx_num_elem, T_INT, VecMaskNotUsed)) { + log_if_needed(" ** not supported: arity=%d op=%s/loadindex vlen=%d etype=int is_masked_op=%d", + is_scatter, is_scatter ? "scatter" : "gather", + idx_num_elem, is_masked_op ? 1 : 0); + return false; // not supported + } + + // Check more ops that are necessary to finish the whole subword gather with vector indexes. + if (!is_scatter && gvn().type(argument(10)) != TypePtr::NULL_PTR) { + assert(is_subword_type(elem_bt), "Only subword gather operation accepts multiple indexes"); + if (!arch_supports_vector(Op_VectorSlice, num_elem, elem_bt, VecMaskNotUsed) || + !arch_supports_vector(Op_OrV, num_elem, elem_bt, VecMaskNotUsed)) { + log_if_needed(" ** not supported: op=gather/merge vlen=%d etype=%s is_masked_op=%d", + num_elem, type2name(elem_bt), is_masked_op ? 
1 : 0); + return false; // not supported + } + + if (is_masked_op && !arch_supports_vector(Op_VectorMaskWiden, idx_num_elem, T_INT, VecMaskNotUsed)) { + log_if_needed(" ** not supported: op=gather/maskwiden vlen=%d etype=%s is_masked_op=1", + idx_num_elem, type2name(elem_bt)); + return false; // not supported + } + } } Node* base = argument(6); @@ -1289,9 +1451,10 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { SavedState old_state(this); Node* addr = nullptr; - if (!is_subword_type(elem_bt)) { + if (needs_vector_index) { addr = make_unsafe_address(base, offset, elem_bt, true); } else { + assert(is_subword_type(elem_bt), "Only subword gather operation supports non-vector indexes"); assert(!is_scatter, "Only supports gather operation for subword types now"); uint header = arrayOopDesc::base_offset_in_bytes(elem_bt); assert(offset->is_Con() && offset->bottom_type()->is_long()->get_con() == header, @@ -1321,7 +1484,7 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { // Get the indexes for gather/scatter. Node* indexes = nullptr; const TypeInstPtr* vbox_idx_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_idx_klass); - if (is_subword_type(elem_bt)) { + if (!needs_vector_index) { Node* indexMap = argument(16); Node* indexM = argument(17); indexes = array_element_address(indexMap, indexM, T_INT); @@ -1333,6 +1496,37 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } } + // Get other index vectors if they are not nullptr for subword gather operation. + Node* indexes1 = nullptr; + Node* indexes2 = nullptr; + Node* indexes3 = nullptr; + if (!is_scatter && needs_vector_index) { + // Get the second index vector if it is not nullptr. + Node* idx1 = argument(10); + if (gvn().type(idx1) != TypePtr::NULL_PTR) { + indexes1 = unbox_vector(idx1, vbox_idx_type, T_INT, idx_num_elem); + if (indexes1 == nullptr) { + return false; + } + } + + // Get the third and fourth index vectors if they are not nullptr. 
+ Node* idx2 = argument(11); + Node* idx3 = argument(12); + if (gvn().type(idx2) != TypePtr::NULL_PTR) { + assert(elem_bt == T_BYTE, "Only byte gather needs more than 2 index vectors"); + if (gvn().type(idx3) == TypePtr::NULL_PTR) { + return false; + } + + indexes2 = unbox_vector(idx2, vbox_idx_type, T_INT, idx_num_elem); + indexes3 = unbox_vector(idx3, vbox_idx_type, T_INT, idx_num_elem); + if (indexes2 == nullptr || indexes3 == nullptr) { + return false; + } + } + } + // Get the vector mask value. Node* mask = nullptr; if (is_masked_op) { @@ -1363,9 +1557,17 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } else { Node* vload = nullptr; if (mask != nullptr) { - vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, indexes, mask)); + if (is_subword_type(elem_bt)) { + vload = gen_gather_load_masked_subword(addr, indexes, indexes1, indexes2, indexes3, mask, vector_type); + } else { + vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, indexes, mask)); + } } else { - vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, indexes)); + if (is_subword_type(elem_bt)) { + vload = gen_gather_load_subword(addr, indexes, indexes1, indexes2, indexes3, vector_type); + } else { + vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, indexes)); + } } Node* box = box_vector(vload, vbox_type, elem_bt, num_elem); set_result(box); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 667e74a476179..d3606924c83a8 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1041,19 +1041,31 @@ Node* VectorNode::try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const Type uint vlen = vt->length(); BasicType bt = vt->element_basic_type(); + BasicType mask_bt = bt; + uint 
mask_vlen = vlen; + if (vopc == Op_LoadVectorGather && is_subword_type(bt)) { + // It uses the index vector's type as the mask type for subword gather load. + const TypeVect* index_vt = node->in(MemNode::ValueIn)->bottom_type()->isa_vect(); + if (index_vt == nullptr) { + return nullptr; + } + mask_bt = index_vt->element_basic_type(); + mask_vlen = index_vt->length(); + } + // Predicated vectors do not need to add another mask input if (node->is_predicated_vector() || !Matcher::has_predicated_vectors() || !Matcher::match_rule_supported_vector_masked(vopc, vlen, bt) || - !Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, bt)) { + !Matcher::match_rule_supported_vector(Op_VectorMaskGen, mask_vlen, mask_bt)) { return nullptr; } Node* mask = nullptr; // Generate a vector mask for vector operation whose vector length is lower than the // hardware supported max vector length. - if (vt->length_in_bytes() < (uint)MaxVectorSize) { - Node* length = gvn->transform(new ConvI2LNode(gvn->makecon(TypeInt::make(vlen)))); - mask = gvn->transform(VectorMaskGenNode::make(length, bt, vlen)); + if (mask_vlen * type2aelembytes(mask_bt) < (uint)MaxVectorSize) { + Node* length = gvn->transform(new ConvI2LNode(gvn->makecon(TypeInt::make(mask_vlen)))); + mask = gvn->transform(VectorMaskGenNode::make(length, mask_bt, mask_vlen)); } else { return nullptr; } diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 3caaf7c59d7d3..182272a6ccfe5 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1739,6 +1739,24 @@ class VectorRearrangeNode : public VectorNode { Node* vec_shuffle() const { return in(2); } }; +// Generate a vector by slicing the two source vectors based on an index. +// +// Copy the indexed byte up to the last byte of the first source vector +// to the bottom of the result vector, then fill the remainder of the +// result starting from the first byte of the second source vector. 
+// +// E.g. src1 = [hgfedcba] src2 = [ponmlkji] index = 3 +// dst = [kjihgfed] +class VectorSliceNode : public VectorNode { + public: + VectorSliceNode(Node* vec1, Node* vec2, Node* index) + : VectorNode(vec1, vec2, index, vec1->bottom_type()->is_vect()) { + assert(index->bottom_type()->isa_int(), "index must be an integral value"); + assert(index->is_Con(), "index must be a constant"); + } + + virtual int Opcode() const; +}; // Select elements from two source vectors based on the wrapped indexes held in // the first vector. @@ -1798,6 +1816,28 @@ class VectorMaskCastNode : public VectorNode { virtual int Opcode() const; }; +// Unpack the elements to twice size. +class VectorMaskWidenNode : public VectorNode { + private: + // "_is_lo" is used to denote whether the lower half or + // the upper half of the elements are widened. + // E.g. src = [1111 0101] + // _is_lo = true, dst = [0001 0001] + // _is_lo = false, dst = [0101 0101] + bool _is_lo; + + public: + VectorMaskWidenNode(Node* in, const TypeVect* vt, bool is_lo) : VectorNode(in, vt), _is_lo(is_lo) { + init_class_id(Class_VectorMaskWiden); + const TypeVect* in_vt = in->bottom_type()->is_vect(); + assert(type2aelembytes(in_vt->element_basic_type()) == type2aelembytes(vt->element_basic_type()) / 2, "must be half size"); + } + + bool is_lo() const { return _is_lo; } + virtual int Opcode() const; + virtual uint size_of() const { return sizeof(*this); } +}; + // This is intended for use as a simple reinterpret node that has no cast. 
class VectorReinterpretNode : public VectorNode { private: diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py index 5b2c18b0a2b7c..25a9be83e28b9 100644 --- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py +++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py @@ -2087,6 +2087,8 @@ def generate(kind, names): ["index", "__ sve_index(z7, __ D, r5, 5);", "index\tz7.d, x5, #5"], ["cpy", "__ sve_cpy(z7, __ H, p3, r5);", "cpy\tz7.h, p3/m, w5"], ["tbl", "__ sve_tbl(z16, __ S, z17, z18);", "tbl\tz16.s, {z17.s}, z18.s"], + ["ld1b", "__ sve_ld1b_gather(z15, p0, r5, z16);", "ld1b\t{z15.s}, p0/z, [x5, z16.s, uxtw]"], + ["ld1h", "__ sve_ld1h_gather(z15, p0, r5, z16);", "ld1h\t{z15.s}, p0/z, [x5, z16.s, uxtw #1]"], ["ld1w", "__ sve_ld1w_gather(z15, p0, r5, z16);", "ld1w\t{z15.s}, p0/z, [x5, z16.s, uxtw #2]"], ["ld1d", "__ sve_ld1d_gather(z15, p0, r5, z16);", "ld1d\t{z15.d}, p0/z, [x5, z16.d, uxtw #3]"], ["st1w", "__ sve_st1w_scatter(z15, p0, r5, z16);", "st1w\t{z15.s}, p0, [x5, z16.s, uxtw #2]"], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index d90c2479995f5..8e049d36ea7c4 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -1100,6 +1100,8 @@ __ sve_index(z7, __ D, r5, 5); // index z7.d, x5, #5 __ sve_cpy(z7, __ H, p3, r5); // cpy z7.h, p3/m, w5 __ sve_tbl(z16, __ S, z17, z18); // tbl z16.s, {z17.s}, z18.s + __ sve_ld1b_gather(z15, p0, r5, z16); // ld1b {z15.s}, p0/z, [x5, z16.s, uxtw] + __ sve_ld1h_gather(z15, p0, r5, z16); // ld1h {z15.s}, p0/z, [x5, z16.s, uxtw #1] __ sve_ld1w_gather(z15, p0, r5, z16); // ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2] __ sve_ld1d_gather(z15, p0, r5, z16); // ld1d {z15.d}, p0/z, [x5, z16.d, uxtw #3] __ sve_st1w_scatter(z15, p0, r5, z16); // st1w {z15.s}, p0, [x5, z16.s, uxtw #2] @@ -1438,30 +1440,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 
0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x140004b0, 0x94000000, - 0x97ffffd4, 0x940004ad, 0x3400000a, 0x34fffa2a, - 0x3400954a, 0x35000008, 0x35fff9c8, 0x350094e8, - 0xb400000b, 0xb4fff96b, 0xb400948b, 0xb500001d, - 0xb5fff91d, 0xb500943d, 0x10000013, 0x10fff8b3, - 0x100093d3, 0x90000013, 0x36300016, 0x3637f836, - 0x36309356, 0x3758000c, 0x375ff7cc, 0x375892ec, + 0x14000000, 0x17ffffd7, 0x140004b2, 0x94000000, + 0x97ffffd4, 0x940004af, 0x3400000a, 0x34fffa2a, + 0x3400958a, 0x35000008, 0x35fff9c8, 0x35009528, + 0xb400000b, 0xb4fff96b, 0xb40094cb, 0xb500001d, + 0xb5fff91d, 0xb500947d, 0x10000013, 0x10fff8b3, + 0x10009413, 0x90000013, 0x36300016, 0x3637f836, + 0x36309396, 0x3758000c, 0x375ff7cc, 0x3758932c, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x540090c0, 0x54000001, 0x54fff541, 0x54009061, - 0x54000002, 0x54fff4e2, 0x54009002, 0x54000002, - 0x54fff482, 0x54008fa2, 0x54000003, 0x54fff423, - 0x54008f43, 0x54000003, 0x54fff3c3, 0x54008ee3, - 0x54000004, 0x54fff364, 0x54008e84, 0x54000005, - 0x54fff305, 0x54008e25, 0x54000006, 0x54fff2a6, - 0x54008dc6, 0x54000007, 0x54fff247, 0x54008d67, - 0x54000008, 0x54fff1e8, 0x54008d08, 0x54000009, - 0x54fff189, 0x54008ca9, 0x5400000a, 0x54fff12a, - 0x54008c4a, 0x5400000b, 0x54fff0cb, 0x54008beb, - 0x5400000c, 0x54fff06c, 0x54008b8c, 0x5400000d, - 0x54fff00d, 0x54008b2d, 0x5400000e, 0x54ffefae, - 0x54008ace, 0x5400000f, 0x54ffef4f, 0x54008a6f, + 0x54009100, 0x54000001, 0x54fff541, 0x540090a1, + 0x54000002, 0x54fff4e2, 0x54009042, 0x54000002, + 0x54fff482, 0x54008fe2, 0x54000003, 0x54fff423, + 0x54008f83, 0x54000003, 0x54fff3c3, 0x54008f23, + 0x54000004, 0x54fff364, 0x54008ec4, 0x54000005, + 0x54fff305, 0x54008e65, 0x54000006, 0x54fff2a6, + 0x54008e06, 0x54000007, 0x54fff247, 0x54008da7, + 0x54000008, 0x54fff1e8, 0x54008d48, 
0x54000009, + 0x54fff189, 0x54008ce9, 0x5400000a, 0x54fff12a, + 0x54008c8a, 0x5400000b, 0x54fff0cb, 0x54008c2b, + 0x5400000c, 0x54fff06c, 0x54008bcc, 0x5400000d, + 0x54fff00d, 0x54008b6d, 0x5400000e, 0x54ffefae, + 0x54008b0e, 0x5400000f, 0x54ffef4f, 0x54008aaf, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1668,76 +1670,77 @@ 0x65d8a801, 0x65dcac01, 0x655cb241, 0x0520a1e0, 0x0521a601, 0x052281e0, 0x05238601, 0x04a14026, 0x042244a6, 0x046344a6, 0x04a444a6, 0x04e544a7, - 0x0568aca7, 0x05b23230, 0x853040af, 0xc5b040af, - 0xe57080af, 0xe5b080af, 0x25034440, 0x254054c4, - 0x25034640, 0x25415a05, 0x25834440, 0x25c54489, - 0x250b5d3a, 0x2550dc20, 0x2518e3e1, 0x2518e021, - 0x2518e0a1, 0x2518e121, 0x2518e1a1, 0x2558e3e2, - 0x2558e042, 0x2558e0c2, 0x2558e142, 0x2598e3e3, - 0x2598e063, 0x2598e0e3, 0x2598e163, 0x25d8e3e4, - 0x25d8e084, 0x25d8e104, 0x25d8e184, 0x2518e407, - 0x05214800, 0x05614800, 0x05a14800, 0x05e14800, - 0x05214c00, 0x05614c00, 0x05a14c00, 0x05e14c00, - 0x05304001, 0x05314001, 0x05a18610, 0x05e18610, - 0x05271e11, 0x6545e891, 0x6585e891, 0x65c5e891, - 0x6545c891, 0x6585c891, 0x65c5c891, 0x45b0c210, - 0x45f1c231, 0x1e601000, 0x1e603000, 0x1e621000, - 0x1e623000, 0x1e641000, 0x1e643000, 0x1e661000, - 0x1e663000, 0x1e681000, 0x1e683000, 0x1e6a1000, - 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, - 0x1e6e3000, 0x1e701000, 0x1e703000, 0x1e721000, - 0x1e723000, 0x1e741000, 0x1e743000, 0x1e761000, - 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000, - 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, - 0x1e7e3000, 0xf8268267, 0xf82d023c, 0xf8301046, - 0xf83d2083, 0xf8263290, 0xf82d528c, 0xf8284299, - 0xf8337160, 0xf8386286, 0xf8bf820e, 0xf8a600e0, - 0xf8af1353, 0xf8a922ea, 0xf8b53396, 0xf8a251e3, - 0xf8b340f4, 0xf8a470fd, 0xf8a06209, 0xf8f48097, - 0xf8f002ea, 0xf8eb10d9, 0xf8ff21b0, 0xf8f7302c, - 0xf8ee52a9, 0xf8f041fa, 0xf8e471e4, 0xf8e863c6, - 0xf864823d, 0xf87d013a, 
0xf86f1162, 0xf87d20e3, - 0xf86132bb, 0xf870510e, 0xf8704336, 0xf86572b4, - 0xf8706217, 0xb83e8294, 0xb8200264, 0xb8381284, - 0xb8242358, 0xb8333102, 0xb828530e, 0xb83042df, - 0xb824703f, 0xb82a6194, 0xb8a080e9, 0xb8b80090, - 0xb8bb1146, 0xb8bb21b8, 0xb8b032df, 0xb8b653f4, - 0xb8bd41c9, 0xb8b47287, 0xb8bc6169, 0xb8ee828c, - 0xb8e10138, 0xb8f3126d, 0xb8f020b0, 0xb8e03183, - 0xb8e851ef, 0xb8f041e4, 0xb8fe7005, 0xb8ea6376, - 0xb8638120, 0xb873015d, 0xb8781284, 0xb86723b8, - 0xb86e3175, 0xb87b51ed, 0xb87f41d1, 0xb863721e, - 0xb87660f4, 0xce216874, 0xce104533, 0xce648c15, - 0xce8e3302, 0xce6e82ab, 0xce6c87d1, 0xcec08063, - 0xce638937, 0x25e0c358, 0x25a1c7d3, 0x0580785a, - 0x05426328, 0x05009892, 0x25a0cc29, 0x2561cec8, - 0x058044b3, 0x05401c99, 0x05006b49, 0x25e0d6f7, - 0x2561c528, 0x0583c8bc, 0x0542522f, 0x05001ec0, - 0x25e0de65, 0x25a1c113, 0x05803cad, 0x0540f3c0, - 0x0500ab15, 0x2560c28c, 0x2561d7c0, 0x05801ed7, - 0x0542633b, 0x05003696, 0x2560d4b4, 0x25e1c918, - 0x058021ff, 0x05400e15, 0x0500f3de, 0x0473025a, - 0x04bd05ab, 0x658e0025, 0x658a08e2, 0x659a0493, - 0x043e1062, 0x04f418b4, 0x046d15bd, 0x04611fce, - 0x04d6a07c, 0x04001929, 0x041a09da, 0x04d098f4, - 0x04db10d4, 0x0459a3ad, 0x041aa029, 0x041919fb, - 0x04d39e24, 0x04118302, 0x04101dba, 0x04d7ae16, - 0x04dea571, 0x04180210, 0x05e786fc, 0x05e4915c, - 0x04881cf1, 0x044a0f04, 0x04090969, 0x048b16c4, - 0x044101e4, 0x04dcbf44, 0x65809745, 0x658d833f, - 0x65c68468, 0x65c79b07, 0x65829e38, 0x049dafca, - 0x6582bba8, 0x65c0b7ff, 0x65c1b4e0, 0x658dbadd, - 0x65819a9d, 0x65ed9246, 0x65b30815, 0x65e6263c, - 0x65eebb94, 0x65bad14e, 0x65efe178, 0x65fc5697, - 0x65e07f14, 0x040c55a6, 0x04977f4d, 0x043d3046, - 0x04b733a0, 0x046830a4, 0x04ed322d, 0x05686948, - 0x05bd6c13, 0x65c88ef0, 0x450db3d7, 0x4540b6d9, - 0x043e3979, 0x445896ce, 0x445a9005, 0x44d98069, - 0x445b87ae, 0x04da348e, 0x04982edb, 0x0499397f, - 0x0408338c, 0x04ca309c, 0x65c721e6, 0x65c63641, - 0x65982882, 0x04812b8b, 0x0e251083, 0x4e3712d5, - 0x0e61101f, 0x4e6d118b, 
0x0eba1338, 0x4eb712d5, - 0x2e31120f, 0x6e2e11ac, 0x2e6810e6, 0x6e6f11cd, - 0x2eaa1128, 0x6eb1120f, + 0x0568aca7, 0x05b23230, 0x841040af, 0x84b040af, + 0x853040af, 0xc5b040af, 0xe57080af, 0xe5b080af, + 0x25034440, 0x254054c4, 0x25034640, 0x25415a05, + 0x25834440, 0x25c54489, 0x250b5d3a, 0x2550dc20, + 0x2518e3e1, 0x2518e021, 0x2518e0a1, 0x2518e121, + 0x2518e1a1, 0x2558e3e2, 0x2558e042, 0x2558e0c2, + 0x2558e142, 0x2598e3e3, 0x2598e063, 0x2598e0e3, + 0x2598e163, 0x25d8e3e4, 0x25d8e084, 0x25d8e104, + 0x25d8e184, 0x2518e407, 0x05214800, 0x05614800, + 0x05a14800, 0x05e14800, 0x05214c00, 0x05614c00, + 0x05a14c00, 0x05e14c00, 0x05304001, 0x05314001, + 0x05a18610, 0x05e18610, 0x05271e11, 0x6545e891, + 0x6585e891, 0x65c5e891, 0x6545c891, 0x6585c891, + 0x65c5c891, 0x45b0c210, 0x45f1c231, 0x1e601000, + 0x1e603000, 0x1e621000, 0x1e623000, 0x1e641000, + 0x1e643000, 0x1e661000, 0x1e663000, 0x1e681000, + 0x1e683000, 0x1e6a1000, 0x1e6a3000, 0x1e6c1000, + 0x1e6c3000, 0x1e6e1000, 0x1e6e3000, 0x1e701000, + 0x1e703000, 0x1e721000, 0x1e723000, 0x1e741000, + 0x1e743000, 0x1e761000, 0x1e763000, 0x1e781000, + 0x1e783000, 0x1e7a1000, 0x1e7a3000, 0x1e7c1000, + 0x1e7c3000, 0x1e7e1000, 0x1e7e3000, 0xf8268267, + 0xf82d023c, 0xf8301046, 0xf83d2083, 0xf8263290, + 0xf82d528c, 0xf8284299, 0xf8337160, 0xf8386286, + 0xf8bf820e, 0xf8a600e0, 0xf8af1353, 0xf8a922ea, + 0xf8b53396, 0xf8a251e3, 0xf8b340f4, 0xf8a470fd, + 0xf8a06209, 0xf8f48097, 0xf8f002ea, 0xf8eb10d9, + 0xf8ff21b0, 0xf8f7302c, 0xf8ee52a9, 0xf8f041fa, + 0xf8e471e4, 0xf8e863c6, 0xf864823d, 0xf87d013a, + 0xf86f1162, 0xf87d20e3, 0xf86132bb, 0xf870510e, + 0xf8704336, 0xf86572b4, 0xf8706217, 0xb83e8294, + 0xb8200264, 0xb8381284, 0xb8242358, 0xb8333102, + 0xb828530e, 0xb83042df, 0xb824703f, 0xb82a6194, + 0xb8a080e9, 0xb8b80090, 0xb8bb1146, 0xb8bb21b8, + 0xb8b032df, 0xb8b653f4, 0xb8bd41c9, 0xb8b47287, + 0xb8bc6169, 0xb8ee828c, 0xb8e10138, 0xb8f3126d, + 0xb8f020b0, 0xb8e03183, 0xb8e851ef, 0xb8f041e4, + 0xb8fe7005, 0xb8ea6376, 0xb8638120, 0xb873015d, 
+ 0xb8781284, 0xb86723b8, 0xb86e3175, 0xb87b51ed, + 0xb87f41d1, 0xb863721e, 0xb87660f4, 0xce216874, + 0xce104533, 0xce648c15, 0xce8e3302, 0xce6e82ab, + 0xce6c87d1, 0xcec08063, 0xce638937, 0x25e0c358, + 0x25a1c7d3, 0x0580785a, 0x05426328, 0x05009892, + 0x25a0cc29, 0x2561cec8, 0x058044b3, 0x05401c99, + 0x05006b49, 0x25e0d6f7, 0x2561c528, 0x0583c8bc, + 0x0542522f, 0x05001ec0, 0x25e0de65, 0x25a1c113, + 0x05803cad, 0x0540f3c0, 0x0500ab15, 0x2560c28c, + 0x2561d7c0, 0x05801ed7, 0x0542633b, 0x05003696, + 0x2560d4b4, 0x25e1c918, 0x058021ff, 0x05400e15, + 0x0500f3de, 0x0473025a, 0x04bd05ab, 0x658e0025, + 0x658a08e2, 0x659a0493, 0x043e1062, 0x04f418b4, + 0x046d15bd, 0x04611fce, 0x04d6a07c, 0x04001929, + 0x041a09da, 0x04d098f4, 0x04db10d4, 0x0459a3ad, + 0x041aa029, 0x041919fb, 0x04d39e24, 0x04118302, + 0x04101dba, 0x04d7ae16, 0x04dea571, 0x04180210, + 0x05e786fc, 0x05e4915c, 0x04881cf1, 0x044a0f04, + 0x04090969, 0x048b16c4, 0x044101e4, 0x04dcbf44, + 0x65809745, 0x658d833f, 0x65c68468, 0x65c79b07, + 0x65829e38, 0x049dafca, 0x6582bba8, 0x65c0b7ff, + 0x65c1b4e0, 0x658dbadd, 0x65819a9d, 0x65ed9246, + 0x65b30815, 0x65e6263c, 0x65eebb94, 0x65bad14e, + 0x65efe178, 0x65fc5697, 0x65e07f14, 0x040c55a6, + 0x04977f4d, 0x043d3046, 0x04b733a0, 0x046830a4, + 0x04ed322d, 0x05686948, 0x05bd6c13, 0x65c88ef0, + 0x450db3d7, 0x4540b6d9, 0x043e3979, 0x445896ce, + 0x445a9005, 0x44d98069, 0x445b87ae, 0x04da348e, + 0x04982edb, 0x0499397f, 0x0408338c, 0x04ca309c, + 0x65c721e6, 0x65c63641, 0x65982882, 0x04812b8b, + 0x0e251083, 0x4e3712d5, 0x0e61101f, 0x4e6d118b, + 0x0eba1338, 0x4eb712d5, 0x2e31120f, 0x6e2e11ac, + 0x2e6810e6, 0x6e6f11cd, 0x2eaa1128, 0x6eb1120f, + }; // END Generated code -- do not edit diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java new file mode 100644 index 0000000000000..230c51075e029 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java @@ -0,0 +1,158 
@@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package compiler.vectorapi; + +import compiler.lib.generators.*; +import compiler.lib.ir_framework.*; +import jdk.incubator.vector.*; +import jdk.test.lib.Asserts; + +/** + * @test + * @bug 8351623 + * @summary VectorAPI: Add SVE implementation for subword gather load operation + * @key randomness + * @library /test/lib / + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorGatherSubwordTest + */ +public class VectorGatherSubwordTest { + private static final VectorSpecies B_SPECIES = ByteVector.SPECIES_PREFERRED; + private static final VectorSpecies S_SPECIES = ShortVector.SPECIES_PREFERRED; + + private static int LENGTH = 128; + private static final Generators random = Generators.G; + + private static byte[] ba; + private static byte[] br; + private static short[] sa; + private static short[] sr; + private static boolean[] m; + private static int[][] indexes; + + static { + ba = new byte[LENGTH]; + br = new byte[LENGTH]; + sa = new short[LENGTH]; + sr = new short[LENGTH]; + m = new boolean[LENGTH]; + indexes = new int[2][]; + + Generator byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE); + Generator shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE); + for (int i = 0; i < LENGTH; i++) { + ba[i] = byteGen.next().byteValue(); + sa[i] = shortGen.next().shortValue(); + m[i] = i % 2 == 0; + } + + int[] nums = {B_SPECIES.length(), S_SPECIES.length()}; + for (int i = 0; i < 2; i++) { + indexes[i] = new int[nums[i]]; + random.fill(random.uniformInts(0, nums[i] - 1), indexes[i]); + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + ByteVector.fromArray(B_SPECIES, ba, i, indexes[0], 0) + .intoArray(br, i); + } + } + + @Check(test = "testLoadGatherByte") + public void verifyLoadGatherByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + for (int j = 0; j < 
B_SPECIES.length(); j++) { + Asserts.assertEquals(ba[i + indexes[0][j]], br[i + j]); + } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + ShortVector.fromArray(S_SPECIES, sa, i, indexes[1], 0) + .intoArray(sr, i); + } + } + + @Check(test = "testLoadGatherShort") + public void verifyLoadGatherShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + for (int j = 0; j < S_SPECIES.length(); j++) { + Asserts.assertEquals(sa[i + indexes[1][j]], sr[i + j]); + } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER_MASKED, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherMaskedByte() { + VectorMask mask = VectorMask.fromArray(B_SPECIES, m, 0); + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + ByteVector.fromArray(B_SPECIES, ba, i, indexes[0], 0, mask) + .intoArray(br, i); + } + } + + @Check(test = "testLoadGatherMaskedByte") + public void verifyLoadGatherMaskedByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + for (int j = 0; j < B_SPECIES.length(); j++) { + Asserts.assertEquals(m[j] ? ba[i + indexes[0][j]] : 0, br[i + j]); + } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER_MASKED, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherMaskedShort() { + VectorMask mask = VectorMask.fromArray(S_SPECIES, m, 0); + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + ShortVector.fromArray(S_SPECIES, sa, i, indexes[1], 0, mask) + .intoArray(sr, i); + } + } + + @Check(test = "testLoadGatherMaskedShort") + public void verifyLoadGatherMaskedShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + for (int j = 0; j < S_SPECIES.length(); j++) { + Asserts.assertEquals(m[j] ? 
sa[i + indexes[1][j]] : 0, sr[i + j]); + } + } + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(5000) + .addFlags("--add-modules=jdk.incubator.vector") + .start(); + } +}