diff --git a/slothy/targets/arm_v81m/arch_v81m.py b/slothy/targets/arm_v81m/arch_v81m.py index 44ec73c5..b6bdb34d 100644 --- a/slothy/targets/arm_v81m/arch_v81m.py +++ b/slothy/targets/arm_v81m/arch_v81m.py @@ -353,6 +353,7 @@ def is_load_store_instruction(self): vst42_with_writeback, vst43_with_writeback, ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, strd, @@ -402,6 +403,7 @@ def is_scalar_load(self): return self._is_instance_of( [ ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, ldr, @@ -1075,6 +1077,27 @@ def write(self): return super().write() +class ldrd_no_imm(MVEInstruction): + pattern = "ldrd <Rt0>, <Rt1>, [<Rn>]" + inputs = ["Rn"] + outputs = ["Rt0", "Rt1"] + + @classmethod + def make(cls, src): + obj = MVEInstruction.build(cls, src) + obj.increment = None + obj.pre_index = 0 + obj.addr = obj.args_in[0] + return obj + + def write(self): + self.immediate = simplify(self.pre_index) + + if int(self.immediate) != 0: + self.pattern = ldrd.pattern + return super().write() + + class ldrd_with_writeback(MVEInstruction): pattern = "ldrd <Rt0>, <Rt1>, [<Rn>, <imm>]!" inputs = ["Rn"] @@ -1425,6 +1448,12 @@ class vbic(MVEInstruction): outputs = ["Qd"] +class vbic_nodt(MVEInstruction): + pattern = "vbic <Qd>, <Qn>, <Qm>" + inputs = ["Qn", "Qm"] + outputs = ["Qd"] + + class vorr(MVEInstruction): pattern = "vorr.<dt>
<Qd>, <Qn>, <Qm>" inputs = ["Qn", "Qm"] @@ -1437,6 +1466,12 @@ class veor(MVEInstruction): outputs = ["Qd"] +class veor_nodt(MVEInstruction): + pattern = "veor <Qd>, <Qn>, <Qm>" + inputs = ["Qn", "Qm"] + outputs = ["Qd"] + + class nop(MVEInstruction): pattern = "nop" diff --git a/slothy/targets/arm_v81m/cortex_m55r1.py b/slothy/targets/arm_v81m/cortex_m55r1.py index ed4e6a16..c8371f1b 100644 --- a/slothy/targets/arm_v81m/cortex_m55r1.py +++ b/slothy/targets/arm_v81m/cortex_m55r1.py @@ -80,8 +80,10 @@ vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vmulh, vmul_T1, vmul_T2, @@ -100,6 +102,7 @@ vmulf_T1, vmulf_T2, ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, strd, @@ -316,8 +319,10 @@ def get_min_max_objective(slothy): vhcadd: ExecutionUnit.VEC_INT, vand: ExecutionUnit.VEC_INT, vbic: ExecutionUnit.VEC_INT, + vbic_nodt: ExecutionUnit.VEC_INT, vorr: ExecutionUnit.VEC_INT, veor: ExecutionUnit.VEC_INT, + veor_nodt: ExecutionUnit.VEC_INT, vmulh: ExecutionUnit.VEC_MUL, vmul_T1: ExecutionUnit.VEC_MUL, vmul_T2: ExecutionUnit.VEC_MUL, @@ -337,6 +342,7 @@ def get_min_max_objective(slothy): vmulf_T1: ExecutionUnit.VEC_FPU, vmulf_T2: ExecutionUnit.VEC_FPU, ldrd: ExecutionUnit.LOAD, + ldrd_no_imm: ExecutionUnit.LOAD, ldrd_with_writeback: ExecutionUnit.LOAD, ldrd_with_post: ExecutionUnit.LOAD, strd: ExecutionUnit.STORE, @@ -422,6 +428,7 @@ def get_min_max_objective(slothy): ldr_with_writeback, ldr_with_post, ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, strd, @@ -459,8 +466,10 @@ def get_min_max_objective(slothy): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vmulh, vmul_T1, vmul_T2, @@ -544,6 +553,7 @@ def get_min_max_objective(slothy): default_latencies = { ( ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, ): 2, @@ -580,8 +590,10 @@ def get_min_max_objective(slothy): vaddva, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, qsave, save, qrestore, @@ -744,8 +756,10 @@ def get_latency(src, out_idx, dst): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, 
+ veor_nodt, vrshr, vshrnb, vshrnt, diff --git a/slothy/targets/arm_v81m/cortex_m85r1.py b/slothy/targets/arm_v81m/cortex_m85r1.py index cabcc179..1a2286e7 100644 --- a/slothy/targets/arm_v81m/cortex_m85r1.py +++ b/slothy/targets/arm_v81m/cortex_m85r1.py @@ -80,8 +80,10 @@ vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vmulh, vmul_T1, vmul_T2, @@ -100,6 +102,7 @@ vmulf_T1, vmulf_T2, ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, strd, @@ -308,8 +311,10 @@ def get_min_max_objective(slothy): vhcadd: ExecutionUnit.VEC_INT, vand: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB], vbic: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB], + vbic_nodt: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB], vorr: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB], veor: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB], + veor_nodt: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB], vmulh: ExecutionUnit.VEC_MUL, vmul_T1: ExecutionUnit.VEC_MUL, vmul_T2: ExecutionUnit.VEC_MUL, @@ -330,6 +335,7 @@ def get_min_max_objective(slothy): vmulf_T1: ExecutionUnit.VEC_FPMUL, vmulf_T2: ExecutionUnit.VEC_FPMUL, ldrd: ExecutionUnit.LOAD, + ldrd_no_imm: ExecutionUnit.LOAD, ldrd_with_writeback: ExecutionUnit.LOAD, ldrd_with_post: ExecutionUnit.LOAD, strd: ExecutionUnit.STORE, @@ -416,6 +422,7 @@ def get_min_max_objective(slothy): ldr_with_writeback, ldr_with_post, ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, strd, @@ -453,8 +460,10 @@ def get_min_max_objective(slothy): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vmulh, vmul_T1, vmul_T2, @@ -535,6 +544,7 @@ def get_min_max_objective(slothy): default_latencies = { ( ldrd, + ldrd_no_imm, ldrd_with_post, ldrd_with_writeback, ): 2, @@ -570,8 +580,10 @@ def get_min_max_objective(slothy): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, qsave, save, qrestore, @@ -755,8 +767,10 @@ def get_latency(src, out_idx, dst): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vrshr, vshrnb, 
vshrnt, diff --git a/slothy/targets/arm_v81m/helium_experimental.py b/slothy/targets/arm_v81m/helium_experimental.py index bd91068e..e2ecb099 100644 --- a/slothy/targets/arm_v81m/helium_experimental.py +++ b/slothy/targets/arm_v81m/helium_experimental.py @@ -73,8 +73,10 @@ vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vmulh, vmul_T1, vmul_T2, @@ -93,6 +95,7 @@ vmulf_T1, vmulf_T2, ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, strd, @@ -234,8 +237,10 @@ def get_min_max_objective(slothy): vhcadd: ExecutionUnit.VEC_INT, vand: ExecutionUnit.VEC_INT, vbic: ExecutionUnit.VEC_INT, + vbic_nodt: ExecutionUnit.VEC_INT, vorr: ExecutionUnit.VEC_INT, veor: ExecutionUnit.VEC_INT, + veor_nodt: ExecutionUnit.VEC_INT, vmulh: ExecutionUnit.VEC_MUL, vmul_T1: ExecutionUnit.VEC_MUL, vmul_T2: ExecutionUnit.VEC_MUL, @@ -254,6 +259,7 @@ def get_min_max_objective(slothy): vmulf_T1: ExecutionUnit.VEC_FPU, vmulf_T2: ExecutionUnit.VEC_FPU, ldrd: ExecutionUnit.LOAD, + ldrd_no_imm: ExecutionUnit.LOAD, ldrd_with_writeback: ExecutionUnit.LOAD, ldrd_with_post: ExecutionUnit.LOAD, strd: ExecutionUnit.STORE, @@ -339,6 +345,7 @@ def get_min_max_objective(slothy): ldr_with_writeback, ldr_with_post, ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, strd, @@ -376,8 +383,10 @@ def get_min_max_objective(slothy): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vmulh, vmul_T1, vmul_T2, @@ -458,6 +467,7 @@ def get_min_max_objective(slothy): default_latencies = { ( ldrd, + ldrd_no_imm, ldrd_with_writeback, ldrd_with_post, ): 2, @@ -493,8 +503,10 @@ def get_min_max_objective(slothy): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, qsave, save, qrestore, @@ -676,8 +688,10 @@ def get_latency(src, out_idx, dst): vhcadd, vand, vbic, + vbic_nodt, vorr, veor, + veor_nodt, vrshr, vshrnb, vshrnt, diff --git a/tests/naive/armv8m/instructions.s b/tests/naive/armv8m/instructions.s index 950a0d1b..d2372538 100644 --- a/tests/naive/armv8m/instructions.s +++ 
b/tests/naive/armv8m/instructions.s @@ -55,6 +55,7 @@ vqdmulh.s8 q2, q0, q1 vqdmulh.s16 q2, q0, q1 vqdmulh.s32 q2, q0, q1 +ldrd r0, r1, [r2] ldrd r0, r1, [r2, #16] ldrd r0, r1, [r2, #-16] ldrd r0, r1, [r2], #16 @@ -236,15 +237,18 @@ vand.u64 q2, q0, q1 vbic.u8 q2, q0, q1 vbic.u16 q2, q0, q1 vbic.u32 q2, q0, q1 +vbic q2, q0, q1 vorr.u8 q2, q0, q1 vorr.u16 q2, q0, q1 vorr.u32 q2, q0, q1 vorr.u64 q2, q0, q1 -vorr.u8 q2, q0, q1 -vorr.u16 q2, q0, q1 -vorr.u32 q2, q0, q1 +veor.u8 q2, q0, q1 +veor.u16 q2, q0, q1 +veor.u32 q2, q0, q1 +veor.u64 q2, q0, q1 +veor q2, q0, q1 nop