Skip to content

Commit 8b27118

Browse files
authored
Merge pull request #257 from slothy-optimizer/mve-instruction-additions
MVE: Add missing instruction variants needed for MVE Keccak
2 parents 40f4741 + 277e53c commit 8b27118

File tree

5 files changed

+84
-3
lines changed

5 files changed

+84
-3
lines changed

slothy/targets/arm_v81m/arch_v81m.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,7 @@ def is_load_store_instruction(self):
353353
vst42_with_writeback,
354354
vst43_with_writeback,
355355
ldrd,
356+
ldrd_no_imm,
356357
ldrd_with_writeback,
357358
ldrd_with_post,
358359
strd,
@@ -402,6 +403,7 @@ def is_scalar_load(self):
402403
return self._is_instance_of(
403404
[
404405
ldrd,
406+
ldrd_no_imm,
405407
ldrd_with_writeback,
406408
ldrd_with_post,
407409
ldr,
@@ -1075,6 +1077,27 @@ def write(self):
10751077
return super().write()
10761078

10771079

1080+
class ldrd_no_imm(MVEInstruction):
    # LDRD written without an immediate offset, e.g. `ldrd r0, r1, [r2]`:
    # one address register in, two destination registers out.
    pattern = "ldrd <Rt0>, <Rt1>, [<Rn>]"
    inputs = ["Rn"]
    outputs = ["Rt0", "Rt1"]

    @classmethod
    def make(cls, src):
        # Parse `src` against this class's pattern via the shared builder,
        # then attach the addressing metadata used when writing it back out.
        insn = MVEInstruction.build(cls, src)
        insn.increment = None
        # No immediate in the source text, so the offset starts at zero.
        insn.pre_index = 0
        # The address register is the only input operand.
        insn.addr = insn.args_in[0]
        return insn

    def write(self):
        # Refresh the immediate from the current pre-index offset.
        self.immediate = simplify(self.pre_index)

        # A non-zero offset cannot be printed with the immediate-free
        # pattern, so switch this instance over to ldrd's pattern
        # (presumably the immediate-offset form -- confirm against ldrd).
        if int(self.immediate) != 0:
            self.pattern = ldrd.pattern
        return super().write()
1099+
1100+
10781101
class ldrd_with_writeback(MVEInstruction):
10791102
pattern = "ldrd <Rt0>, <Rt1>, [<Rn>, <imm>]!"
10801103
inputs = ["Rn"]
@@ -1425,6 +1448,12 @@ class vbic(MVEInstruction):
14251448
outputs = ["Qd"]
14261449

14271450

1451+
class vbic_nodt(MVEInstruction):
    # VBIC written without a `.<dt>` datatype suffix, e.g. `vbic q2, q0, q1`.
    # Two vector sources (Qn, Qm), one vector destination (Qd).
    pattern = "vbic <Qd>, <Qn>, <Qm>"
    inputs = ["Qn", "Qm"]
    outputs = ["Qd"]
1455+
1456+
14281457
class vorr(MVEInstruction):
14291458
pattern = "vorr.<dt> <Qd>, <Qn>, <Qm>"
14301459
inputs = ["Qn", "Qm"]
@@ -1437,6 +1466,12 @@ class veor(MVEInstruction):
14371466
outputs = ["Qd"]
14381467

14391468

1469+
class veor_nodt(MVEInstruction):
    # VEOR written without a `.<dt>` datatype suffix, e.g. `veor q2, q0, q1`.
    # Two vector sources (Qn, Qm), one vector destination (Qd).
    pattern = "veor <Qd>, <Qn>, <Qm>"
    inputs = ["Qn", "Qm"]
    outputs = ["Qd"]
1473+
1474+
14401475
class nop(MVEInstruction):
14411476
pattern = "nop"
14421477

slothy/targets/arm_v81m/cortex_m55r1.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,10 @@
8080
vhcadd,
8181
vand,
8282
vbic,
83+
vbic_nodt,
8384
vorr,
8485
veor,
86+
veor_nodt,
8587
vmulh,
8688
vmul_T1,
8789
vmul_T2,
@@ -100,6 +102,7 @@
100102
vmulf_T1,
101103
vmulf_T2,
102104
ldrd,
105+
ldrd_no_imm,
103106
ldrd_with_writeback,
104107
ldrd_with_post,
105108
strd,
@@ -316,8 +319,10 @@ def get_min_max_objective(slothy):
316319
vhcadd: ExecutionUnit.VEC_INT,
317320
vand: ExecutionUnit.VEC_INT,
318321
vbic: ExecutionUnit.VEC_INT,
322+
vbic_nodt: ExecutionUnit.VEC_INT,
319323
vorr: ExecutionUnit.VEC_INT,
320324
veor: ExecutionUnit.VEC_INT,
325+
veor_nodt: ExecutionUnit.VEC_INT,
321326
vmulh: ExecutionUnit.VEC_MUL,
322327
vmul_T1: ExecutionUnit.VEC_MUL,
323328
vmul_T2: ExecutionUnit.VEC_MUL,
@@ -337,6 +342,7 @@ def get_min_max_objective(slothy):
337342
vmulf_T1: ExecutionUnit.VEC_FPU,
338343
vmulf_T2: ExecutionUnit.VEC_FPU,
339344
ldrd: ExecutionUnit.LOAD,
345+
ldrd_no_imm: ExecutionUnit.LOAD,
340346
ldrd_with_writeback: ExecutionUnit.LOAD,
341347
ldrd_with_post: ExecutionUnit.LOAD,
342348
strd: ExecutionUnit.STORE,
@@ -422,6 +428,7 @@ def get_min_max_objective(slothy):
422428
ldr_with_writeback,
423429
ldr_with_post,
424430
ldrd,
431+
ldrd_no_imm,
425432
ldrd_with_writeback,
426433
ldrd_with_post,
427434
strd,
@@ -459,8 +466,10 @@ def get_min_max_objective(slothy):
459466
vhcadd,
460467
vand,
461468
vbic,
469+
vbic_nodt,
462470
vorr,
463471
veor,
472+
veor_nodt,
464473
vmulh,
465474
vmul_T1,
466475
vmul_T2,
@@ -544,6 +553,7 @@ def get_min_max_objective(slothy):
544553
default_latencies = {
545554
(
546555
ldrd,
556+
ldrd_no_imm,
547557
ldrd_with_writeback,
548558
ldrd_with_post,
549559
): 2,
@@ -580,8 +590,10 @@ def get_min_max_objective(slothy):
580590
vaddva,
581591
vand,
582592
vbic,
593+
vbic_nodt,
583594
vorr,
584595
veor,
596+
veor_nodt,
585597
qsave,
586598
save,
587599
qrestore,
@@ -744,8 +756,10 @@ def get_latency(src, out_idx, dst):
744756
vhcadd,
745757
vand,
746758
vbic,
759+
vbic_nodt,
747760
vorr,
748761
veor,
762+
veor_nodt,
749763
vrshr,
750764
vshrnb,
751765
vshrnt,

slothy/targets/arm_v81m/cortex_m85r1.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,10 @@
8080
vhcadd,
8181
vand,
8282
vbic,
83+
vbic_nodt,
8384
vorr,
8485
veor,
86+
veor_nodt,
8587
vmulh,
8688
vmul_T1,
8789
vmul_T2,
@@ -100,6 +102,7 @@
100102
vmulf_T1,
101103
vmulf_T2,
102104
ldrd,
105+
ldrd_no_imm,
103106
ldrd_with_writeback,
104107
ldrd_with_post,
105108
strd,
@@ -308,8 +311,10 @@ def get_min_max_objective(slothy):
308311
vhcadd: ExecutionUnit.VEC_INT,
309312
vand: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB],
310313
vbic: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB],
314+
vbic_nodt: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB],
311315
vorr: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB],
312316
veor: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB],
317+
veor_nodt: [ExecutionUnit.VEC_BITWA, ExecutionUnit.VEC_BITWB],
313318
vmulh: ExecutionUnit.VEC_MUL,
314319
vmul_T1: ExecutionUnit.VEC_MUL,
315320
vmul_T2: ExecutionUnit.VEC_MUL,
@@ -330,6 +335,7 @@ def get_min_max_objective(slothy):
330335
vmulf_T1: ExecutionUnit.VEC_FPMUL,
331336
vmulf_T2: ExecutionUnit.VEC_FPMUL,
332337
ldrd: ExecutionUnit.LOAD,
338+
ldrd_no_imm: ExecutionUnit.LOAD,
333339
ldrd_with_writeback: ExecutionUnit.LOAD,
334340
ldrd_with_post: ExecutionUnit.LOAD,
335341
strd: ExecutionUnit.STORE,
@@ -416,6 +422,7 @@ def get_min_max_objective(slothy):
416422
ldr_with_writeback,
417423
ldr_with_post,
418424
ldrd,
425+
ldrd_no_imm,
419426
ldrd_with_writeback,
420427
ldrd_with_post,
421428
strd,
@@ -453,8 +460,10 @@ def get_min_max_objective(slothy):
453460
vhcadd,
454461
vand,
455462
vbic,
463+
vbic_nodt,
456464
vorr,
457465
veor,
466+
veor_nodt,
458467
vmulh,
459468
vmul_T1,
460469
vmul_T2,
@@ -535,6 +544,7 @@ def get_min_max_objective(slothy):
535544
default_latencies = {
536545
(
537546
ldrd,
547+
ldrd_no_imm,
538548
ldrd_with_post,
539549
ldrd_with_writeback,
540550
): 2,
@@ -570,8 +580,10 @@ def get_min_max_objective(slothy):
570580
vhcadd,
571581
vand,
572582
vbic,
583+
vbic_nodt,
573584
vorr,
574585
veor,
586+
veor_nodt,
575587
qsave,
576588
save,
577589
qrestore,
@@ -755,8 +767,10 @@ def get_latency(src, out_idx, dst):
755767
vhcadd,
756768
vand,
757769
vbic,
770+
vbic_nodt,
758771
vorr,
759772
veor,
773+
veor_nodt,
760774
vrshr,
761775
vshrnb,
762776
vshrnt,

slothy/targets/arm_v81m/helium_experimental.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,10 @@
7373
vhcadd,
7474
vand,
7575
vbic,
76+
vbic_nodt,
7677
vorr,
7778
veor,
79+
veor_nodt,
7880
vmulh,
7981
vmul_T1,
8082
vmul_T2,
@@ -93,6 +95,7 @@
9395
vmulf_T1,
9496
vmulf_T2,
9597
ldrd,
98+
ldrd_no_imm,
9699
ldrd_with_writeback,
97100
ldrd_with_post,
98101
strd,
@@ -234,8 +237,10 @@ def get_min_max_objective(slothy):
234237
vhcadd: ExecutionUnit.VEC_INT,
235238
vand: ExecutionUnit.VEC_INT,
236239
vbic: ExecutionUnit.VEC_INT,
240+
vbic_nodt: ExecutionUnit.VEC_INT,
237241
vorr: ExecutionUnit.VEC_INT,
238242
veor: ExecutionUnit.VEC_INT,
243+
veor_nodt: ExecutionUnit.VEC_INT,
239244
vmulh: ExecutionUnit.VEC_MUL,
240245
vmul_T1: ExecutionUnit.VEC_MUL,
241246
vmul_T2: ExecutionUnit.VEC_MUL,
@@ -254,6 +259,7 @@ def get_min_max_objective(slothy):
254259
vmulf_T1: ExecutionUnit.VEC_FPU,
255260
vmulf_T2: ExecutionUnit.VEC_FPU,
256261
ldrd: ExecutionUnit.LOAD,
262+
ldrd_no_imm: ExecutionUnit.LOAD,
257263
ldrd_with_writeback: ExecutionUnit.LOAD,
258264
ldrd_with_post: ExecutionUnit.LOAD,
259265
strd: ExecutionUnit.STORE,
@@ -339,6 +345,7 @@ def get_min_max_objective(slothy):
339345
ldr_with_writeback,
340346
ldr_with_post,
341347
ldrd,
348+
ldrd_no_imm,
342349
ldrd_with_writeback,
343350
ldrd_with_post,
344351
strd,
@@ -376,8 +383,10 @@ def get_min_max_objective(slothy):
376383
vhcadd,
377384
vand,
378385
vbic,
386+
vbic_nodt,
379387
vorr,
380388
veor,
389+
veor_nodt,
381390
vmulh,
382391
vmul_T1,
383392
vmul_T2,
@@ -458,6 +467,7 @@ def get_min_max_objective(slothy):
458467
default_latencies = {
459468
(
460469
ldrd,
470+
ldrd_no_imm,
461471
ldrd_with_writeback,
462472
ldrd_with_post,
463473
): 2,
@@ -493,8 +503,10 @@ def get_min_max_objective(slothy):
493503
vhcadd,
494504
vand,
495505
vbic,
506+
vbic_nodt,
496507
vorr,
497508
veor,
509+
veor_nodt,
498510
qsave,
499511
save,
500512
qrestore,
@@ -676,8 +688,10 @@ def get_latency(src, out_idx, dst):
676688
vhcadd,
677689
vand,
678690
vbic,
691+
vbic_nodt,
679692
vorr,
680693
veor,
694+
veor_nodt,
681695
vrshr,
682696
vshrnb,
683697
vshrnt,

tests/naive/armv8m/instructions.s

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ vqdmulh.s8 q2, q0, q1
5555
vqdmulh.s16 q2, q0, q1
5656
vqdmulh.s32 q2, q0, q1
5757

58+
ldrd r0, r1, [r2]
5859
ldrd r0, r1, [r2, #16]
5960
ldrd r0, r1, [r2, #-16]
6061
ldrd r0, r1, [r2], #16
@@ -236,15 +237,18 @@ vand.u64 q2, q0, q1
236237
vbic.u8 q2, q0, q1
237238
vbic.u16 q2, q0, q1
238239
vbic.u32 q2, q0, q1
240+
vbic q2, q0, q1
239241

240242
vorr.u8 q2, q0, q1
241243
vorr.u16 q2, q0, q1
242244
vorr.u32 q2, q0, q1
243245
vorr.u64 q2, q0, q1
244246

245-
vorr.u8 q2, q0, q1
246-
vorr.u16 q2, q0, q1
247-
vorr.u32 q2, q0, q1
247+
veor.u8 q2, q0, q1
248+
veor.u16 q2, q0, q1
249+
veor.u32 q2, q0, q1
250+
veor.u64 q2, q0, q1
251+
veor q2, q0, q1
248252

249253
nop
250254

0 commit comments

Comments
 (0)