Skip to content

Commit dcf576b

Browse files
SH1E0r1r2ymkannwischer
authored and committed
Armv7-M: Fix fusion when registers in ldm/ldrd overlap
Previously, ldm and ldrd fusion would break if the same register was used both as the address and as one of the outputs (and it was not the last output). This commit fixes that by changing the fusion to move the ldr that overwrites the address register to the very end whenever there is such an overlap. Note that this is not needed for stm/strd, since no such overlap can occur there. Additionally, it removes the unnecessary restrictions disallowing Rd=Ra for ldrb/ldrh/ldr.
1 parent 6142139 commit dcf576b

File tree

4 files changed

+123
-68
lines changed

4 files changed

+123
-68
lines changed

example.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -669,7 +669,9 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):
669669
def core(self,slothy):
670670
slothy.config.variable_size=True
671671
slothy.config.inputs_are_outputs = True
672+
slothy.fusion_region("start", "end", ssa=False)
672673
slothy.optimize(start="start", end="end")
674+
673675

674676
class Armv7mExample0Func(Example):
675677
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7):

examples/naive/armv7m/armv7m_simple0.s

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,4 +29,8 @@ smlabt r3,r2, r2, r1
2929
asrs r3, r3,#1
3030
str r3, [r0,#4] // @slothy:writes=a
3131

32+
ldrd r0, r3, [r0, #4]
33+
ldm r0 ,{r0-r2}
34+
add r2,r3,r2
35+
str r1, [sp, #0]
3236
end:
Lines changed: 91 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -1,69 +1,96 @@
11

22
start:
3-
// Instructions: 24
4-
// Expected cycles: 26
5-
// Expected IPC: 0.92
6-
//
7-
// Cycle bound: 26.0
8-
// IPC bound: 0.92
9-
//
10-
// Wall time: 0.20s
11-
// User time: 0.20s
12-
//
13-
// ----- cycle (expected) ------>
14-
// 0 25
15-
// |------------------------|----
16-
ldr r6, [r0, #4] // *............................. // @slothy:reads=a
17-
add r10, r2, r6 // .*............................
18-
eor.w r1, r10, r3 // ..*...........................
19-
smlabt r7, r2, r2, r1 // ..*...........................
20-
asrs r5, r7, #1 // ....*.........................
21-
str r5, [r0, #4] // ....*......................... // @slothy:writes=a
22-
ldm r0, {r7,r9,r11} // .....*........................ // @slothy:reads=a
23-
add r8, r9, r7 // ........*.....................
24-
eor.w r2, r8, r11 // .........*....................
25-
smlabt r12, r9, r9, r2 // .........*....................
26-
asrs r11, r12, #1 // ...........*..................
27-
str r11, [r0, #4] // ...........*.................. // @slothy:writes=a
28-
ldm r0, {r7,r8,r10} // ............*................. // @slothy:reads=a
29-
add r6, r8, r7 // ...............*..............
30-
eor.w r5, r6, r10 // ................*.............
31-
smlabt r12, r8, r8, r5 // ................*.............
32-
asrs r9, r12, #1 // ..................*...........
33-
str r9, [r0, #4] // ..................*........... // @slothy:writes=a
34-
ldm r0, {r1,r2,r8} // ...................*.......... // @slothy:reads=a
35-
add r14, r2, r1 // ......................*.......
36-
eor.w r5, r14, r8 // .......................*......
37-
smlabt r10, r2, r2, r5 // .......................*......
38-
asrs r3, r10, #1 // .........................*....
39-
str r3, [r0, #4] // .........................*.... // @slothy:writes=a
40-
41-
// ------ cycle (expected) ------>
3+
// Instructions: 37
4+
// Expected cycles: 36
5+
// Expected IPC: 1.03
6+
//
7+
// Cycle bound: 36.0
8+
// IPC bound: 1.03
9+
//
10+
// Wall time: 0.10s
11+
// User time: 0.10s
12+
//
13+
// -------- cycle (expected) --------->
4214
// 0 25
43-
// |------------------------|-----
44-
// ldr r1, [r0, #4] // *..............................
45-
// add r1, r2,r1 // .*.............................
46-
// eor.w r1,r1, r3 // ..*............................
47-
// smlabt r3,r2, r2, r1 // ..*............................
48-
// asrs r3, r3,#1 // ....*..........................
49-
// str r3, [r0,#4] // ....*..........................
50-
// ldm r0, {r1-r2,r14} // .....*.........................
51-
// add r1, r2,r1 // ........*......................
52-
// eor.w r1,r1, r14 // .........*.....................
53-
// smlabt r3,r2, r2, r1 // .........*.....................
54-
// asrs r3, r3,#1 // ...........*...................
55-
// str r3, [r0,#4] // ...........*...................
56-
// ldm r0, {r1-r3} // ............*..................
57-
// add r1, r2,r1 // ...............*...............
58-
// eor.w r1,r1, r3 // ................*..............
59-
// smlabt r3,r2, r2, r1 // ................*..............
60-
// asrs r3, r3,#1 // ..................*............
61-
// str r3, [r0,#4] // ..................*............
62-
// ldm r0, {r1,r2,r3} // ...................*...........
63-
// add r1, r2,r1 // ......................*........
64-
// eor.w r1,r1, r3 // .......................*.......
65-
// smlabt r3,r2, r2, r1 // .......................*.......
66-
// asrs r3, r3,#1 // .........................*.....
67-
// str r3, [r0,#4] // .........................*.....
15+
// |------------------------|----------
16+
ldr r4, [r0, #4] // *................................... // @slothy:reads=a
17+
add r4, r2, r4 // .*..................................
18+
eor.w r4, r4, r3 // ..*.................................
19+
smlabt r2, r2, r2, r4 // ..*.................................
20+
asrs r2, r2, #1 // ....*...............................
21+
str r2, [r0, #4] // ....*............................... // @slothy:writes=a
22+
ldr r3, [r0, #0] // ........*........................... // @slothy:reads=a
23+
ldr r2, [r0, #8] // .........*.......................... // @slothy:reads=a
24+
ldr r6, [r0, #4] // .........*.......................... // @slothy:reads=a
25+
add r5, r6, r3 // ..........*.........................
26+
eor.w r4, r5, r2 // ...........*........................
27+
smlabt r2, r6, r6, r4 // ...........*........................
28+
asrs r2, r2, #1 // .............*......................
29+
str r2, [r0, #4] // .............*...................... // @slothy:writes=a
30+
ldr r4, [r0, #0] // .................*.................. // @slothy:reads=a
31+
ldr r14, [r0, #8] // ..................*................. // @slothy:reads=a
32+
ldr r2, [r0, #4] // ..................*................. // @slothy:reads=a
33+
add r6, r2, r4 // ...................*................
34+
ldr r4, [r0, #4] // ...................*................
35+
eor.w r6, r6, r14 // ....................*...............
36+
smlabt r2, r2, r2, r6 // ....................*...............
37+
ldr r14, [r4, #4] // .....................*..............
38+
asrs r2, r2, #1 // ......................*.............
39+
str r2, [r0, #4] // ......................*............. // @slothy:writes=a
40+
ldr r3, [r0, #0] // ..........................*......... // @slothy:reads=a
41+
ldr r5, [r0, #8] // ...........................*........ // @slothy:reads=a
42+
ldr r6, [r0, #4] // ...........................*........ // @slothy:reads=a
43+
add r2, r6, r3 // ............................*.......
44+
ldr r3, [r0, #8] // ............................*.......
45+
eor.w r2, r2, r5 // .............................*......
46+
smlabt r6, r6, r6, r2 // .............................*......
47+
ldr r2, [r4, #8] // ..............................*.....
48+
str r14, [sp, #0] // ..............................*.....
49+
asrs r14, r6, #1 // ...............................*....
50+
str r14, [r0, #4] // ...............................*.... // @slothy:writes=a
51+
add r2, r3, r2 // ................................*...
52+
ldr r0, [r4, #0] // ...................................*
53+
54+
// -------- cycle (expected) --------->
55+
// 0 25
56+
// |------------------------|----------
57+
// ldr r1, [r0, #4] // *...................................
58+
// add r1, r2, r1 // .*..................................
59+
// eor.w r1, r1, r3 // ..*.................................
60+
// smlabt r3, r2, r2, r1 // ..*.................................
61+
// asrs r3, r3, #1 // ....*...............................
62+
// str r3, [r0, #4] // ....*...............................
63+
// ldr r1, [r0, #0] // ........*...........................
64+
// ldr r2, [r0, #4] // .........*..........................
65+
// ldr r14, [r0, #8] // .........*..........................
66+
// add r1, r2, r1 // ..........*.........................
67+
// eor.w r1, r1, r14 // ...........*........................
68+
// smlabt r3, r2, r2, r1 // ...........*........................
69+
// asrs r3, r3, #1 // .............*......................
70+
// str r3, [r0, #4] // .............*......................
71+
// ldr r1, [r0, #0] // .................*..................
72+
// ldr r2, [r0, #4] // ..................*.................
73+
// ldr r3, [r0, #8] // ..................*.................
74+
// add r1, r2, r1 // ...................*................
75+
// eor.w r1, r1, r3 // ....................*...............
76+
// smlabt r3, r2, r2, r1 // ....................*...............
77+
// asrs r3, r3, #1 // ......................*.............
78+
// str r3, [r0, #4] // ......................*.............
79+
// ldr r1, [r0, #0] // ..........................*.........
80+
// ldr r2, [r0, #4] // ...........................*........
81+
// ldr r3, [r0, #8] // ...........................*........
82+
// add r1, r2, r1 // ............................*.......
83+
// eor.w r1, r1, r3 // .............................*......
84+
// smlabt r3, r2, r2, r1 // .............................*......
85+
// asrs r3, r3, #1 // ...............................*....
86+
// str r3, [r0, #4] // ...............................*....
87+
// ldr r3, [r0, #8] // ............................*.......
88+
// ldr r0, [r0, #4] // ...................*................
89+
// ldr r1, [r0, #4] // .....................*..............
90+
// ldr r2, [r0, #8] // ..............................*.....
91+
// ldr r0, [r0, #0] // ...................................*
92+
// add r2, r3, r2 // ................................*...
93+
// str r1, [sp, #0] // ..............................*.....
6894

6995
end:
96+

slothy/targets/arm_v7m/arch_v7m.py

Lines changed: 26 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1486,7 +1486,6 @@ def make(cls, src):
14861486
obj.increment = None
14871487
obj.pre_index = 0
14881488
obj.addr = obj.args_in[0]
1489-
obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra
14901489
return obj
14911490

14921491
def write(self):
@@ -1505,7 +1504,6 @@ def make(cls, src):
15051504
obj.increment = None
15061505
obj.pre_index = obj.immediate
15071506
obj.addr = obj.args_in[0]
1508-
obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra
15091507
return obj
15101508

15111509
def write(self):
@@ -1528,7 +1526,6 @@ def make(cls, src):
15281526
obj = Armv7mInstruction.build(cls, src)
15291527
obj.increment = None
15301528
obj.pre_index = obj.immediate
1531-
obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra
15321529
obj.addr = obj.args_in[0]
15331530
return obj
15341531

@@ -1545,7 +1542,6 @@ def make(cls, src):
15451542
obj = Armv7mInstruction.build(cls, src)
15461543
obj.increment = None
15471544
obj.pre_index = obj.immediate
1548-
obj.args_in_out_different = [(0,0)] # Can't have Rd==Ra
15491545
obj.addr = obj.args_in[0]
15501546
return obj
15511547

@@ -1940,6 +1936,19 @@ def core(inst,t,log=None):
19401936
add_comments(inst.source_line.comments)
19411937
ldr.source_line = ldr_src
19421938

1939+
# In case the address register is also contained in the
1940+
# register list, we need to overwrite the address register
1941+
# in the last ldr
1942+
ldrs_reordered = []
1943+
for ldr, reg in zip(ldrs, regs):
1944+
if reg != ptr:
1945+
ldrs_reordered.append(ldr)
1946+
1947+
for ldr, reg in zip(ldrs, regs):
1948+
if reg == ptr:
1949+
ldrs_reordered.append(ldr)
1950+
ldrs = ldrs_reordered
1951+
19431952
if log is not None:
19441953
log(f"ldm splitting: {t.inst}; {[ldr for ldr in ldrs]}")
19451954

@@ -2128,6 +2137,19 @@ def core(inst,t,log=None):
21282137
add_comments(inst.source_line.comments)
21292138
ldr.source_line = ldr_src
21302139

2140+
# In case the address register is also contained in the
2141+
# register list, we need to overwrite the address register
2142+
# in the last ldr
2143+
ldrs_reordered = []
2144+
for ldr, reg in zip(ldrs, regs):
2145+
if reg != ptr:
2146+
ldrs_reordered.append(ldr)
2147+
2148+
for ldr, reg in zip(ldrs, regs):
2149+
if reg == ptr:
2150+
ldrs_reordered.append(ldr)
2151+
ldrs = ldrs_reordered
2152+
21312153
if log is not None:
21322154
log(f"ldrd splitting: {t.inst}; {[ldr for ldr in ldrs]}")
21332155

0 commit comments

Comments
 (0)