Skip to content

Commit 97c13c1

Browse files
committed
Add short message buffer to IPC fastpath
This extends IPC fastpath capacity from 32 bytes (MR0-MR7 register only) to 160 bytes by adding a 128-byte buffer in TCB structure. - Add msg_buffer[32] to TCB (maps to MR8-MR39) - Update ipc_read_mr/ipc_write_mr for three-tier MR access - Extend fastpath eligibility from n_untyped<=7 to n_untyped<=39 - Add ipc-fastpath.h with optimized copy logic - Preserve R4-R11 across SVC handler for message register integrity - Add pointer validation in user_log for safety Memory layout: - MR0-MR7: ctx.regs[0-7] (32 bytes, R4-R11 registers) - MR8-MR39: msg_buffer[0-31] (128 bytes, TCB buffer) - MR40-MR47: utcb->mr[0-7] (32 bytes, UTCB overflow) Inspired by QNX Neutrino short message optimization.
1 parent 66f384f commit 97c13c1

File tree

11 files changed

+435
-33
lines changed

11 files changed

+435
-33
lines changed

include/l4/ipc.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,13 @@
99
#define IPC_TI_MAP_GRANT 0x8
1010
#define IPC_TI_GRANT 0x2
1111

12-
#define IPC_MR_COUNT 16
12+
/* Total MR capacity with short message buffer:
13+
* MR0-MR7: 8 words in registers (ctx.regs[0-7])
14+
* MR8-MR39: 32 words in msg_buffer[0-31]
15+
* MR40-MR47: 8 words in UTCB (utcb->mr[0-7])
16+
* Total: 48 MRs = 192 bytes
17+
*/
18+
#define IPC_MR_COUNT 48
1319

1420
typedef union {
1521
struct {

include/l4/utcb.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,15 @@ struct utcb {
2525
uint32_t thread_word_1;
2626
uint32_t thread_word_2;
2727
/* +12w */
28-
uint32_t mr[8]; /* MRs 8-15 (0-8 are laying in
29-
r4..r11 [thread's context]) */
28+
/* Message Registers (MR) mapping with short message buffer:
29+
* MR0-MR7: Hardware registers R4-R11 (ctx.regs[0-7]) - 32 bytes
30+
* MR8-MR39: Short message buffer (tcb->msg_buffer[0-31]) - 128 bytes
31+
* MR40-MR47: UTCB overflow (mr[0-7]) - 32 bytes
32+
*
33+
* Total message capacity: 192 bytes (48 words)
34+
* Fastpath capacity: 160 bytes (40 words, MR0-MR39)
35+
*/
36+
uint32_t mr[8]; /* MRs 40-47 (overflow beyond short buffer) */
3037
/* +20w */
3138
uint32_t br[8];
3239
/* +28w */

include/platform/ipc-fastpath.h

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
/* Copyright (c) 2026 The F9 Microkernel Project. All rights reserved.
2+
* Use of this source code is governed by a BSD-style license that can be
3+
* found in the LICENSE file.
4+
*/
5+
6+
#ifndef PLATFORM_IPC_FASTPATH_H_
7+
#define PLATFORM_IPC_FASTPATH_H_
8+
9+
#include <debug.h>
10+
#include <ipc.h>
11+
#include <platform/armv7m.h>
12+
#include <platform/irq.h>
13+
#include <sched.h>
14+
#include <syscall.h>
15+
#include <thread.h>
16+
#include <types.h>
17+
18+
/**
19+
* Fastpath IPC optimization for ARM Cortex-M.
20+
*
21+
* Bypasses softirq scheduling for simple short-message IPC by performing
22+
* direct register transfer in the SVC handler context.
23+
*
24+
* CRITICAL: Message registers (MR0-MR7) are in R4-R11, NOT R0-R7.
25+
* - R0-R3: Syscall parameters (to_tid, from_tid, timeout, unused)
26+
* - R4-R11: Message registers (MR0-MR7) via ctx.regs[0-7]
27+
*
28+
* This implementation uses a NAKED wrapper to capture R4-R11 immediately
29+
* before any compiler-generated prologue can clobber them.
30+
*
31+
* Implementation is in header as static inline for zero call overhead.
32+
*/
33+
34+
/**
35+
* ipc_fastpath_copy_mrs() - Copy message registers to receiver
36+
* @saved_mrs: Saved message registers R4-R11 (MR0-MR7)
37+
* @sender: Source thread (for msg_buffer access)
38+
* @receiver: Destination thread
39+
* @n_untyped: Number of untyped words to copy (0-39)
40+
*
41+
* Copies MR0-MR{n_untyped} from sender to receiver:
42+
* - MR0-MR7: From saved_mrs to receiver->ctx.regs[0-7]
43+
* - MR8-MR39: From sender->msg_buffer to receiver->msg_buffer (NEW)
44+
*
45+
* WCET: ~20 cycles (MR0-MR7) + ~100 cycles (MR8-MR39, if used)
46+
*/
47+
static inline void ipc_fastpath_copy_mrs(volatile uint32_t *saved_mrs,
48+
struct tcb *sender,
49+
struct tcb *receiver,
50+
int n_untyped)
51+
{
52+
int count = n_untyped + 1; /* +1 for tag in MR0 */
53+
int i;
54+
55+
/* Phase 1: Copy MR0-MR7 from saved registers (R4-R11) */
56+
for (i = 0; i < count && i < 8; i++)
57+
receiver->ctx.regs[i] = saved_mrs[i];
58+
59+
/* Phase 2: Copy MR8-MR39 from sender's msg_buffer (if needed) */
60+
if (count > 8) {
61+
int buf_count = count - 8; /* Number of words in buffer */
62+
if (buf_count > 32)
63+
buf_count = 32; /* Clamp to buffer size */
64+
65+
for (i = 0; i < buf_count; i++)
66+
receiver->msg_buffer[i] = sender->msg_buffer[i];
67+
}
68+
}
69+
70+
/**
71+
* ipc_fastpath_helper() - Fastpath IPC implementation (C helper)
72+
* @caller: Current thread attempting IPC
73+
* @svc_param: SVC stack frame (R0-R3, R12, LR, PC, xPSR)
74+
* @saved_mrs: Pre-saved R4-R11 message registers
75+
*
76+
* Called by the naked wrapper after message registers have been captured.
77+
* Returns 1 if fastpath succeeded, 0 to fall back to slowpath.
78+
*/
79+
static __attribute__((used)) inline int ipc_fastpath_helper(struct tcb *caller,
80+
uint32_t *svc_param,
81+
volatile uint32_t *saved_mrs)
82+
{
83+
struct tcb *to_thr;
84+
l4_thread_t to_tid, from_tid;
85+
ipc_msg_tag_t tag;
86+
87+
/* Extract IPC parameters from hardware stack (R0-R3) */
88+
to_tid = svc_param[REG_R0];
89+
from_tid = svc_param[REG_R1];
90+
91+
/* Extract tag from saved MR0 (R4), NOT from R0! */
92+
tag.raw = saved_mrs[0];
93+
94+
/* Fastpath Eligibility Check */
95+
96+
/* Criterion 1: Simple send (to_tid valid, from_tid = NILTHREAD) */
97+
if (to_tid == L4_NILTHREAD || from_tid != L4_NILTHREAD)
98+
return 0; /* Slowpath: receive-only or send+receive */
99+
100+
/* Criterion 2: No typed items (no MapItems/GrantItems) */
101+
if (tag.s.n_typed != 0)
102+
return 0; /* Slowpath: requires map_area() processing */
103+
104+
/* Criterion 3: Short message (fits in MR0-MR39: registers + buffer)
105+
* MR0-MR7: 8 words × 4 bytes = 32 bytes (registers)
106+
* MR8-MR39: 32 words × 4 bytes = 128 bytes (buffer)
107+
* Total capacity: 160 bytes (40 words)
108+
*/
109+
if (tag.s.n_untyped > 39)
110+
return 0; /* Slowpath: requires UTCB access */
111+
112+
/* Criterion 4: Receiver exists and is blocked waiting */
113+
to_thr = thread_by_globalid(to_tid);
114+
if (!to_thr || to_thr->state != T_RECV_BLOCKED)
115+
return 0; /* Slowpath: receiver not ready */
116+
117+
/* Criterion 5: Receiver is waiting for us */
118+
if (to_thr->ipc_from != L4_ANYTHREAD &&
119+
to_thr->ipc_from != caller->t_globalid)
120+
return 0; /* Slowpath: receiver waiting for someone else */
121+
122+
/* Criterion 6: Special thread handling */
123+
if (to_tid == TID_TO_GLOBALID(THREAD_LOG) ||
124+
to_tid == TID_TO_GLOBALID(THREAD_IRQ_REQUEST))
125+
return 0; /* Slowpath: special kernel threads */
126+
127+
/* Criterion 7: Thread start protocol */
128+
if (tag.raw == 0x00000005)
129+
return 0; /* Slowpath: thread initialization */
130+
131+
/* All criteria met - Execute Fastpath */
132+
133+
/* Phase 0: Dequeue caller (will re-enqueue later) */
134+
extern void sched_dequeue(struct tcb *);
135+
sched_dequeue(caller);
136+
137+
/* Phase 1: Copy message registers from sender to receiver
138+
* - MR0-MR7: From saved registers (R4-R11)
139+
* - MR8-MR39: From sender's msg_buffer to receiver's msg_buffer (if
140+
* needed)
141+
*/
142+
ipc_fastpath_copy_mrs(saved_mrs, caller, to_thr, tag.s.n_untyped);
143+
144+
/* Phase 2: Update receiver context */
145+
/* Set R0 to sender ID (IPC protocol) */
146+
((uint32_t *) to_thr->ctx.sp)[REG_R0] = caller->t_globalid;
147+
to_thr->utcb->sender = caller->t_globalid;
148+
149+
/* Phase 3: Update thread states */
150+
151+
/* Clear timeout events (no timeout in fastpath) */
152+
caller->timeout_event = 0;
153+
to_thr->timeout_event = 0;
154+
155+
/* Receiver becomes runnable with IPC priority boost */
156+
to_thr->state = T_RUNNABLE;
157+
to_thr->ipc_from = L4_NILTHREAD;
158+
sched_set_priority(to_thr, SCHED_PRIO_IPC);
159+
sched_enqueue(to_thr);
160+
161+
/* Caller continues (send-only, no reply expected)
162+
* Fastpath only handles from_tid==NILTHREAD (simple send).
163+
* For L4_Call (send+receive), slowpath handles blocking.
164+
*
165+
* Re-enqueue caller (was dequeued at SVC entry).
166+
* It's safe to enqueue current thread - sched has double-enqueue
167+
* protection.
168+
*/
169+
caller->state = T_RUNNABLE;
170+
sched_enqueue(caller);
171+
172+
/* Phase 4: Request context switch via PendSV */
173+
/* DON'T do immediate switch - let PendSV handle it normally */
174+
request_schedule();
175+
176+
return 1; /* Fastpath succeeded */
177+
}
178+
179+
/**
180+
* ipc_try_fastpath() - IPC fastpath using pre-saved R4-R11
181+
* @caller: Current thread attempting IPC
182+
* @svc_param: SVC stack frame (R0-R3, R12, LR, PC, xPSR)
183+
*
184+
* Reads message registers from __irq_saved_regs which were saved by
185+
* SVC_HANDLER before any C code ran, ensuring MR0-MR7 are untouched.
186+
*
187+
* Returns:
188+
* 1 if fastpath succeeded (caller should skip slowpath)
189+
* 0 if fastpath unavailable (caller must use slowpath)
190+
*
191+
* Eligibility criteria:
192+
* - Simple send (to_tid valid, from_tid == NILTHREAD)
193+
* - Short message (n_untyped <= 39, n_typed == 0)
194+
* - Receiver ready (T_RECV_BLOCKED, waiting for caller or ANYTHREAD)
195+
*/
196+
static inline int ipc_try_fastpath(struct tcb *caller, uint32_t *svc_param)
197+
{
198+
extern volatile uint32_t __irq_saved_regs[8];
199+
200+
/* Read from global __irq_saved_regs saved by SVC_HANDLER */
201+
return ipc_fastpath_helper(caller, svc_param, __irq_saved_regs);
202+
}
203+
204+
#endif /* PLATFORM_IPC_FASTPATH_H_ */

include/platform/irq.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,38 @@ extern volatile uint32_t __irq_saved_regs[8];
317317
request_schedule(); \
318318
irq_return(); \
319319
}
320+
321+
/*
322+
* SVC_HANDLER - Specialized handler for SVC exceptions requiring R4-R11
323+
* preservation.
324+
*
325+
* Unlike IRQ_HANDLER, this variant saves R4-R11 to __irq_saved_regs BEFORE
326+
* calling the C handler, ensuring message registers are captured untouched.
327+
*
328+
* This is critical for IPC fastpath optimization where user message registers
329+
* (MR0-MR7 in R4-R11) must be available before any C code runs.
330+
*
331+
* IMPORTANT: Must restore R4-R11 before returning since the C handler may
332+
* clobber these registers.
333+
*
334+
* Usage: SVC_HANDLER(svc_handler, __svc_handler);
335+
*/
336+
#define SVC_HANDLER(name, sub) \
337+
void name(void) __NAKED; \
338+
void name(void) \
339+
{ \
340+
irq_enter(); \
341+
irq_save_regs_only(); \
342+
sub(); \
343+
request_schedule(); \
344+
/* Restore R4-R11 before returning */ \
345+
__asm__ __volatile__( \
346+
"ldr r0, =__irq_saved_regs\n\t" \
347+
"ldm r0, {r4-r11}" :: \
348+
: "r0", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", \
349+
"memory"); \
350+
irq_return(); \
351+
}
320352
extern volatile tcb_t *current;
321353

322354
#endif /* PLATFORM_IRQ_H_ */

include/thread.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,22 @@ struct tcb {
166166
uint8_t notify_pending;
167167

168168
uint8_t _notify_pad[1]; /* Alignment padding */
169+
170+
/* Short message buffer for IPC fastpath optimization.
171+
* Extends fastpath coverage from 32 bytes (registers only) to 160 bytes.
172+
* Maps to L4 virtual registers MR8-MR39 (32 additional 4-byte registers).
173+
*
174+
* Memory layout:
175+
* - MR0-MR7: ctx.regs[0-7] (32 bytes, R4-R11 hardware registers)
176+
* - MR8-MR39: msg_buffer[0-31] (128 bytes, TCB-embedded buffer)
177+
* - MR40-MR47: utcb->mr[0-7] (32 bytes, UTCB overflow)
178+
*
179+
* Fastpath eligibility: n_untyped <= 39 (160 bytes total)
180+
* Expected fastpath coverage: 70% → 95%
181+
*
182+
* RAM impact: 128 bytes × 20 threads = 2.5 KB (1.3% of 192KB)
183+
*/
184+
uint32_t msg_buffer[32];
169185
};
170186
typedef struct tcb tcb_t;
171187

kernel/ipc.c

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,33 @@ static inline void thread_make_runnable(tcb_t *thr)
3535
sched_enqueue(thr);
3636
}
3737

38+
/* Read message register with short buffer support.
39+
* MR0-MR7: Hardware registers R4-R11 (ctx.regs[0-7])
40+
* MR8-MR39: Short message buffer (msg_buffer[0-31]) - NEW
41+
* MR40-MR47: UTCB overflow (utcb->mr[0-7])
42+
*/
3843
uint32_t ipc_read_mr(tcb_t *from, int i)
3944
{
40-
if (i >= 8)
41-
return from->utcb->mr[i - 8];
42-
return from->ctx.regs[i];
45+
if (i < 8)
46+
return from->ctx.regs[i];
47+
if (i < 40)
48+
return from->msg_buffer[i - 8];
49+
return from->utcb->mr[i - 40];
4350
}
4451

52+
/* Write message register with short buffer support.
53+
* MR0-MR7: Hardware registers R4-R11 (ctx.regs[0-7])
54+
* MR8-MR39: Short message buffer (msg_buffer[0-31]) - NEW
55+
* MR40-MR47: UTCB overflow (utcb->mr[0-7])
56+
*/
4557
void ipc_write_mr(tcb_t *to, int i, uint32_t data)
4658
{
47-
if (i >= 8)
48-
to->utcb->mr[i - 8] = data;
49-
else
59+
if (i < 8)
5060
to->ctx.regs[i] = data;
61+
else if (i < 40)
62+
to->msg_buffer[i - 8] = data;
63+
else
64+
to->utcb->mr[i - 40] = data;
5165
}
5266

5367
static void user_ipc_error(tcb_t *thr, enum user_error_t error)
@@ -351,19 +365,19 @@ void sys_ipc(uint32_t *param1)
351365
} else if (to_thr && to_thr->state == T_INACTIVE &&
352366
GLOBALID_TO_TID(to_thr->utcb->t_pager) ==
353367
GLOBALID_TO_TID(caller->t_globalid)) {
354-
if (ipc_read_mr(caller, 0) == 0x00000005) {
355-
/* mr1: thread func, mr2: stack addr,
356-
* mr3: stack size
357-
* mr4: thread entry, mr5: thread args
358-
* thread start protocol */
368+
uint32_t tag = ipc_read_mr(caller, 0);
369+
if (tag == 0x00000005) {
370+
/* Thread start protocol from pager:
371+
* mr1: thread_container (wrapper), mr2: sp,
372+
* mr3: stack size, mr4: entry point, mr5: entry arg */
359373

374+
uint32_t mr1_container = ipc_read_mr(caller, 1);
360375
memptr_t sp = ipc_read_mr(caller, 2);
361376
size_t stack_size = ipc_read_mr(caller, 3);
377+
uint32_t entry_point = ipc_read_mr(caller, 4);
378+
uint32_t entry_arg = ipc_read_mr(caller, 5);
362379
uint32_t regs[4]; /* r0, r1, r2, r3 */
363380

364-
dbg_printf(DL_IPC, "IPC: %t thread start sp:%p stack_size:%p\n",
365-
to_tid, sp, stack_size);
366-
367381
/* Security check: Ensure stack is in user-writable memory */
368382
int pid = mempool_search(sp - stack_size, stack_size);
369383
mempool_t *mp = mempool_getbyid(pid);
@@ -387,10 +401,12 @@ void sys_ipc(uint32_t *param1)
387401

388402
regs[REG_R0] = (uint32_t) &kip;
389403
regs[REG_R1] = (uint32_t) to_thr->utcb;
390-
regs[REG_R2] = ipc_read_mr(caller, 4);
391-
regs[REG_R3] = ipc_read_mr(caller, 5);
392-
thread_init_ctx((void *) sp, (void *) ipc_read_mr(caller, 1),
393-
regs, to_thr);
404+
regs[REG_R2] =
405+
entry_point; /* Actual entry passed to container */
406+
regs[REG_R3] = entry_arg;
407+
408+
thread_init_ctx((void *) sp, (void *) mr1_container, regs,
409+
to_thr);
394410

395411
thread_make_runnable(caller);
396412

0 commit comments

Comments
 (0)