Skip to content

Commit 4023585

Browse files
committed
perf: Optimize memory prefetch strategy by replacing prefetcht2 with prefetchnta
1 parent 8f0277d commit 4023585

File tree

2 files changed

+10
-10
lines changed

2 files changed

+10
-10
lines changed

src/decoder.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,14 @@ impl Decoder {
105105
let instruction_cache_key = {
106106
// according to RISC-V instruction encoding, the lowest bit in PC will always be zero
107107
let pc = pc >> 1;
108-
// Here we try to balance between local code and remote code. At times,
109-
// we can find the code jumping to a remote function (e.g., memcpy or
110-
// alloc), then resumes execution at a local location. Previous cache
111-
// key only optimizes for local operations, while this new cache key
112-
// balances the code between an 8192-byte local region, and certain remote
113-
// code region. Notice the value 12 and 8 here are chosen by empirical
114-
// evidence.
115-
((pc & 0xFF) | (pc >> 12 << 8)) as usize % INSTRUCTION_CACHE_SIZE
108+
// This indexing strategy optimizes instruction cache utilization by improving the distribution of addresses.
109+
// - `pc >> 5`: Incorporates higher bits to ensure a more even spread across cache indices.
110+
// - `pc << 1`: Shifts the PC (already shifted right by one above) left by one bit, moving lower-bit information into higher positions.
111+
// - `^` (XOR): Mixes the two shifted values so that both contribute to the index, reducing cache conflicts and improving hit rates.
112+
//
113+
// This approach helps balance cache efficiency between local execution and remote function calls,
114+
// reducing hotspots and improving overall performance.
115+
((pc >> 5) ^ (pc << 1)) as usize % INSTRUCTION_CACHE_SIZE
116116
};
117117
let cached_instruction = self.instructions_cache[instruction_cache_key];
118118
if cached_instruction.0 == pc {

src/machine/asm/execute_x64.S

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ ckb_vm_x64_execute:
458458
andq CKB_VM_ASM_INVOKE_DATA_OFFSET_FIXED_TRACE_MASK(INVOKE_DATA), %rax
459459
imul $CKB_VM_ASM_FIXED_TRACE_STRUCT_SIZE, %eax
460460
movq CKB_VM_ASM_INVOKE_DATA_OFFSET_FIXED_TRACES(INVOKE_DATA), %rdx
461-
prefetcht2 0(%rdx, %rax)
461+
prefetchnta 0(%rdx, %rax)
462462
lea CKB_VM_ASM_TRACE_OFFSET_THREADS(TRACE), INST_PC
463463
mov INST_PC, INST_ARGS
464464
add $8, INST_ARGS
@@ -468,7 +468,7 @@ ckb_vm_x64_execute:
468468
/* Load current instruction as the full trace address */
469469
movq -16(INST_ARGS), TRACE
470470
/* Prefetch trace info for the consecutive block */
471-
prefetcht2 CKB_VM_ASM_TRACE_OFFSET_THREADS(TRACE)
471+
prefetchnta CKB_VM_ASM_TRACE_OFFSET_THREADS(TRACE)
472472
mov CKB_VM_ASM_TRACE_OFFSET_LENGTH(TRACE), %edx
473473
movq CKB_VM_ASM_ASM_CORE_MACHINE_OFFSET_CYCLES(MACHINE), %rax
474474
addq CKB_VM_ASM_TRACE_OFFSET_CYCLES(TRACE), %rax

0 commit comments

Comments
 (0)