Skip to content

Implement redundant load/store elimination in peephole optimizer #289

@jserv

Description

@jserv

Description

Consecutive load/store operations often contain redundancies that can be eliminated at the instruction level. This peephole optimization can significantly reduce memory traffic and improve performance, especially important for shecc's ARM and RISC-V targets.

Current State

The peephole optimizer in src/peephole.c currently has minimal patterns. The file is only ~100 lines and handles basic cases. Extending it with load/store elimination would significantly improve code quality.

Patterns to Optimize

1. Store followed by Load (Same Location)

# ARM - Before
str r0, [sp, #4]
ldr r1, [sp, #4]  ; Loading what was just stored

# ARM - After
str r0, [sp, #4]
mov r1, r0        ; Use register copy instead

# RISC-V - Before
sw a0, 4(sp)
lw a1, 4(sp)      ; Redundant load

# RISC-V - After  
sw a0, 4(sp)
mv a1, a0         ; Register move

2. Redundant Stores

# ARM - Before
str r0, [sp, #4]
str r1, [sp, #4]  ; Overwrites previous store

# ARM - After
str r1, [sp, #4]  ; Only keep last store

# RISC-V - Before
sw a0, 4(sp)
sw a1, 4(sp)      ; Dead store

# RISC-V - After
sw a1, 4(sp)      ; Eliminate first store

3. Load after Load (Same Location)

# ARM - Before
ldr r0, [sp, #4]
ldr r1, [sp, #4]  ; Same location

# ARM - After
ldr r0, [sp, #4]
mov r1, r0        ; Copy register

# RISC-V - Before
lw a0, 4(sp)
lw a1, 4(sp)      ; Redundant

# RISC-V - After
lw a0, 4(sp)
mv a1, a0         ; Register copy

4. Dead Store Elimination

# Before
str r0, [sp, #4]  ; Value never read
add sp, sp, #8    ; Stack adjusted, making store dead

# After
add sp, sp, #8    ; Store eliminated

5. Load/Store Forwarding

# Before
str r0, [r1, #0]
ldr r2, [r1, #0]  ; Same address in different registers

# After
str r0, [r1, #0]
mov r2, r0        ; Forward stored value

Implementation in shecc

1. Integration Point

Extend src/peephole.c with new patterns:

// Add to existing peephole_optimization() function
void peephole_optimization(insn_t *insns, int count) {
    // Existing optimizations...
    
    // NEW: Add load/store elimination
    eliminate_redundant_memory_ops(insns, count);
}

2. Memory Location Tracking

typedef struct {
    enum { MEM_STACK, MEM_GLOBAL, MEM_HEAP } type;
    int base_reg;      // sp, gp, or general register
    int offset;
    int size;          // 1, 2, 4, or 8 bytes
} mem_location_t;

typedef struct {
    mem_location_t loc;
    int value_reg;     // Register containing the value
    int insn_idx;      // Instruction index
    bool is_valid;
} mem_state_t;

// Track last 8 memory operations (tunable)
#define MEM_TRACK_SIZE 8
typedef struct {
    mem_state_t stores[MEM_TRACK_SIZE];
    mem_state_t loads[MEM_TRACK_SIZE];
    int store_count;
    int load_count;
} mem_tracker_t;

3. Pattern Detection Functions

bool same_memory_location(mem_location_t *loc1, mem_location_t *loc2) {
    return loc1->type == loc2->type &&
           loc1->base_reg == loc2->base_reg &&
           loc1->offset == loc2->offset &&
           loc1->size == loc2->size;
}

mem_location_t extract_location(insn_t *insn) {
    mem_location_t loc = {0};
    
    // ARM addressing modes
    if (insn->addressing_mode == ADDR_OFFSET) {
        loc.base_reg = insn->base_reg;
        loc.offset = insn->offset;
    }
    // RISC-V addressing
    else if (insn->addressing_mode == ADDR_DISPLACEMENT) {
        loc.base_reg = insn->rs1;  // Base register
        loc.offset = insn->imm;     // Immediate offset
    }
    
    // Determine type based on base register
    if (loc.base_reg == REG_SP) {
        loc.type = MEM_STACK;
    } else if (loc.base_reg == REG_GP) {
        loc.type = MEM_GLOBAL;
    } else {
        loc.type = MEM_HEAP;
    }
    
    loc.size = insn->size;
    return loc;
}

4. Optimization Implementation

void eliminate_redundant_memory_ops(insn_t *insns, int count) {
    mem_tracker_t tracker = {0};
    
    for (int i = 0; i < count; i++) {
        insn_t *insn = &insns[i];
        
        switch (insn->op) {
        case OP_STORE:
            handle_store(&tracker, insn, i);
            break;
            
        case OP_LOAD:
            handle_load(&tracker, insn, i);
            break;
            
        case OP_CALL:
        case OP_BRANCH:
            // Conservative: invalidate memory state
            invalidate_tracker(&tracker);
            break;
            
        default:
            // Check if instruction modifies tracked registers
            update_register_state(&tracker, insn);
            break;
        }
    }
}

void handle_store(mem_tracker_t *tracker, insn_t *insn, int idx) {
    mem_location_t loc = extract_location(insn);
    
    // Check for dead store (overwritten before read)
    for (int i = 0; i < tracker->store_count; i++) {
        if (same_memory_location(&loc, &tracker->stores[i].loc)) {
            // Previous store is dead
            mark_for_deletion(&insns[tracker->stores[i].insn_idx]);
            tracker->stores[i].is_valid = false;
        }
    }
    
    // Add new store to tracker
    add_store_to_tracker(tracker, loc, insn->rs, idx);
}

void handle_load(mem_tracker_t *tracker, insn_t *insn, int idx) {
    mem_location_t loc = extract_location(insn);
    
    // Check if we can forward from recent store
    for (int i = tracker->store_count - 1; i >= 0; i--) {
        if (tracker->stores[i].is_valid &&
            same_memory_location(&loc, &tracker->stores[i].loc)) {
            
            // Check if stored value register is still valid
            if (is_register_unchanged(tracker->stores[i].value_reg,
                                     tracker->stores[i].insn_idx, idx)) {
                // Replace load with register move
                convert_to_move(insn, tracker->stores[i].value_reg);
                return;
            }
        }
    }
    
    // Check for redundant load
    for (int i = tracker->load_count - 1; i >= 0; i--) {
        if (tracker->loads[i].is_valid &&
            same_memory_location(&loc, &tracker->loads[i].loc)) {
            
            if (is_register_unchanged(tracker->loads[i].value_reg,
                                     tracker->loads[i].insn_idx, idx)) {
                // Replace with register copy
                convert_to_move(insn, tracker->loads[i].value_reg);
                return;
            }
        }
    }
    
    // Add load to tracker
    add_load_to_tracker(tracker, loc, insn->rd, idx);
}

void convert_to_move(insn_t *insn, int src_reg) {
    // Convert load to register move
    insn->op = OP_MOV;
    insn->rs = src_reg;
    insn->addressing_mode = ADDR_NONE;
    // Clear memory-related fields
    insn->base_reg = -1;
    insn->offset = 0;
}

5. Safety Checks

bool is_register_unchanged(int reg, int from_idx, int to_idx) {
    for (int i = from_idx + 1; i < to_idx; i++) {
        if (insns[i].rd == reg) {
            return false;  // Register modified
        }
    }
    return true;
}

bool may_alias(mem_location_t *loc1, mem_location_t *loc2) {
    // Conservative aliasing check
    if (loc1->type != loc2->type) {
        return false;  // Different memory types don't alias
    }
    
    if (loc1->type == MEM_STACK) {
        // Stack locations with different offsets don't alias
        return loc1->offset == loc2->offset;
    }
    
    // Conservative: assume heap/global may alias
    return true;
}

void invalidate_tracker(mem_tracker_t *tracker) {
    // Function calls may modify memory
    for (int i = 0; i < tracker->store_count; i++) {
        tracker->stores[i].is_valid = false;
    }
    for (int i = 0; i < tracker->load_count; i++) {
        tracker->loads[i].is_valid = false;
    }
}

Test Cases

// tests/test_load_store_elim.c

// Store-load forwarding
int test_store_load_forward() {
    int x = 5;
    int y = x;  // Should not generate load after store
    return y;
}

// Dead store elimination
int test_dead_store() {
    int x = 5;  // Dead store if x not used
    x = 10;     // This overwrites previous
    return x;
}

// Load-load elimination
int test_redundant_load(int *p) {
    int a = *p;
    int b = *p;  // Should reuse first load
    return a + b;
}

// Complex pattern
int test_complex_pattern() {
    int arr[10];
    arr[0] = 5;
    int x = arr[0];  // Should forward value 5
    arr[1] = 10;
    int y = arr[1];  // Should forward value 10
    int z = arr[0];  // Should still forward 5 if unchanged
    return x + y + z;
}

// Should NOT optimize (aliasing)
int test_aliasing(int *p, int *q) {
    *p = 5;
    *q = 10;  // May alias with p
    return *p;  // Cannot forward - may have been modified
}

Architecture-Specific Benefits

ARM (ARMv7-A)

  • Critical due to limited registers (12 allocatable)
  • Load/store multiple instructions can be optimized
  • Reduces memory bandwidth pressure
  • Thumb-2 code density improvement

RISC-V (RV32IM)

  • More registers available (31) but still beneficial
  • Reduces memory instructions
  • Enables compressed instruction usage
  • Better for embedded systems with slow memory

Expected Performance Impact

  • 10-30% reduction in memory operations
  • 5-15% speedup for memory-bound code
  • Reduced cache pressure
  • Lower power consumption (fewer memory accesses)

Files to Modify

  • src/peephole.c: Add memory operation elimination
  • src/defs.h: Add definitions for memory tracking structures
  • src/arm-codegen.c: Ensure proper instruction encoding
  • src/riscv-codegen.c: Ensure proper instruction encoding

Success Criteria

  • 20% reduction in load/store instructions on average
  • No incorrect optimizations (all tests pass)
  • Bootstrap compilation succeeds
  • Measurable performance improvement on benchmarks
  • Correct handling of aliasing and side effects

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancementNew feature or request

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions