[AOT] Memory Info Restore Mechanism with Better Performance #4113


Open · wants to merge 2 commits into main
9 changes: 9 additions & 0 deletions core/iwasm/compilation/aot_compiler.c
@@ -999,6 +999,7 @@ static bool
 aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
 {
     AOTFuncContext *func_ctx = comp_ctx->func_ctxes[func_index];
+    WASMModule *module = comp_ctx->comp_data->wasm_module;
     LLVMValueRef func_index_ref;
     uint8 *frame_ip = func_ctx->aot_func->code, opcode, *p_f32, *p_f64;
     uint8 *frame_ip_end = frame_ip + func_ctx->aot_func->code_size;
@@ -1230,6 +1231,8 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
                 read_leb_uint32(frame_ip, frame_ip_end, func_idx);
                 if (!aot_compile_op_call(comp_ctx, func_ctx, func_idx, false))
                     return false;
+                if (module->functions[func_index]->has_memory_operations)
+                    restore_memory_info(comp_ctx, func_ctx);
                 break;
             }

@@ -1250,6 +1253,8 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
                 if (!aot_compile_op_call_indirect(comp_ctx, func_ctx, type_idx,
                                                   tbl_idx))
                     return false;
+                if (module->functions[func_index]->has_memory_operations)
+                    restore_memory_info(comp_ctx, func_ctx);
                 break;
             }

@@ -1420,6 +1425,8 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
                 if (!aot_compile_op_call_ref(comp_ctx, func_ctx, type_idx,
                                              false))
                     return false;
+                if (module->functions[func_index]->has_memory_operations)
+                    restore_memory_info(comp_ctx, func_ctx);
                 break;
             }

@@ -2092,6 +2099,8 @@ aot_compile_func(AOTCompContext *comp_ctx, uint32 func_index)
                 read_leb_uint32(frame_ip, frame_ip_end, mem_idx);
                 if (!aot_compile_op_memory_grow(comp_ctx, func_ctx))
                     return false;
+                if (module->functions[func_index]->has_memory_operations)
+                    restore_memory_info(comp_ctx, func_ctx);
                 break;

             case WASM_OP_I32_CONST:
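restore_memory_info is the helper these hunks call after opcodes that may change linear memory (call, call_indirect, call_ref, memory.grow); its definition is not part of the hunks shown here. Below is a minimal sketch of what such a helper could do, assuming the mem_info fields now hold per-function alloca slots; MEM_BASE_ADDR_OFFSET is a placeholder, not the real field offset, and this is not the PR's actual code.

/* Hypothetical sketch, not the PR's implementation: after an opcode that may
 * change linear memory, reload the memory info from the AOT module instance
 * and store it back into the per-function alloca slots, so later accesses
 * observe the refreshed values while mem2reg can still promote the slots to
 * registers. MEM_BASE_ADDR_OFFSET is an assumed placeholder; error handling
 * is omitted for brevity. */
static void
restore_memory_info(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
{
    LLVMValueRef offset, field_addr, value;

    /* address of the memory-base field inside the instance */
    offset = LLVMConstInt(I32_TYPE, MEM_BASE_ADDR_OFFSET, true);
    field_addr = LLVMBuildInBoundsGEP2(comp_ctx->builder, INT8_TYPE,
                                       func_ctx->aot_inst, &offset, 1,
                                       "mem_base_addr_offset");

    /* load the current base address from the instance ... */
    value = LLVMBuildLoad2(comp_ctx->builder, OPQ_PTR_TYPE, field_addr,
                           "mem_base_addr_global");

    /* ... and refresh the alloca slot that the rest of the function reads */
    LLVMBuildStore(comp_ctx->builder, value,
                   func_ctx->mem_info[0].mem_base_addr);

    /* the same load/store pattern would be repeated for mem_data_size and
       mem_cur_page_count */
}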
1 change: 1 addition & 0 deletions core/iwasm/compilation/aot_compiler.h
@@ -648,6 +648,7 @@ set_local_gc_ref(AOTCompFrame *frame, int n, LLVMValueRef value, uint8 ref_type)
 #define SIZE_T_TYPE comp_ctx->basic_types.size_t_type
 #define MD_TYPE comp_ctx->basic_types.meta_data_type
 #define INT8_PTR_TYPE comp_ctx->basic_types.int8_ptr_type
+#define INT8_PPTR_TYPE comp_ctx->basic_types.int8_pptr_type
 #define INT16_PTR_TYPE comp_ctx->basic_types.int16_ptr_type
 #define INT32_PTR_TYPE comp_ctx->basic_types.int32_ptr_type
 #define INT64_PTR_TYPE comp_ctx->basic_types.int64_ptr_type
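The new INT8_PPTR_TYPE macro exposes a pointer-to-pointer basic type, presumably used for the slot that holds the memory base address. As an assumption (the type-setup code is not part of this diff), the underlying type would be registered next to the existing pointer types when the LLVM context is created, along these lines:

/* assumed sketch: register the int8** basic type alongside the others */
basic_types->int8_pptr_type =
    LLVMPointerType(basic_types->int8_ptr_type, 0);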
57 changes: 20 additions & 37 deletions core/iwasm/compilation/aot_emit_memory.c
@@ -78,9 +78,6 @@ get_memory_check_bound(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
             return NULL;
     }

-    if (func_ctx->mem_space_unchanged)
-        return mem_check_bound;
-
     if (!(mem_check_bound = LLVMBuildLoad2(
             comp_ctx->builder,
             (comp_ctx->pointer_size == sizeof(uint64)) ? I64_TYPE : I32_TYPE,
@@ -164,17 +161,15 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
     }

     /* Get memory base address and memory data size */
-    if (func_ctx->mem_space_unchanged
 #if WASM_ENABLE_SHARED_MEMORY != 0
-        || is_shared_memory
-#endif
-    ) {
+    if (is_shared_memory)
         mem_base_addr = func_ctx->mem_info[0].mem_base_addr;
-    }
-    else {
+    else
+#endif
+    {
         if (!(mem_base_addr = LLVMBuildLoad2(
                 comp_ctx->builder, OPQ_PTR_TYPE,
-                func_ctx->mem_info[0].mem_base_addr, "mem_base"))) {
+                func_ctx->mem_info[0].mem_base_addr, "mem_base_addr"))) {
Collaborator:
I assume the main purpose of this PR is to minimize or eliminate load instructions in memory operations. However, the changes eliminated all the fast accesses (the if branch) but retained the slow ones (the else branch). Does this actually address the original problem?

Contributor Author:

These two else branches are different. In this PR, mem_info[0].mem_base_addr comes from LLVMBuildAlloca, which allocates memory on the stack (often promoted to registers by the mem2reg optimization, or held in cache), whereas the previous version loads it from global memory.
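For illustration, here is a minimal sketch of the alloca-based scheme being described, assuming the slot is created once in the function prologue and seeded from the instance. The helper name create_memory_info_slot and the mem_base_addr_global parameter (a pointer obtained from a GEP into aot_inst) are illustrative, not the PR's exact code.

/* Hypothetical prologue sketch: create a stack slot for the memory base
   address and seed it from the instance; mem2reg later promotes the slot
   to an SSA register, removing the explicit loads/stores. */
static bool
create_memory_info_slot(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
                        LLVMValueRef mem_base_addr_global)
{
    LLVMValueRef mem_base_slot, init_value;

    /* one alloca per function, emitted in the entry block */
    if (!(mem_base_slot = LLVMBuildAlloca(comp_ctx->builder, OPQ_PTR_TYPE,
                                          "mem_base_addr")))
        return false;

    /* the initial value still comes from the instance field, as before */
    if (!(init_value = LLVMBuildLoad2(comp_ctx->builder, OPQ_PTR_TYPE,
                                      mem_base_addr_global, "mem_base_addr1")))
        return false;
    if (!LLVMBuildStore(comp_ctx->builder, init_value, mem_base_slot))
        return false;

    /* later memory accesses load from the slot instead of the instance */
    func_ctx->mem_info[0].mem_base_addr = mem_base_slot;
    return true;
}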

Collaborator:

I believe it's become more confusing.

There is a critical need to pass the base address of linear memory in various functions. Therefore, it's necessary to have some sort of global variable to hold this address.

In the original code, the process is as follows: the base address is always retrieved from the global variable, stored in a temporary variable mem_base_addr, and then this temporary variable is used to compute the final address.

%mem_base_addr_offset = getelementptr inbounds i8, ptr %aot_inst, i32 376
%mem_base_addr = load ptr, ptr %mem_base_addr_offset, align 8
;; when using
%maddr = getelementptr inbounds i8, ptr %mem_base_addr, i64 %offset1

In the PR, a new local variable, mem_base_addr, is introduced to hold the base address after obtaining it from the temporary variable mem_base_addr1. Although loading from a local variable isn't a significant issue, it does raise a small question as to why this is necessary. After all, the base address is already present in the temporary variable (if using the original design).

%mem_base_addr = alloca ptr, align 8
%mem_base_addr_offset = getelementptr inbounds i8, ptr %aot_inst, i32 376
%mem_base_addr1 = load ptr, ptr %mem_base_addr_offset, align 8
store ptr %mem_base_addr1, ptr %mem_base_addr, align 8

%mem_base_addr9 = load ptr, ptr %mem_base_addr, align 8
%maddr = getelementptr inbounds i8, ptr %mem_base_addr9, i64 %offset1

And after changes to linear memory, such as memory.grow, it is still necessary to reload values from the global variable, then from the temporary variable, and finally save them to the local variable. This doesn't appear to be more optimized, in my view.

Contributor Author:

Your thought is correct, but you have overlooked the optimization of LLVMBuildAlloca variables by mem2reg.

In this PR, mem_info[0].mem_base_addr comes from LLVMBuildAlloca, which allocates memory on the stack (often in registers due to mem2reg optimization or in cache).

For example:

define i32 @foo(i32 %x) {
entry:
  %y = alloca i32
  store i32 %x, i32* %y
  %val = load i32, i32* %y
  ret i32 %val
}

The load/store of %y can be optimized away:

define i32 @foo(i32 %x) {
entry:
  ret i32 %x
}

Similarly,

%mem_base_addr = alloca ptr, align 8
%mem_base_addr_offset = getelementptr inbounds i8, ptr %aot_inst, i32 376
%mem_base_addr1 = load ptr, ptr %mem_base_addr_offset, align 8
store ptr %mem_base_addr1, ptr %mem_base_addr, align 8

%mem_base_addr9 = load ptr, ptr %mem_base_addr, align 8
%maddr = getelementptr inbounds i8, ptr %mem_base_addr9, i64 %offset1

can be optimized to:

%mem_base_addr_offset = getelementptr inbounds i8, ptr %aot_inst, i32 376
%mem_base_addr1 = load ptr, ptr %mem_base_addr_offset, align 8

%maddr = getelementptr inbounds i8, ptr %mem_base_addr1, i64 %offset1

In this PR, if you dump the IR of substr.wasm above, you will see that the load/store operations of %mem_base_addr are also optimized away. As a result, the final outcome remains the same as in the original code.

[two screenshots comparing the generated IR are omitted]

Collaborator:

I've devised a straightforward test case to assess the enhancement. Here is the code:

(module
  (memory 5 10)

  (func $store32 (export "store32")  (param i32 i32)
    (i32.store (local.get 0) (local.get 1))
  )

  (func $load32 (export "load32")  (param i32) (result i32)
    (i32.load (local.get 0))
  )

  (func (export "load_store") (param i32 i32) (result i32)
    (local i32)

    (i32.load (local.get 0))
    (local.tee 2)
    (i32.store (local.get 1))

    (i32.load (local.get 1))
    (local.get 2)
    (i32.eq)
  )

  (func (export "load_grow_store") (param i32 i32) (result i32)
    (local i32)

    (i32.load (local.get 0))
    (local.tee 2)
    (i32.store (local.get 1))

    (memory.grow (i32.const 1))
    (drop)

    (i32.load (local.get 1))
    (local.get 2)
    (i32.eq)
  )

  (func (export "load_store_w_func") (param i32 i32) (result i32)
    (local i32)

    (local.get 0)
    (call $load32)

    (local.tee 2)
    (local.get 1)
    (call $store32)

    (i32.load (local.get 1))
    (local.get 2)
    (i32.eq)
  )
  
  (func (export "load_grow_store_w_func") (param i32 i32) (result i32)
    (local i32)

    (local.get 0)
    (call $load32)

    (local.tee 2)
    (local.get 1)
    (call $store32)

    (memory.grow (i32.const 1))
    (drop)

    (i32.load (local.get 1))
    (local.get 2)
    (i32.eq)
  )
)

I used the following options, --bounds-checks=1 --format=llvmir-opt, to create the optimized LLVM IR. --bounds-checks=1 is employed so that the noinline attribute is applied.

Several intriguing findings emerged from the comparison of the before and after scenarios:

  • Look at f0, f1, and f2. These are elementary cases involving load and store. As previously mentioned, the mem2reg optimization promotes the alloca variables, so the revised version produces no additional IR compared to the original version.
  • Now, consider f4 and f5. I believe they present the issue that this PR aims to address. Clearly, as seen in f5, there is no need to reload the memory base address after calling f1, as there is no memory growth in f4. This PR should eliminate that redundant loading. However, the modified version maintains the status quo.

🆙 If I'm mistaken, please correct me.

This leads to my confusion: if there is no difference for the basic cases and no improvement for the redundant loads, what is the rationale for this change?

Contributor Author:

This PR eliminates redundant memory info loads between multiple load/store instructions when the memory remains unchanged.

I used the following WAST example to compare the (optimized) IR generated by this PR with the original code, identifying the specific scenarios where this optimization applies. I don’t need the noinline attribute, so I only used the --format=llvmir-opt option.

(module
  (memory 5 10)

  (func (export "load_load") (param i32 i32) (result i32)
    (i32.load (local.get 0))
    (i32.load (local.get 1))
    (i32.eq)

    (memory.grow (i32.const 1))
    (drop)
  )

  (func (export "load_store") (param i32 i32)
    (i32.load (local.get 0))
    (i32.store (local.get 1))

    (memory.grow (i32.const 1))
    (drop)
  )

  (func (export "store_store") (param i32 i32)
    (i32.store (local.get 0) (i32.const 42))
    (i32.store (local.get 1) (i32.const 42))

    (memory.grow (i32.const 1))
    (drop)
  )

  (func (export "store_load") (param i32 i32) (result i32)
    (i32.store (local.get 0) (i32.const 42))
    (i32.load (local.get 1))

    (memory.grow (i32.const 1))
    (drop)
  )
)

Same IR in f0 and f1.
[screenshot of the identical IR is omitted]

In f2 and f3, the IR generated by this PR is different.
[screenshot of the differing IR is omitted]

This PR primarily optimizes the store-store and store-load scenarios. As for the load-load and load-store scenarios, I believe they were already optimized.

In your example, f0/f1 show that this PR produces no additional IR compared to the original version. The memory in f2 is unchanged. f3 is the load-store case. f4 and f5 contain two calls and one load instruction. Therefore, their IRs remain the same.

             aot_set_last_error("llvm build load failed.");
             goto fail;
         }
@@ -1015,16 +1010,11 @@ get_memory_curr_page_count(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx)
 {
     LLVMValueRef mem_size;

-    if (func_ctx->mem_space_unchanged) {
-        mem_size = func_ctx->mem_info[0].mem_cur_page_count_addr;
-    }
-    else {
-        if (!(mem_size = LLVMBuildLoad2(
-                comp_ctx->builder, I32_TYPE,
-                func_ctx->mem_info[0].mem_cur_page_count_addr, "mem_size"))) {
-            aot_set_last_error("llvm build load failed.");
-            goto fail;
-        }
+    if (!(mem_size = LLVMBuildLoad2(
+            comp_ctx->builder, I32_TYPE,
+            func_ctx->mem_info[0].mem_cur_page_count, "mem_size"))) {
+        aot_set_last_error("llvm build load failed.");
+        goto fail;
     }

     return LLVMBuildIntCast(comp_ctx->builder, mem_size,
@@ -1165,16 +1155,14 @@ check_bulk_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
 #if WASM_ENABLE_SHARED_MEMORY != 0
     bool is_shared_memory = comp_ctx->comp_data->memories[0].flags & 0x02;

-    if (func_ctx->mem_space_unchanged || is_shared_memory) {
-#else
-    if (func_ctx->mem_space_unchanged) {
-#endif
+    if (is_shared_memory)
         mem_base_addr = func_ctx->mem_info[0].mem_base_addr;
-    }
-    else {
+    else
+#endif
+    {
         if (!(mem_base_addr = LLVMBuildLoad2(
                 comp_ctx->builder, OPQ_PTR_TYPE,
-                func_ctx->mem_info[0].mem_base_addr, "mem_base"))) {
+                func_ctx->mem_info[0].mem_base_addr, "mem_base_addr"))) {
             aot_set_last_error("llvm build load failed.");
             goto fail;
         }
@@ -1206,16 +1194,11 @@ check_bulk_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx,
         }
     }

-    if (func_ctx->mem_space_unchanged) {
-        mem_size = func_ctx->mem_info[0].mem_data_size_addr;
-    }
-    else {
-        if (!(mem_size = LLVMBuildLoad2(
-                comp_ctx->builder, I64_TYPE,
-                func_ctx->mem_info[0].mem_data_size_addr, "mem_size"))) {
-            aot_set_last_error("llvm build load failed.");
-            goto fail;
-        }
+    if (!(mem_size = LLVMBuildLoad2(
+            comp_ctx->builder, I64_TYPE,
+            func_ctx->mem_info[0].mem_data_size, "mem_size"))) {
+        aot_set_last_error("llvm build load failed.");
+        goto fail;
     }

     ADD_BASIC_BLOCK(check_succ, "check_succ");