-
Notifications
You must be signed in to change notification settings - Fork 704
[AOT] Memory Info Restore Mechanism with Better Performance #4113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,9 +78,6 @@ get_memory_check_bound(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, | |
return NULL; | ||
} | ||
|
||
if (func_ctx->mem_space_unchanged) | ||
return mem_check_bound; | ||
|
||
if (!(mem_check_bound = LLVMBuildLoad2( | ||
comp_ctx->builder, | ||
(comp_ctx->pointer_size == sizeof(uint64)) ? I64_TYPE : I32_TYPE, | ||
|
@@ -164,17 +161,15 @@ aot_check_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, | |
} | ||
|
||
/* Get memory base address and memory data size */ | ||
if (func_ctx->mem_space_unchanged | ||
#if WASM_ENABLE_SHARED_MEMORY != 0 | ||
|| is_shared_memory | ||
#endif | ||
) { | ||
if (is_shared_memory) | ||
mem_base_addr = func_ctx->mem_info[0].mem_base_addr; | ||
} | ||
else { | ||
else | ||
#endif | ||
{ | ||
if (!(mem_base_addr = LLVMBuildLoad2( | ||
comp_ctx->builder, OPQ_PTR_TYPE, | ||
func_ctx->mem_info[0].mem_base_addr, "mem_base"))) { | ||
func_ctx->mem_info[0].mem_base_addr, "mem_base_addr"))) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume the main purpose of this PR is to minimize or eliminate There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe it's become more confusing. There is a critical need to pass the base address of linear memory in various functions. Therefore, it's necessary to have some sort of global variable to hold this address. In the original code, the process is as follows: the base address is always retrieved from the global variable, stored in a temporary variable
In the PR, a new local variable,
And after changes to linear memory, such as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your thought is correct, but you have overlooked the optimization of
For example:
The
Similarly,
can be optimized to:
In this PR, if you dump the IR of substr.wasm above, you will see that the load/store operations of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've devised a straightforward test case to assess the enhancement. Here is the code: (module
(memory 5 10)
(func $store32 (export "store32") (param i32 i32)
(i32.store (local.get 0) (local.get 1))
)
(func $load32 (export "load32") (param i32) (result i32)
(i32.load (local.get 0))
)
(func (export "load_store") (param i32 i32) (result i32)
(local i32)
(i32.load (local.get 0))
(local.tee 2)
(i32.store (local.get 1))
(i32.load (local.get 1))
(local.get 2)
(i32.eq)
)
(func (export "load_grow_store") (param i32 i32) (result i32)
(local i32)
(i32.load (local.get 0))
(local.tee 2)
(i32.store (local.get 1))
(memory.grow (i32.const 1))
(drop)
(i32.load (local.get 1))
(local.get 2)
(i32.eq)
)
(func (export "load_store_w_func") (param i32 i32) (result i32)
(local i32)
(local.get 0)
(call $load32)
(local.tee 2)
(local.get 1)
(call $store32)
(i32.load (local.get 1))
(local.get 2)
(i32.eq)
)
(func (export "load_grow_store_w_func") (param i32 i32) (result i32)
(local i32)
(local.get 0)
(call $load32)
(local.tee 2)
(local.get 1)
(call $store32)
(memory.grow (i32.const 1))
(drop)
(i32.load (local.get 1))
(local.get 2)
(i32.eq)
)
) And I used the following command: Several intriguing findings emerged from the comparison of the before and after scenarios:
🆙 If I'm mistaken, please correct me. This leads to my confusion: if there is no difference for basic cases and no enhancement for redundant loading, what is the rationale for changing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This PR is to eliminate redundant memory info loads between multiple load/store instructions when the memory remains unchanged. I use the following WAST example to compare the (optimized) IR generated by this PR with the original code, identifying specific scenarios where this optimization applies. I don’t need the (module
(memory 5 10)
(func (export "load_load") (param i32 i32) (result i32)
(i32.load (local.get 0))
(i32.load (local.get 1))
(i32.eq)
(memory.grow (i32.const 1))
(drop)
)
(func (export "load_store") (param i32 i32)
(i32.load (local.get 0))
(i32.store (local.get 1))
(memory.grow (i32.const 1))
(drop)
)
(func (export "store_store") (param i32 i32)
(i32.store (local.get 0) (i32.const 42))
(i32.store (local.get 1) (i32.const 42))
(memory.grow (i32.const 1))
(drop)
)
(func (export "store_load") (param i32 i32) (result i32)
(i32.store (local.get 0) (i32.const 42))
(i32.load (local.get 1))
(memory.grow (i32.const 1))
(drop)
)
) In f3 and f4, the IR generated by this PR is different. This PR primarily optimizes store-store and store-load scenarios. As for load-load and load-store scenarios, I believe they might have already been optimized. In your example, f0/f1 shows this PR won't produce additional IR compared to the original version. The memory in f2 is unchanged. f3 is the load-store case. f4 and f5 contain two calls and one load instruction. Therefore, their IRs remain the same. |
||
aot_set_last_error("llvm build load failed."); | ||
goto fail; | ||
} | ||
|
@@ -1015,16 +1010,11 @@ get_memory_curr_page_count(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx) | |
{ | ||
LLVMValueRef mem_size; | ||
|
||
if (func_ctx->mem_space_unchanged) { | ||
mem_size = func_ctx->mem_info[0].mem_cur_page_count_addr; | ||
} | ||
else { | ||
if (!(mem_size = LLVMBuildLoad2( | ||
comp_ctx->builder, I32_TYPE, | ||
func_ctx->mem_info[0].mem_cur_page_count_addr, "mem_size"))) { | ||
aot_set_last_error("llvm build load failed."); | ||
goto fail; | ||
} | ||
if (!(mem_size = LLVMBuildLoad2( | ||
comp_ctx->builder, I32_TYPE, | ||
func_ctx->mem_info[0].mem_cur_page_count, "mem_size"))) { | ||
aot_set_last_error("llvm build load failed."); | ||
goto fail; | ||
} | ||
|
||
return LLVMBuildIntCast(comp_ctx->builder, mem_size, | ||
|
@@ -1165,16 +1155,14 @@ check_bulk_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, | |
#if WASM_ENABLE_SHARED_MEMORY != 0 | ||
bool is_shared_memory = comp_ctx->comp_data->memories[0].flags & 0x02; | ||
|
||
if (func_ctx->mem_space_unchanged || is_shared_memory) { | ||
#else | ||
if (func_ctx->mem_space_unchanged) { | ||
#endif | ||
if (is_shared_memory) | ||
mem_base_addr = func_ctx->mem_info[0].mem_base_addr; | ||
} | ||
else { | ||
else | ||
#endif | ||
{ | ||
if (!(mem_base_addr = LLVMBuildLoad2( | ||
comp_ctx->builder, OPQ_PTR_TYPE, | ||
func_ctx->mem_info[0].mem_base_addr, "mem_base"))) { | ||
func_ctx->mem_info[0].mem_base_addr, "mem_base_addr"))) { | ||
aot_set_last_error("llvm build load failed."); | ||
goto fail; | ||
} | ||
|
@@ -1206,16 +1194,11 @@ check_bulk_memory_overflow(AOTCompContext *comp_ctx, AOTFuncContext *func_ctx, | |
} | ||
} | ||
|
||
if (func_ctx->mem_space_unchanged) { | ||
mem_size = func_ctx->mem_info[0].mem_data_size_addr; | ||
} | ||
else { | ||
if (!(mem_size = LLVMBuildLoad2( | ||
comp_ctx->builder, I64_TYPE, | ||
func_ctx->mem_info[0].mem_data_size_addr, "mem_size"))) { | ||
aot_set_last_error("llvm build load failed."); | ||
goto fail; | ||
} | ||
if (!(mem_size = LLVMBuildLoad2( | ||
comp_ctx->builder, I64_TYPE, | ||
func_ctx->mem_info[0].mem_data_size, "mem_size"))) { | ||
aot_set_last_error("llvm build load failed."); | ||
goto fail; | ||
} | ||
|
||
ADD_BASIC_BLOCK(check_succ, "check_succ"); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe we can defer this until the next br/load/store?