Implement AOT static PGO (#2243)
LLVM PGO (Profile-Guided Optimization) allows the compiler to better optimize code
based on how it actually runs. This PR implements AOT static PGO, which has been
tested on Linux x86-64 and x86-32. The basic steps are:
1. Use `wamrc --enable-llvm-pgo -o <aot_file_of_pgo> <wasm_file>`
to generate an instrumented aot file.
2. Compile iwasm with `cmake -DWAMR_BUILD_STATIC_PGO=1` and run
`iwasm --gen-prof-file=<raw_profile_file> <aot_file_of_pgo>`
to generate the raw profile file.
3. Run `llvm-profdata merge -output=<profile_file> <raw_profile_file>`
to merge the raw profile file into the profile file.
4. Run `wamrc --use-prof-file=<profile_file> -o <aot_file> <wasm_file>`
to generate the optimized aot file.
5. Run the optimized aot_file: `iwasm <aot_file>`.
Test scripts are also added for each benchmark: run `test_pgo.sh` under
each benchmark's folder to test AOT static PGO.
This commit is contained in:
@ -1430,8 +1430,28 @@ destroy_object_data_sections(AOTObjectDataSection *data_sections,
|
||||
uint32 i;
|
||||
AOTObjectDataSection *data_section = data_sections;
|
||||
for (i = 0; i < data_section_count; i++, data_section++)
|
||||
if (data_section->data)
|
||||
if (data_section->data) {
|
||||
#if WASM_ENABLE_STATIC_PGO != 0
|
||||
if (!strncmp(data_section->name, "__llvm_prf_data", 15)) {
|
||||
LLVMProfileData *data = (LLVMProfileData *)data_section->data;
|
||||
if (data->values) {
|
||||
uint32 num_value_sites =
|
||||
data->num_value_sites[0] + data->num_value_sites[1];
|
||||
uint32 j;
|
||||
for (j = 0; j < num_value_sites; j++) {
|
||||
ValueProfNode *node = data->values[j], *node_next;
|
||||
while (node) {
|
||||
node_next = node->next;
|
||||
wasm_runtime_free(node);
|
||||
node = node_next;
|
||||
}
|
||||
}
|
||||
wasm_runtime_free(data->values);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
os_munmap(data_section->data, data_section->size);
|
||||
}
|
||||
wasm_runtime_free(data_sections);
|
||||
}
|
||||
|
||||
@ -1900,6 +1920,8 @@ str2uint64(const char *buf, uint64 *p_res)
|
||||
return true;
|
||||
}
|
||||
|
||||
#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */
|
||||
|
||||
static bool
|
||||
do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
|
||||
char *error_buf, uint32 error_buf_size)
|
||||
@ -1937,6 +1959,14 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
|
||||
bh_memcpy_s(symbol, symbol_len, relocation->symbol_name, symbol_len);
|
||||
symbol[symbol_len] = '\0';
|
||||
|
||||
#if WASM_ENABLE_STATIC_PGO != 0
|
||||
if (!strcmp(symbol, "__llvm_profile_runtime")
|
||||
|| !strcmp(symbol, "__llvm_profile_register_function")
|
||||
|| !strcmp(symbol, "__llvm_profile_register_names_function")) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) {
|
||||
p = symbol + strlen(AOT_FUNC_PREFIX);
|
||||
if (*p == '\0'
|
||||
@ -1945,7 +1975,26 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
|
||||
"invalid import symbol %s", symbol);
|
||||
goto check_symbol_fail;
|
||||
}
|
||||
#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
|
||||
&& !defined(BH_PLATFORM_WINDOWS)
|
||||
if (relocation->relocation_type == R_X86_64_GOTPCREL) {
|
||||
GOTItem *got_item = module->got_item_list;
|
||||
uint32 got_item_idx = 0;
|
||||
|
||||
while (got_item) {
|
||||
if (got_item->func_idx == func_index)
|
||||
break;
|
||||
got_item_idx++;
|
||||
got_item = got_item->next;
|
||||
}
|
||||
/* Calculate `GOT + G` */
|
||||
symbol_addr = module->got_func_ptrs + got_item_idx;
|
||||
}
|
||||
else
|
||||
symbol_addr = module->func_ptrs[func_index];
|
||||
#else
|
||||
symbol_addr = module->func_ptrs[func_index];
|
||||
#endif
|
||||
}
|
||||
else if (!strcmp(symbol, ".text")) {
|
||||
symbol_addr = module->code;
|
||||
@ -1956,7 +2005,13 @@ do_text_relocation(AOTModule *module, AOTRelocationGroup *group,
|
||||
/* ".rodata.cst4/8/16/.." */
|
||||
|| !strncmp(symbol, ".rodata.cst", strlen(".rodata.cst"))
|
||||
/* ".rodata.strn.m" */
|
||||
|| !strncmp(symbol, ".rodata.str", strlen(".rodata.str"))) {
|
||||
|| !strncmp(symbol, ".rodata.str", strlen(".rodata.str"))
|
||||
#if WASM_ENABLE_STATIC_PGO != 0
|
||||
|| !strncmp(symbol, "__llvm_prf_cnts", 15)
|
||||
|| !strncmp(symbol, "__llvm_prf_data", 15)
|
||||
|| !strncmp(symbol, "__llvm_prf_names", 16)
|
||||
#endif
|
||||
) {
|
||||
symbol_addr = get_data_section_addr(module, symbol, NULL);
|
||||
if (!symbol_addr) {
|
||||
set_error_buf_v(error_buf, error_buf_size,
|
||||
@ -2088,6 +2143,14 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group,
|
||||
else if (!strcmp(group->section_name, ".rdata")) {
|
||||
data_section_name = group->section_name;
|
||||
}
|
||||
#if WASM_ENABLE_STATIC_PGO != 0
|
||||
else if (!strncmp(group->section_name, ".rel__llvm_prf_data", 19)) {
|
||||
data_section_name = group->section_name + strlen(".rel");
|
||||
}
|
||||
else if (!strncmp(group->section_name, ".rela__llvm_prf_data", 20)) {
|
||||
data_section_name = group->section_name + strlen(".rela");
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
set_error_buf(error_buf, error_buf_size,
|
||||
"invalid data relocation section name");
|
||||
@ -2107,6 +2170,49 @@ do_data_relocation(AOTModule *module, AOTRelocationGroup *group,
|
||||
if (!strcmp(symbol, ".text")) {
|
||||
symbol_addr = module->code;
|
||||
}
|
||||
#if WASM_ENABLE_STATIC_PGO != 0
|
||||
else if (!strncmp(symbol, AOT_FUNC_PREFIX, strlen(AOT_FUNC_PREFIX))) {
|
||||
char *p = symbol + strlen(AOT_FUNC_PREFIX);
|
||||
uint32 func_index;
|
||||
if (*p == '\0'
|
||||
|| (func_index = (uint32)atoi(p)) > module->func_count) {
|
||||
set_error_buf_v(error_buf, error_buf_size,
|
||||
"invalid relocation symbol %s", symbol);
|
||||
return false;
|
||||
}
|
||||
symbol_addr = module->func_ptrs[func_index];
|
||||
}
|
||||
else if (!strcmp(symbol, "__llvm_prf_cnts")) {
|
||||
uint32 j;
|
||||
for (j = 0; j < module->data_section_count; j++) {
|
||||
if (!strncmp(module->data_sections[j].name, symbol, 15)) {
|
||||
bh_assert(relocation->relocation_addend + sizeof(uint64)
|
||||
<= module->data_sections[j].size);
|
||||
symbol_addr = module->data_sections[j].data;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == module->data_section_count) {
|
||||
set_error_buf_v(error_buf, error_buf_size,
|
||||
"invalid relocation symbol %s", symbol);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (!strncmp(symbol, "__llvm_prf_cnts", 15)) {
|
||||
uint32 j;
|
||||
for (j = 0; j < module->data_section_count; j++) {
|
||||
if (!strcmp(module->data_sections[j].name, symbol)) {
|
||||
symbol_addr = module->data_sections[j].data;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == module->data_section_count) {
|
||||
set_error_buf_v(error_buf, error_buf_size,
|
||||
"invalid relocation symbol %s", symbol);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
|
||||
else {
|
||||
set_error_buf_v(error_buf, error_buf_size,
|
||||
"invalid relocation symbol %s", symbol);
|
||||
@ -2154,7 +2260,7 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
|
||||
{
|
||||
AOTRelocationGroup *groups = NULL, *group;
|
||||
uint32 symbol_count = 0;
|
||||
uint32 group_count = 0, i, j;
|
||||
uint32 group_count = 0, i, j, got_item_count = 0;
|
||||
uint64 size;
|
||||
uint32 *symbol_offsets, total_string_len;
|
||||
uint8 *symbol_buf, *symbol_buf_end;
|
||||
@ -2216,6 +2322,8 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
|
||||
|
||||
for (j = 0; j < relocation_count; j++) {
|
||||
AOTRelocation relocation = { 0 };
|
||||
char group_name_buf[128] = { 0 };
|
||||
char symbol_name_buf[128] = { 0 };
|
||||
uint32 symbol_index, offset32;
|
||||
int32 addend32;
|
||||
uint16 symbol_name_len;
|
||||
@ -2244,10 +2352,10 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
|
||||
symbol_name_len = *(uint16 *)symbol_name;
|
||||
symbol_name += sizeof(uint16);
|
||||
|
||||
char group_name_buf[128] = { 0 };
|
||||
char symbol_name_buf[128] = { 0 };
|
||||
memcpy(group_name_buf, group_name, group_name_len);
|
||||
memcpy(symbol_name_buf, symbol_name, symbol_name_len);
|
||||
bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf),
|
||||
group_name, group_name_len);
|
||||
bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf),
|
||||
symbol_name, symbol_name_len);
|
||||
|
||||
if ((group_name_len == strlen(".text")
|
||||
|| (module->is_indirect_mode
|
||||
@ -2309,6 +2417,139 @@ load_relocation_section(const uint8 *buf, const uint8 *buf_end,
|
||||
}
|
||||
#endif /* end of defined(BH_PLATFORM_WINDOWS) */
|
||||
|
||||
#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
|
||||
&& !defined(BH_PLATFORM_WINDOWS)
|
||||
buf = symbol_buf_end;
|
||||
read_uint32(buf, buf_end, group_count);
|
||||
|
||||
/* Resolve the relocations of type R_X86_64_GOTPCREL */
|
||||
for (i = 0; i < group_count; i++) {
|
||||
uint32 name_index, relocation_count;
|
||||
uint16 group_name_len;
|
||||
uint8 *group_name;
|
||||
|
||||
/* section name address is 4 bytes aligned. */
|
||||
buf = (uint8 *)align_ptr(buf, sizeof(uint32));
|
||||
read_uint32(buf, buf_end, name_index);
|
||||
|
||||
if (name_index >= symbol_count) {
|
||||
set_error_buf(error_buf, error_buf_size,
|
||||
"symbol index out of range");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
group_name = symbol_buf + symbol_offsets[name_index];
|
||||
group_name_len = *(uint16 *)group_name;
|
||||
group_name += sizeof(uint16);
|
||||
|
||||
read_uint32(buf, buf_end, relocation_count);
|
||||
|
||||
for (j = 0; j < relocation_count; j++) {
|
||||
AOTRelocation relocation = { 0 };
|
||||
char group_name_buf[128] = { 0 };
|
||||
char symbol_name_buf[128] = { 0 };
|
||||
uint32 symbol_index;
|
||||
uint16 symbol_name_len;
|
||||
uint8 *symbol_name;
|
||||
|
||||
/* relocation offset and addend */
|
||||
buf += sizeof(void *) * 2;
|
||||
|
||||
read_uint32(buf, buf_end, relocation.relocation_type);
|
||||
read_uint32(buf, buf_end, symbol_index);
|
||||
|
||||
if (symbol_index >= symbol_count) {
|
||||
set_error_buf(error_buf, error_buf_size,
|
||||
"symbol index out of range");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
symbol_name = symbol_buf + symbol_offsets[symbol_index];
|
||||
symbol_name_len = *(uint16 *)symbol_name;
|
||||
symbol_name += sizeof(uint16);
|
||||
|
||||
bh_memcpy_s(group_name_buf, (uint32)sizeof(group_name_buf),
|
||||
group_name, group_name_len);
|
||||
bh_memcpy_s(symbol_name_buf, (uint32)sizeof(symbol_name_buf),
|
||||
symbol_name, symbol_name_len);
|
||||
|
||||
if (relocation.relocation_type == R_X86_64_GOTPCREL
|
||||
&& !strncmp(symbol_name_buf, AOT_FUNC_PREFIX,
|
||||
strlen(AOT_FUNC_PREFIX))) {
|
||||
uint32 func_idx =
|
||||
atoi(symbol_name_buf + strlen(AOT_FUNC_PREFIX));
|
||||
GOTItem *got_item = module->got_item_list;
|
||||
|
||||
if (func_idx >= module->func_count) {
|
||||
set_error_buf(error_buf, error_buf_size,
|
||||
"func index out of range");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
while (got_item) {
|
||||
if (got_item->func_idx == func_idx)
|
||||
break;
|
||||
got_item = got_item->next;
|
||||
}
|
||||
|
||||
if (!got_item) {
|
||||
/* Create the got item and append to the list */
|
||||
got_item = wasm_runtime_malloc(sizeof(GOTItem));
|
||||
if (!got_item) {
|
||||
set_error_buf(error_buf, error_buf_size,
|
||||
"allocate memory failed");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
got_item->func_idx = func_idx;
|
||||
got_item->next = NULL;
|
||||
if (!module->got_item_list) {
|
||||
module->got_item_list = module->got_item_list_end =
|
||||
got_item;
|
||||
}
|
||||
else {
|
||||
module->got_item_list_end->next = got_item;
|
||||
module->got_item_list_end = got_item;
|
||||
}
|
||||
|
||||
got_item_count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (got_item_count) {
|
||||
GOTItem *got_item = module->got_item_list;
|
||||
uint32 got_item_idx = 0;
|
||||
|
||||
map_prot = MMAP_PROT_READ | MMAP_PROT_WRITE;
|
||||
/* aot code and data in x86_64 must be in range 0 to 2G due to
|
||||
relocation for R_X86_64_32/32S/PC32 */
|
||||
map_flags = MMAP_MAP_32BIT;
|
||||
|
||||
/* Create the GOT for func_ptrs, note that it is different from
|
||||
the .got section of a dynamic object file */
|
||||
size = (uint64)sizeof(void *) * got_item_count;
|
||||
if (size > UINT32_MAX
|
||||
|| !(module->got_func_ptrs =
|
||||
os_mmap(NULL, (uint32)size, map_prot, map_flags))) {
|
||||
set_error_buf(error_buf, error_buf_size, "mmap memory failed");
|
||||
goto fail;
|
||||
}
|
||||
|
||||
while (got_item) {
|
||||
module->got_func_ptrs[got_item_idx++] =
|
||||
module->func_ptrs[got_item->func_idx];
|
||||
got_item = got_item->next;
|
||||
}
|
||||
|
||||
module->got_item_count = got_item_count;
|
||||
}
|
||||
#else
|
||||
(void)got_item_count;
|
||||
#endif /* (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) && \
|
||||
!defined(BH_PLATFORM_WINDOWS) */
|
||||
|
||||
buf = symbol_buf_end;
|
||||
read_uint32(buf, buf_end, group_count);
|
||||
|
||||
@ -2994,9 +3235,27 @@ aot_unload(AOTModule *module)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)) \
|
||||
&& !defined(BH_PLATFORM_WINDOWS)
|
||||
{
|
||||
GOTItem *got_item = module->got_item_list, *got_item_next;
|
||||
|
||||
if (module->got_func_ptrs) {
|
||||
os_munmap(module->got_func_ptrs,
|
||||
sizeof(void *) * module->got_item_count);
|
||||
}
|
||||
while (got_item) {
|
||||
got_item_next = got_item->next;
|
||||
wasm_runtime_free(got_item);
|
||||
got_item = got_item_next;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (module->data_sections)
|
||||
destroy_object_data_sections(module->data_sections,
|
||||
module->data_section_count);
|
||||
|
||||
#if WASM_ENABLE_DEBUG_AOT != 0
|
||||
jit_code_entry_destroy(module->elf_hdr);
|
||||
#endif
|
||||
@ -3043,3 +3302,23 @@ aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len)
|
||||
return NULL;
|
||||
}
|
||||
#endif /* end of WASM_ENABLE_LOAD_CUSTOM_SECTION */
|
||||
|
||||
#if WASM_ENABLE_STATIC_PGO != 0
|
||||
void
|
||||
aot_exchange_uint16(uint8 *p_data)
|
||||
{
|
||||
return exchange_uint16(p_data);
|
||||
}
|
||||
|
||||
void
|
||||
aot_exchange_uint32(uint8 *p_data)
|
||||
{
|
||||
return exchange_uint32(p_data);
|
||||
}
|
||||
|
||||
void
|
||||
aot_exchange_uint64(uint8 *p_data)
|
||||
{
|
||||
return exchange_uint64(p_data);
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user