Implement AOT static PGO (#2243)

LLVM PGO (Profile-Guided Optimization) allows the compiler to better optimize code
for how it actually runs. This PR implements AOT static PGO, which has been tested on
Linux x86-64 and x86-32. The basic steps are:

1. Use `wamrc --enable-llvm-pgo -o <aot_file_of_pgo> <wasm_file>`
   to generate an instrumented aot file.
2. Compile iwasm with `cmake -DWAMR_BUILD_STATIC_PGO=1` and run
      `iwasm --gen-prof-file=<raw_profile_file> <aot_file_of_pgo>`
    to generate the raw profile file.
3. Run `llvm-profdata merge -output=<profile_file> <raw_profile_file>`
    to merge the raw profile file into the profile file.
4. Run `wamrc --use-prof-file=<profile_file> -o <aot_file> <wasm_file>`
    to generate the optimized aot file.
5. Run the optimized aot_file: `iwasm <aot_file>`.

Test scripts are also added for each benchmark; run `test_pgo.sh` under
each benchmark's folder to test AOT static PGO.
This commit is contained in:
Wenyong Huang
2023-06-05 09:17:39 +08:00
committed by GitHub
parent f1e9029ebc
commit 8d88471c46
29 changed files with 2000 additions and 53 deletions

View File

@ -41,6 +41,10 @@ typedef struct AOTObjectDataSection {
char *name;
uint8 *data;
uint32 size;
#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
bool is_name_allocated;
bool is_data_allocated;
#endif
} AOTObjectDataSection;
/* Relocation info */
@ -51,6 +55,9 @@ typedef struct AOTRelocation {
char *symbol_name;
/* index in the symbol offset field */
uint32 symbol_index;
#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
bool is_symbol_name_allocated;
#endif
} AOTRelocation;
/* Relocation Group */
@ -60,6 +67,9 @@ typedef struct AOTRelocationGroup {
uint32 name_index;
uint32 relocation_count;
AOTRelocation *relocations;
#if WASM_ENABLE_WAMR_COMPILER != 0 || WASM_ENABLE_JIT != 0
bool is_section_name_allocated;
#endif
} AOTRelocationGroup;
/* AOT function instance */
@ -108,6 +118,13 @@ typedef struct AOTUnwindInfo {
#define PLT_ITEM_SIZE 12
#endif
#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
/* Singly linked list node holding one function index; only defined for
   x86-64/AMD64 build targets.
   NOTE(review): "GOT" presumably stands for Global Offset Table, i.e. one
   node per function that needs a GOT entry -- confirm in the loader. */
typedef struct GOTItem {
/* function index recorded by this item */
uint32 func_idx;
/* next item in the list */
struct GOTItem *next;
} GOTItem, *GOTItemList;
#endif
typedef struct AOTModule {
uint32 module_type;
@ -204,6 +221,13 @@ typedef struct AOTModule {
bool rtl_func_table_registered;
#endif
#if defined(BUILD_TARGET_X86_64) || defined(BUILD_TARGET_AMD_64)
uint32 got_item_count;
GOTItemList got_item_list;
GOTItemList got_item_list_end;
void **got_func_ptrs;
#endif
/* data sections in AOT object file, including .data, .rodata
and .rodata.cstN. */
AOTObjectDataSection *data_sections;
@ -294,6 +318,54 @@ typedef struct AOTFrame {
#endif
} AOTFrame;
#if WASM_ENABLE_STATIC_PGO != 0
/* Header of the raw profile data, consisting of eleven 64-bit fields.
   NOTE(review): the field names and order presumably mirror the raw
   profile header consumed by llvm-profdata -- confirm against the
   InstrProfData.inc of the LLVM version in use before changing
   anything here. */
typedef struct LLVMProfileRawHeader {
uint64 magic;
uint64 version;
uint64 binary_ids_size;
/* presumably the number of per-function profile data records */
uint64 num_prof_data;
uint64 padding_bytes_before_counters;
/* presumably the total number of counters */
uint64 num_prof_counters;
uint64 padding_bytes_after_counters;
/* presumably the size in bytes of the names area */
uint64 names_size;
uint64 counters_delta;
uint64 names_delta;
uint64 value_kind_last;
} LLVMProfileRawHeader;
/* Node of a singly linked list that pairs a 64-bit value with a
   64-bit count.
   NOTE(review): presumably one node per distinct value observed at a
   value-profiling site -- confirm in the implementation. */
typedef struct ValueProfNode {
/* the value this node stands for */
uint64 value;
/* the count associated with the value */
uint64 count;
/* next node in the list */
struct ValueProfNode *next;
} ValueProfNode;
/* The profiling data stored in the data sections created by the aot
   compiler and used while profiling; the pointer width can be 8 bytes
   (64-bit) or 4 bytes (32-bit), so the size of this struct depends on
   the target. See LLVMProfileData_64 for the fixed-width variant. */
typedef struct LLVMProfileData {
uint64 func_md5;
uint64 func_hash;
uint64 offset_counters;
/* pointer-sized, so 4 or 8 bytes depending on the target */
uintptr_t func_ptr;
/* pointer-sized; NOTE(review): presumably an array of ValueProfNode
   list heads, one per value site -- confirm in the implementation */
ValueProfNode **values;
uint32 num_counters;
/* NOTE(review): two entries, presumably one per value kind (indirect
   call target and memop size) -- confirm */
uint16 num_value_sites[2];
} LLVMProfileData;
/* The profiling data as written to the output file; pointers are
   always stored as 8 bytes, on the assumption that wamrc and
   llvm-profdata are always used in 64-bit mode. The fields match
   LLVMProfileData, with func_ptr and values widened to uint64. */
typedef struct LLVMProfileData_64 {
uint64 func_md5;
uint64 func_hash;
uint64 offset_counters;
/* 64-bit slot for the pointer-sized LLVMProfileData.func_ptr */
uint64 func_ptr;
/* 64-bit slot for the pointer-sized LLVMProfileData.values */
uint64 values;
uint32 num_counters;
uint16 num_value_sites[2];
} LLVMProfileData_64;
#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
/**
* Load a AOT module from aot file buffer
* @param buf the byte buffer which contains the AOT file data
@ -564,6 +636,32 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst);
const uint8 *
aot_get_custom_section(const AOTModule *module, const char *name, uint32 *len);
#if WASM_ENABLE_STATIC_PGO != 0
/**
 * Value-profiling hook, available only when WASM_ENABLE_STATIC_PGO != 0.
 * NOTE(review): presumably invoked by instrumented AOT code to record
 * a profiled value (e.g. an indirect call target) -- confirm against
 * the implementation.
 *
 * @param target_value the 64-bit value to record
 * @param data opaque pointer; presumably the LLVMProfileData of the
 *        enclosing function -- confirm
 * @param counter_idx presumably the index of the value site -- confirm
 */
void
llvm_profile_instrument_target(uint64 target_value, void *data,
uint32 counter_idx);
/**
 * Value-profiling hook, available only when WASM_ENABLE_STATIC_PGO != 0.
 * NOTE(review): presumably invoked by instrumented AOT code to record
 * the size operand of a memory operation -- confirm against the
 * implementation.
 *
 * @param target_value the 64-bit value to record
 * @param data opaque pointer; presumably the LLVMProfileData of the
 *        enclosing function -- confirm
 * @param counter_idx presumably the index of the value site -- confirm
 */
void
llvm_profile_instrument_memop(uint64 target_value, void *data,
uint32 counter_idx);
/**
 * Get the size of the PGO profile data of an AOT module instance.
 * NOTE(review): presumably the byte count that
 * aot_dump_pgo_prof_data_to_buf() needs for its buffer -- confirm.
 *
 * @param module_inst the AOT module instance
 *
 * @return the size as a uint32; the unit and the value returned on
 *         failure are not visible here -- check the implementation
 */
uint32
aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst);
/**
 * Dump the PGO profile data of an AOT module instance into a
 * caller-provided buffer.
 * NOTE(review): presumably produces the raw profile content that is
 * later fed to `llvm-profdata merge` -- confirm.
 *
 * @param module_inst the AOT module instance
 * @param buf the destination buffer
 * @param len the length of buf; presumably in bytes -- confirm
 *
 * @return a uint32; presumably the number of bytes written, with the
 *         failure value to be checked in the implementation
 */
uint32
aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf,
uint32 len);
/**
 * NOTE(review): presumably swaps the byte order of the 16-bit value at
 * p_data in place -- confirm against the implementation.
 *
 * @param p_data pointer to the bytes of the value to exchange
 */
void
aot_exchange_uint16(uint8 *p_data);
/**
 * NOTE(review): presumably swaps the byte order of the 32-bit value at
 * p_data in place -- confirm against the implementation.
 *
 * @param p_data pointer to the bytes of the value to exchange
 */
void
aot_exchange_uint32(uint8 *p_data);
/**
 * NOTE(review): presumably swaps the byte order of the 64-bit value at
 * p_data in place -- confirm against the implementation.
 *
 * @param p_data pointer to the bytes of the value to exchange
 */
void
aot_exchange_uint64(uint8 *p_data);
#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */
#ifdef __cplusplus
} /* end of extern "C" */
#endif