Implement AOT static PGO (#2243)
LLVM PGO (Profile-Guided Optimization) allows the compiler to better optimize code
for how it actually runs. This PR implements AOT static PGO and has been tested on
Linux x86-64 and x86-32. The basic steps are:
1. Use `wamrc --enable-llvm-pgo -o <aot_file_of_pgo> <wasm_file>`
to generate an instrumented aot file.
2. Compile iwasm with `cmake -DWAMR_BUILD_STATIC_PGO=1` and run
`iwasm --gen-prof-file=<raw_profile_file> <aot_file_of_pgo>`
to generate the raw profile file.
3. Run `llvm-profdata merge -output=<profile_file> <raw_profile_file>`
to merge the raw profile file into the profile file.
4. Run `wamrc --use-prof-file=<profile_file> -o <aot_file> <wasm_file>`
to generate the optimized aot file.
5. Run the optimized aot file: `iwasm <aot_file>`.
Test scripts are also added for each benchmark; run `test_pgo.sh` under
each benchmark's folder to test AOT static PGO.
This commit is contained in:
@ -8,6 +8,9 @@
|
||||
#define R_386_32 1 /* Direct 32 bit */
|
||||
#define R_386_PC32 2 /* PC relative 32 bit */
|
||||
#define R_386_PLT32 4 /* 32-bit address ProcedureLinkageTable */
|
||||
#define R_386_TLS_GD_32 \
|
||||
24 /* Direct 32 bit for general dynamic \
|
||||
thread local data */
|
||||
|
||||
#if !defined(_WIN32) && !defined(_WIN32_)
|
||||
/* clang-format off */
|
||||
@ -110,6 +113,9 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
|
||||
{
|
||||
switch (reloc_type) {
|
||||
case R_386_32:
|
||||
#if WASM_ENABLE_STATIC_PGO != 0
|
||||
case R_386_TLS_GD_32:
|
||||
#endif
|
||||
{
|
||||
intptr_t value;
|
||||
|
||||
|
||||
@ -6,11 +6,13 @@
|
||||
#include "aot_reloc.h"
|
||||
|
||||
#if !defined(BH_PLATFORM_WINDOWS)
|
||||
#define R_X86_64_64 1 /* Direct 64 bit */
|
||||
#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
|
||||
#define R_X86_64_PLT32 4 /* 32 bit PLT address */
|
||||
#define R_X86_64_32 10 /* Direct 32 bit zero extended */
|
||||
#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
|
||||
#define R_X86_64_64 1 /* Direct 64 bit */
|
||||
#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
|
||||
#define R_X86_64_PLT32 4 /* 32 bit PLT address */
|
||||
#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */
|
||||
#define R_X86_64_32 10 /* Direct 32 bit zero extended */
|
||||
#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
|
||||
#define R_X86_64_PC64 24 /* PC relative 64 bit */
|
||||
#else
|
||||
#ifndef IMAGE_REL_AMD64_ADDR64
|
||||
#define IMAGE_REL_AMD64_ADDR64 1 /* The 64-bit VA of the relocation target */
|
||||
@ -164,6 +166,7 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
|
||||
#endif
|
||||
#if !defined(BH_PLATFORM_WINDOWS)
|
||||
case R_X86_64_PC32:
|
||||
case R_X86_64_GOTPCREL: /* GOT + G has been calculated as symbol_addr */
|
||||
{
|
||||
intptr_t target_addr = (intptr_t) /* S + A - P */
|
||||
((uintptr_t)symbol_addr + reloc_addend
|
||||
@ -182,6 +185,16 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
|
||||
*(int32 *)(target_section_addr + reloc_offset) = (int32)target_addr;
|
||||
break;
|
||||
}
|
||||
case R_X86_64_PC64:
|
||||
{
|
||||
intptr_t target_addr = (intptr_t) /* S + A - P */
|
||||
((uintptr_t)symbol_addr + reloc_addend
|
||||
- (uintptr_t)(target_section_addr + reloc_offset));
|
||||
|
||||
CHECK_RELOC_OFFSET(sizeof(int64));
|
||||
*(int64 *)(target_section_addr + reloc_offset) = (int64)target_addr;
|
||||
break;
|
||||
}
|
||||
case R_X86_64_32:
|
||||
case R_X86_64_32S:
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user