Implement AOT static PGO (#2243)

LLVM PGO (Profile-Guided Optimization) allows the compiler to better optimize code
for how it actually runs. This PR implements AOT static PGO, which has been tested
on Linux x86-64 and x86-32. The basic steps are:

1. Run `wamrc --enable-llvm-pgo -o <aot_file_of_pgo> <wasm_file>`
   to generate an instrumented AOT file.
2. Compile iwasm with `cmake -DWAMR_BUILD_STATIC_PGO=1` and run
   `iwasm --gen-prof-file=<raw_profile_file> <aot_file_of_pgo>`
   to generate the raw profile file.
3. Run `llvm-profdata merge -output=<profile_file> <raw_profile_file>`
   to merge the raw profile file into the profile file.
4. Run `wamrc --use-prof-file=<profile_file> -o <aot_file> <wasm_file>`
   to generate the optimized AOT file.
5. Run the optimized AOT file: `iwasm <aot_file>`.

Test scripts are also added for each benchmark; run `test_pgo.sh` in
each benchmark's folder to test AOT static PGO.
This commit is contained in:
Wenyong Huang
2023-06-05 09:17:39 +08:00
committed by GitHub
parent f1e9029ebc
commit 8d88471c46
29 changed files with 2000 additions and 53 deletions

View File

@ -8,6 +8,9 @@
/* i386 relocation type codes matched by the switch in apply_relocation().
 * R_386_TLS_GD_32 shares the R_386_32 case there, and its case label is only
 * compiled in when WASM_ENABLE_STATIC_PGO != 0. */
#define R_386_32 1 /* Direct 32 bit */
#define R_386_PC32 2 /* PC relative 32 bit */
#define R_386_PLT32 4 /* 32-bit address ProcedureLinkageTable */
#define R_386_TLS_GD_32 \
24 /* Direct 32 bit for general dynamic \
thread local data */
#if !defined(_WIN32) && !defined(_WIN32_)
/* clang-format off */
@ -110,6 +113,9 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
{
switch (reloc_type) {
case R_386_32:
#if WASM_ENABLE_STATIC_PGO != 0
case R_386_TLS_GD_32:
#endif
{
intptr_t value;

View File

@ -6,11 +6,13 @@
#include "aot_reloc.h"
#if !defined(BH_PLATFORM_WINDOWS)
/* x86-64 relocation type codes (ELF naming), defined only when
 * BH_PLATFORM_WINDOWS is not defined.
 * NOTE(review): the first five macros are repeated below with identical
 * values; this looks like the pre- and post-change sides of a diff rendered
 * together -- confirm against the real file. R_X86_64_GOTPCREL and
 * R_X86_64_PC64 appear only in the second group. */
#define R_X86_64_64 1 /* Direct 64 bit */
#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
#define R_X86_64_PLT32 4 /* 32 bit PLT address */
#define R_X86_64_32 10 /* Direct 32 bit zero extended */
#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
#define R_X86_64_64 1 /* Direct 64 bit */
#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
#define R_X86_64_PLT32 4 /* 32 bit PLT address */
#define R_X86_64_GOTPCREL 9 /* 32 bit signed PC relative offset to GOT */
#define R_X86_64_32 10 /* Direct 32 bit zero extended */
#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
#define R_X86_64_PC64 24 /* PC relative 64 bit */
#else
#ifndef IMAGE_REL_AMD64_ADDR64
#define IMAGE_REL_AMD64_ADDR64 1 /* The 64-bit VA of the relocation target */
@ -164,6 +166,7 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
#endif
#if !defined(BH_PLATFORM_WINDOWS)
case R_X86_64_PC32:
case R_X86_64_GOTPCREL: /* GOT + G has been calculated as symbol_addr */
{
intptr_t target_addr = (intptr_t) /* S + A - P */
((uintptr_t)symbol_addr + reloc_addend
@ -182,6 +185,16 @@ apply_relocation(AOTModule *module, uint8 *target_section_addr,
*(int32 *)(target_section_addr + reloc_offset) = (int32)target_addr;
break;
}
case R_X86_64_PC64:
{
intptr_t target_addr = (intptr_t) /* S + A - P */
((uintptr_t)symbol_addr + reloc_addend
- (uintptr_t)(target_section_addr + reloc_offset));
CHECK_RELOC_OFFSET(sizeof(int64));
*(int64 *)(target_section_addr + reloc_offset) = (int64)target_addr;
break;
}
case R_X86_64_32:
case R_X86_64_32S:
{