Implement AOT static PGO (#2243)

LLVM PGO (Profile-Guided Optimization) allows the compiler to better optimize code
for how it actually runs. This PR implements the AOT static PGO, and is tested on
Linux x86-64 and x86-32. The basic steps are:

1. Use `wamrc --enable-llvm-pgo -o <aot_file_of_pgo> <wasm_file>`
   to generate an instrumented aot file.
2. Compile iwasm with `cmake -DWAMR_BUILD_STATIC_PGO=1` and run
      `iwasm --gen-prof-file=<raw_profile_file> <aot_file_of_pgo>`
    to generate the raw profile file.
3. Run `llvm-profdata merge -output=<profile_file> <raw_profile_file>`
    to merge the raw profile file into the profile file.
4. Run `wamrc --use-prof-file=<profile_file> -o <aot_file> <wasm_file>`
    to generate the optimized aot file.
5. Run the optimized aot_file: `iwasm <aot_file>`.

Test scripts are also added for each benchmark; run `test_pgo.sh` under
each benchmark's folder to test the AOT static PGO.
This commit is contained in:
Wenyong Huang
2023-06-05 09:17:39 +08:00
committed by GitHub
parent f1e9029ebc
commit 8d88471c46
29 changed files with 2000 additions and 53 deletions

View File

@ -2852,3 +2852,520 @@ aot_dump_perf_profiling(const AOTModuleInstance *module_inst)
}
}
#endif /* end of WASM_ENABLE_PERF_PROFILING */
#if WASM_ENABLE_STATIC_PGO != 0
/* Value-profiling kinds. The numeric ids are used as indexes into
   LLVMProfileData.num_value_sites[]. */
/* indirect call target */
#define IPVK_IndirectCallTarget 0
/* memory intrinsic functions size */
#define IPVK_MemOPSize 1
/* Inclusive bounds used when iterating over every value kind */
#define IPVK_First IPVK_IndirectCallTarget
#define IPVK_Last IPVK_MemOPSize
/* Initial value of the per-site cap on distinct tracked values */
#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 24
/* Largest per-site cap; the dump code stores each site's node count in a
   single byte */
#define INSTR_PROF_MAX_NUM_VAL_PER_SITE 255
/* While this is zero, allocateValueProfileCounters() raises
   VPMaxNumValsPerSite to INSTR_PROF_MAX_NUM_VAL_PER_SITE.
   NOTE(review): nothing in the visible code sets it to non-zero - confirm
   whether a setter exists elsewhere. */
static int hasNonDefaultValsPerSite = 0;
/* Current cap on the number of value nodes chained to one value site;
   once a site reaches it, instrumentTargetValueImpl() stops allocating
   new nodes for that site */
static uint32 VPMaxNumValsPerSite = INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;
/**
 * Compare-and-swap on a pointer slot: store new_val into *ptr only if the
 * slot currently holds old_val.
 *
 * Uses os_atomic_cmpxchg when the platform layer defines it; otherwise a
 * plain, non-atomic read/compare/write is performed.
 *
 * @param ptr the pointer slot to update
 * @param old_val the value the slot is expected to hold
 * @param new_val the value to store when the expectation holds
 *
 * @return true if the slot was updated, false otherwise
 */
static bool
cmpxchg_ptr(void **ptr, void *old_val, void *new_val)
{
#if defined(os_atomic_cmpxchg)
    return os_atomic_cmpxchg(ptr, &old_val, new_val);
#else
    /* TODO: add lock when thread-manager is enabled */
    if (*ptr != old_val)
        return false;

    *ptr = new_val;
    return true;
#endif
}
/**
 * Allocate the zero-filled table of value-node list heads for one profile
 * data record (one slot per value site, summed over all value kinds) and
 * install it into Data->values.
 *
 * @param Data the profile data record whose `values` table is created
 *
 * @return 1 if the table was installed; 0 if the byte size does not fit in
 *         32 bits, the allocation failed, or Data->values was no longer
 *         NULL at install time (the new table is freed in that case)
 */
static int
allocateValueProfileCounters(LLVMProfileData *Data)
{
    ValueProfNode **Mem;
    uint64 NumVSites = 0, total_size;
    uint32 kind;

    /* When dynamic allocation is enabled, allow tracking the max number of
       values allowed. */
    if (hasNonDefaultValsPerSite == 0)
        VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;

    kind = IPVK_First;
    while (kind <= IPVK_Last) {
        NumVSites += Data->num_value_sites[kind];
        kind++;
    }

    /* If NumVSites = 0, calloc is allowed to return a non-null pointer. */
    bh_assert(NumVSites > 0 && "NumVSites can't be zero");

    total_size = (uint64)sizeof(ValueProfNode *) * NumVSites;
    if (total_size > UINT32_MAX)
        return 0;

    Mem = (ValueProfNode **)wasm_runtime_malloc((uint32)total_size);
    if (!Mem)
        return 0;

    memset(Mem, 0, (uint32)total_size);

    /* Publish the table only if no table has been installed meanwhile */
    if (cmpxchg_ptr((void **)&Data->values, NULL, Mem))
        return 1;

    wasm_runtime_free(Mem);
    return 0;
}
/**
 * Allocate one zero-initialized value-profile node.
 *
 * @return the new node, or NULL if the runtime allocator failed
 */
static ValueProfNode *
allocateOneNode(void)
{
    ValueProfNode *fresh = wasm_runtime_malloc((uint32)sizeof(ValueProfNode));

    if (!fresh)
        return NULL;

    memset(fresh, 0, sizeof(ValueProfNode));
    return fresh;
}
/**
 * Record CountValue occurrences of TargetValue at one value site.
 *
 * The site's node list is searched for TargetValue; on a hit its count is
 * increased. On a miss a new node is appended, unless the list already
 * holds VPMaxNumValsPerSite nodes: then the node with the smallest count
 * is either reused for TargetValue (when its count is <= CountValue) or
 * has its count reduced by CountValue.
 *
 * @param TargetValue the observed value
 * @param Data the profile data record (an LLVMProfileData *), may be NULL
 * @param CounterIndex index of the value site in the record's table
 * @param CountValue the amount to add; a zero amount is ignored
 */
static void
instrumentTargetValueImpl(uint64 TargetValue, void *Data, uint32 CounterIndex,
                          uint64 CountValue)
{
    LLVMProfileData *prof = (LLVMProfileData *)Data;
    ValueProfNode **site_heads;
    ValueProfNode *node, *tail = NULL, *coldest = NULL;
    uint64 coldest_count = UINT64_MAX;
    uint8 chain_len = 0;
    bool linked = false;

    if (!prof || !CountValue)
        return;

    /* The list-head table is created on first use */
    if (!prof->values && !allocateValueProfileCounters(prof))
        return;

    site_heads = (ValueProfNode **)prof->values;

    /* Look for an existing node, remembering the tail and the node with
       the smallest count along the way */
    for (node = site_heads[CounterIndex]; node; node = node->next) {
        if (node->value == TargetValue) {
            node->count += CountValue;
            return;
        }
        if (node->count < coldest_count) {
            coldest_count = node->count;
            coldest = node;
        }
        tail = node;
        ++chain_len;
    }

    if (chain_len >= VPMaxNumValsPerSite) {
        /* The site is full: decay the coldest node, or take it over once
           its count no longer exceeds the incoming count */
        if (coldest->count > CountValue) {
            coldest->count -= CountValue;
        }
        else {
            coldest->value = TargetValue;
            coldest->count = CountValue;
        }
        return;
    }

    node = allocateOneNode();
    if (!node)
        return;
    node->value = TargetValue;
    node->count += CountValue;

    /* Link the node as the list head, or after the tail seen above */
    if (!site_heads[CounterIndex])
        linked = cmpxchg_ptr((void **)&site_heads[CounterIndex], NULL, node);
    else if (tail && !tail->next)
        linked = cmpxchg_ptr((void **)&tail->next, 0, node);

    /* The node is dropped when it could not be linked */
    if (!linked)
        wasm_runtime_free(node);
}
/**
 * Record a single occurrence of target_value at the given value site.
 *
 * @param target_value the observed value
 * @param data the profile data record passed through to the recorder
 * @param counter_idx index of the value site within that record
 */
void
llvm_profile_instrument_target(uint64 target_value, void *data,
                               uint32 counter_idx)
{
    /* Each call accounts for exactly one observation */
    const uint64 one_hit = 1;

    instrumentTargetValueImpl(target_value, data, counter_idx, one_hit);
}
/**
 * Count the bits set in a 64-bit value.
 *
 * @param u the value to inspect
 *
 * @return the number of 1 bits in u (0..64)
 */
static inline uint32
popcount64(uint64 u)
{
    uint32 bits_set;

    /* Each iteration clears the lowest set bit */
    for (bits_set = 0; u != 0; bits_set++)
        u &= u - 1;

    return bits_set;
}
/**
 * Count the leading zero bits of a 64-bit value.
 *
 * @param type the value to inspect
 *
 * @return the number of zero bits above the highest set bit, or 64 when
 *         the value is 0
 */
static inline uint32
clz64(uint64 type)
{
    const uint64 top_bit = ((uint64)1) << 63;
    uint32 zeros;

    if (type == 0)
        return 64;

    /* Shift left until the highest set bit reaches bit 63 */
    for (zeros = 0; (type & top_bit) == 0; zeros++)
        type <<= 1;

    return zeros;
}
/* Map an (observed) memop size value to the representative value of its range.
   For example, 5 -> 5, 22 -> 17, 99 -> 65, 256 -> 256, 1001 -> 513. */
static uint64
InstrProfGetRangeRepValue(uint64 Value)
{
    /* The first ranges are individually tracked. Use the value as is. */
    if (Value <= 8)
        return Value;

    /* The last range is mapped to its lowest value. */
    if (Value >= 513)
        return 513;

    /* If it's a power of two, use it as is. */
    if (popcount64(Value) == 1)
        return Value;

    /* Otherwise, take to the previous power of two + 1. */
    return (((uint64)1) << (63 - clz64(Value))) + 1;
}
/**
 * Record a single occurrence of a memop size at the given value site. The
 * size is first mapped to the representative value of its range.
 *
 * @param target_value the observed memop size
 * @param data the profile data record passed through to the recorder
 * @param counter_idx index of the value site within that record
 */
void
llvm_profile_instrument_memop(uint64 target_value, void *data,
                              uint32 counter_idx)
{
    instrumentTargetValueImpl(InstrProfGetRangeRepValue(target_value), data,
                              counter_idx, 1);
}
/**
 * Compute the byte size of the profile dump for a module instance and
 * optionally report the intermediate figures it is built from.
 *
 * The module's data sections are scanned by name prefix:
 * "__llvm_prf_data" (one profile data record per section),
 * "__llvm_prf_cnts" (counters) and "__llvm_prf_names" (names blob).
 * All out-parameters may be NULL; none of them is written when 0 is
 * returned.
 *
 * @param module_inst the AOT module instance to inspect
 * @param p_num_prof_data out: number of profile data records
 * @param p_num_prof_counters out: sum of num_counters over all records
 * @param p_padding_size out: 8 - (names size % 8); note this is 8, not 0,
 *        when the names size is already a multiple of 8
 * @param p_prof_counters_size out: total bytes of the counter sections
 * @param p_prof_names_size out: bytes of the names section
 * @param p_value_counters_size out: bytes taken by value-profile records
 * @param p_prof_names out: pointer to the names section data, or NULL
 *
 * @return the total dump size in bytes, or 0 when the counter sections'
 *         size does not equal num_prof_counters * sizeof(uint64)
 */
static uint32
get_pgo_prof_data_size(AOTModuleInstance *module_inst, uint32 *p_num_prof_data,
                       uint32 *p_num_prof_counters, uint32 *p_padding_size,
                       uint32 *p_prof_counters_size, uint32 *p_prof_names_size,
                       uint32 *p_value_counters_size, uint8 **p_prof_names)
{
    AOTModule *module = (AOTModule *)module_inst->module;
    LLVMProfileData *prof_data;
    uint8 *names_blob = NULL;
    uint32 data_count = 0, counter_count = 0, padding_size, i;
    uint32 counters_bytes = 0, names_bytes = 0;
    uint32 total_size, fixed_part_size;

    /* Pass 1: tally records, counters and the names blob */
    for (i = 0; i < module->data_section_count; i++) {
        const char *sect_name = module->data_sections[i].name;

        if (strncmp(sect_name, "__llvm_prf_data", 15) == 0) {
            bh_assert(module->data_sections[i].size == sizeof(LLVMProfileData));
            data_count++;
            prof_data = (LLVMProfileData *)module->data_sections[i].data;
            counter_count += prof_data->num_counters;
            continue;
        }
        if (strncmp(sect_name, "__llvm_prf_cnts", 15) == 0) {
            counters_bytes += module->data_sections[i].size;
            continue;
        }
        if (strncmp(sect_name, "__llvm_prf_names", 16) == 0) {
            names_bytes = module->data_sections[i].size;
            names_blob = module->data_sections[i].data;
        }
    }

    /* Every counter announced by a record must be backed by 8 bytes */
    if (counters_bytes != counter_count * sizeof(uint64))
        return 0;

    total_size = sizeof(LLVMProfileRawHeader)
                 + data_count * sizeof(LLVMProfileData_64)
                 + counters_bytes + names_bytes;

    /* The names blob is padded up to a multiple of 8 bytes */
    padding_size = sizeof(uint64) - (names_bytes % sizeof(uint64));
    if (padding_size != sizeof(uint64))
        total_size += padding_size;

    /* Total size excluding value counters */
    fixed_part_size = total_size;

    /* Pass 2: add the value-profile record of each data record */
    for (i = 0; i < module->data_section_count; i++) {
        uint32 kind, site, site_count, node_count;
        ValueProfNode **site_head, *node;

        if (strncmp(module->data_sections[i].name, "__llvm_prf_data", 15) != 0)
            continue;

        prof_data = (LLVMProfileData *)module->data_sections[i].data;
        site_head = prof_data->values;

        if (prof_data->num_value_sites[0] == 0
            && prof_data->num_value_sites[1] == 0)
            continue;

        /* TotalSize (uint32) and NumValueKinds (uint32) */
        total_size += 8;

        for (kind = 0; kind < 2; kind++) {
            site_count = prof_data->num_value_sites[kind];
            if (site_count == 0)
                continue;

            /* ValueKind (uint32) and NumValueSites (uint32) */
            total_size += 8;
            /* (Value + Counter) group counts of each value site,
               each count is one byte */
            total_size += align_uint(site_count, 8);

            if (!site_head)
                continue;

            /* site_head keeps advancing across kinds: the table holds the
               sites of kind 0 followed by those of kind 1 */
            for (site = 0; site < site_count; site++, site_head++) {
                node_count = 0;
                for (node = *site_head; node; node = node->next)
                    node_count++;
                /* (Value + Counter) groups */
                total_size += node_count * 8 * 2;
            }
        }
    }

    if (p_num_prof_data)
        *p_num_prof_data = data_count;
    if (p_num_prof_counters)
        *p_num_prof_counters = counter_count;
    if (p_padding_size)
        *p_padding_size = padding_size;
    if (p_prof_counters_size)
        *p_prof_counters_size = counters_bytes;
    if (p_prof_names_size)
        *p_prof_names_size = names_bytes;
    if (p_value_counters_size)
        *p_value_counters_size = total_size - fixed_part_size;
    if (p_prof_names)
        *p_prof_names = names_blob;

    return total_size;
}
/**
 * Get the number of bytes needed to dump the PGO profile data of a module
 * instance.
 *
 * @param module_inst the AOT module instance to inspect
 *
 * @return the dump size in bytes, or 0 if the size could not be computed
 */
uint32
aot_get_pgo_prof_data_size(AOTModuleInstance *module_inst)
{
    /* Only the total is wanted, so every optional out-parameter is NULL */
    uint32 dump_size = get_pgo_prof_data_size(module_inst, NULL, NULL, NULL,
                                              NULL, NULL, NULL, NULL);

    return dump_size;
}
/* Byte-order probe: the int member is initialized to 1, so the char member
   (which overlays the int's first byte) reads 1 only on a little-endian
   host. The object is named without a leading double underscore because
   such identifiers are reserved for the implementation. */
static const union {
    int a;
    char b;
} pgo_endian_probe = { .a = 1 };
/* Evaluates to non-zero when the host is little-endian */
#define is_little_endian() (pgo_endian_probe.b == 1)
/**
 * Serialize the PGO profile data of a module instance into buf.
 *
 * The output is written in this order: the raw profile header, one
 * LLVMProfileData_64 record per "__llvm_prf_data" section, the contents of
 * the "__llvm_prf_cnts" sections, the names blob padded with zeros to a
 * multiple of 8 bytes, and finally one value-profile record for each data
 * record that has value sites. Multi-byte fields are byte-swapped on
 * big-endian hosts so that the output is little-endian.
 *
 * Side effect: the offset_counters field of every profile data record in
 * the module's data sections is rewritten.
 *
 * @param module_inst the AOT module instance to dump
 * @param buf the destination buffer
 * @param len the capacity of buf in bytes
 *
 * @return the number of bytes written, or 0 if the dump size could not be
 *         computed or buf is too small (nothing is written in that case)
 */
uint32
aot_dump_pgo_prof_data_to_buf(AOTModuleInstance *module_inst, char *buf,
                              uint32 len)
{
    AOTModule *module = (AOTModule *)module_inst->module;
    LLVMProfileRawHeader prof_header = { 0 };
    LLVMProfileData *prof_data;
    uint8 *prof_names = NULL;
    uint32 num_prof_data = 0, num_prof_counters = 0, padding_size, i;
    uint32 prof_counters_size = 0, prof_names_size = 0;
    uint32 value_counters_size = 0, value_counters_size_backup = 0;
    uint32 total_size, size;
    int64 counters_delta, offset_counters;

    total_size = get_pgo_prof_data_size(module_inst, &num_prof_data,
                                        &num_prof_counters, &padding_size,
                                        &prof_counters_size, &prof_names_size,
                                        &value_counters_size, &prof_names);
    /* A zero size means the section layout was rejected by the size
       helper: no size has been validated against len, so writing anything
       could overrun buf */
    if (total_size == 0 || len < total_size)
        return 0;

    value_counters_size_backup = value_counters_size;
    value_counters_size = 0;

    /* Each record stores the distance from itself to its first counter:
       the remaining records plus the counters of all earlier records */
    prof_header.counters_delta = counters_delta =
        sizeof(LLVMProfileData_64) * num_prof_data;
    offset_counters = 0;
    for (i = 0; i < module->data_section_count; i++) {
        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
            prof_data = (LLVMProfileData *)module->data_sections[i].data;
            prof_data->offset_counters = counters_delta + offset_counters;
            offset_counters += prof_data->num_counters * sizeof(uint64);
            counters_delta -= sizeof(LLVMProfileData_64);
        }
    }

    prof_header.magic = 0xFF6C70726F667281LL;
    /* Version 8 */
    prof_header.version = 0x0000000000000008LL;
    /* with VARIANT_MASK_IR_PROF (IR Instrumentation) */
    prof_header.version |= 0x1ULL << 56;
    /* with VARIANT_MASK_MEMPROF (Memory Profile) */
    prof_header.version |= 0x1ULL << 62;
    prof_header.num_prof_data = num_prof_data;
    prof_header.num_prof_counters = num_prof_counters;
    prof_header.names_size = prof_names_size;
    prof_header.value_kind_last = 1;

    if (!is_little_endian()) {
        aot_exchange_uint64((uint8 *)&prof_header.magic);
        aot_exchange_uint64((uint8 *)&prof_header.version);
        aot_exchange_uint64((uint8 *)&prof_header.num_prof_data);
        aot_exchange_uint64((uint8 *)&prof_header.num_prof_counters);
        aot_exchange_uint64((uint8 *)&prof_header.names_size);
        aot_exchange_uint64((uint8 *)&prof_header.counters_delta);
        aot_exchange_uint64((uint8 *)&prof_header.value_kind_last);
    }

    size = sizeof(LLVMProfileRawHeader);
    bh_memcpy_s(buf, size, &prof_header, size);
    buf += size;

    for (i = 0; i < module->data_section_count; i++) {
        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
            LLVMProfileData_64 *prof_data_64 = (LLVMProfileData_64 *)buf;

            /* Convert LLVMProfileData to LLVMProfileData_64, the pointer width
               in the output file is always 8 bytes */
            prof_data = (LLVMProfileData *)module->data_sections[i].data;
            prof_data_64->func_md5 = prof_data->func_md5;
            prof_data_64->func_hash = prof_data->func_hash;
            prof_data_64->offset_counters = prof_data->offset_counters;
            prof_data_64->func_ptr = prof_data->func_ptr;
            prof_data_64->values = (uint64)(uintptr_t)prof_data->values;
            prof_data_64->num_counters = prof_data->num_counters;
            prof_data_64->num_value_sites[0] = prof_data->num_value_sites[0];
            prof_data_64->num_value_sites[1] = prof_data->num_value_sites[1];

            if (!is_little_endian()) {
                /* Every field is swapped exactly once */
                aot_exchange_uint64((uint8 *)&prof_data_64->func_md5);
                aot_exchange_uint64((uint8 *)&prof_data_64->func_hash);
                aot_exchange_uint64((uint8 *)&prof_data_64->offset_counters);
                aot_exchange_uint64((uint8 *)&prof_data_64->func_ptr);
                aot_exchange_uint64((uint8 *)&prof_data_64->values);
                aot_exchange_uint32((uint8 *)&prof_data_64->num_counters);
                aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[0]);
                aot_exchange_uint16((uint8 *)&prof_data_64->num_value_sites[1]);
            }
            buf += sizeof(LLVMProfileData_64);
        }
    }

    for (i = 0; i < module->data_section_count; i++) {
        if (!strncmp(module->data_sections[i].name, "__llvm_prf_cnts", 15)) {
            size = module->data_sections[i].size;
            bh_memcpy_s(buf, size, module->data_sections[i].data, size);
            buf += size;
        }
    }

    if (prof_names && prof_names_size > 0) {
        size = prof_names_size;
        bh_memcpy_s(buf, size, prof_names, size);
        buf += size;

        /* Zero-pad the names blob up to a multiple of 8 bytes */
        padding_size = sizeof(uint64) - (prof_names_size % sizeof(uint64));
        if (padding_size != sizeof(uint64)) {
            char padding_buf[8] = { 0 };
            bh_memcpy_s(buf, padding_size, padding_buf, padding_size);
            buf += padding_size;
        }
    }

    for (i = 0; i < module->data_section_count; i++) {
        if (!strncmp(module->data_sections[i].name, "__llvm_prf_data", 15)) {
            uint32 j, k, num_value_sites, num_value_nodes;
            ValueProfNode **values, **values_tmp, *value_node;

            prof_data = (LLVMProfileData *)module->data_sections[i].data;
            /* values_tmp walks the sites for the count bytes, values walks
               them again for the (value, count) pairs */
            values = values_tmp = prof_data->values;

            if (prof_data->num_value_sites[0] > 0
                || prof_data->num_value_sites[1] > 0) {
                uint32 *buf_total_size = (uint32 *)buf;

                buf += 4; /* emit TotalSize later */

                /* NumValueKinds */
                *(uint32 *)buf = (prof_data->num_value_sites[0] > 0
                                  && prof_data->num_value_sites[1] > 0)
                                     ? 2
                                     : 1;
                if (!is_little_endian())
                    aot_exchange_uint32((uint8 *)buf);
                buf += 4;

                for (j = 0; j < 2; j++) {
                    if ((num_value_sites = prof_data->num_value_sites[j]) > 0) {
                        /* ValueKind */
                        *(uint32 *)buf = j;
                        if (!is_little_endian())
                            aot_exchange_uint32((uint8 *)buf);
                        buf += 4;

                        /* NumValueSites */
                        *(uint32 *)buf = num_value_sites;
                        if (!is_little_endian())
                            aot_exchange_uint32((uint8 *)buf);
                        buf += 4;

                        /* One byte per site: the number of nodes it has */
                        for (k = 0; k < num_value_sites; k++) {
                            num_value_nodes = 0;
                            if (values_tmp) {
                                value_node = *values_tmp;
                                while (value_node) {
                                    num_value_nodes++;
                                    value_node = value_node->next;
                                }
                                values_tmp++;
                            }
                            /* A site may hold up to the per-site maximum,
                               which still fits in the one-byte count */
                            bh_assert(num_value_nodes
                                      <= INSTR_PROF_MAX_NUM_VAL_PER_SITE);
                            *(uint8 *)buf++ = (uint8)num_value_nodes;
                        }

                        /* Pad the count bytes to a multiple of 8 with zeros
                           so the output does not depend on what the
                           caller's buffer contained */
                        if (num_value_sites % 8) {
                            uint32 count_padding = 8 - (num_value_sites % 8);

                            memset(buf, 0, count_padding);
                            buf += count_padding;
                        }

                        /* The (value, count) pairs of every site */
                        for (k = 0; k < num_value_sites; k++) {
                            if (values) {
                                value_node = *values;
                                while (value_node) {
                                    *(uint64 *)buf = value_node->value;
                                    if (!is_little_endian())
                                        aot_exchange_uint64((uint8 *)buf);
                                    buf += 8;
                                    *(uint64 *)buf = value_node->count;
                                    if (!is_little_endian())
                                        aot_exchange_uint64((uint8 *)buf);
                                    buf += 8;
                                    value_node = value_node->next;
                                }
                                values++;
                            }
                        }
                    }
                }

                /* Account for this record before TotalSize is stored */
                value_counters_size +=
                    (uint32)((uint8 *)buf - (uint8 *)buf_total_size);

                /* TotalSize is a 32-bit field: swap 4 bytes only, so the
                   NumValueKinds field that follows is left intact */
                *buf_total_size =
                    (uint32)((uint8 *)buf - (uint8 *)buf_total_size);
                if (!is_little_endian())
                    aot_exchange_uint32((uint8 *)buf_total_size);
            }
        }
    }

    bh_assert(value_counters_size == value_counters_size_backup);
    (void)value_counters_size_backup;

    return total_size;
}
#endif /* end of WASM_ENABLE_STATIC_PGO != 0 */