Merge dev/simd for fast-interp (#4131)

* Implement the first few SIMD opcodes for fast interpreter (v128.const, v128.any_true) (#3818)

Tested on the following code:
```
(module
  (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32)))
  (memory (export "memory") 1)

  ;; WASI entry point
  (func $main (export "_start")
    v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    v128.any_true
    if
      unreachable
    end
    
    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15     
    v128.any_true
    i32.const 0
    i32.eq
    if
      unreachable
    end

    i32.const 0
    call $proc_exit
  )
)
```

* implement POP_V128()

This simplifies the SIMD implementation for the fast interpreter.

* Add all SIMD operations into wasm_interp_fast switch

* Add V128 comparison operations

Tested using
```
(module
  (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32)))

  (memory (export "memory") 1)

  (func $assert_true (param v128)
    local.get 0
    v128.any_true
    i32.eqz
    if
      unreachable
    end
  )

  (func $main (export "_start")
    ;; Test v128.not
    v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    v128.not
    v128.const i8x16 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
    i8x16.eq
    call $assert_true

    ;; Test v128.and
    v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0
    v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0
    v128.and
    v128.const i8x16 255 255 0 0 0 0 0 0 255 255 0 0 0 0 0 0
    i8x16.eq
    call $assert_true

    ;; Test v128.andnot
    v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0
    v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0
    v128.andnot
    v128.const i8x16 0 0 255 255 0 0 0 0 0 0 255 255 0 0 0 0
    i8x16.eq
    call $assert_true

    ;; Test v128.or
    v128.const i8x16 255 255 0 0 0 0 255 255 255 255 0 0 0 0 255 0
    v128.const i8x16 0 0 255 255 255 255 0 0 0 0 255 255 255 255 0 0
    v128.or
    v128.const i8x16 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 0
    i8x16.eq
    call $assert_true

    ;; Test v128.xor
    v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0
    v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0
    v128.xor
    v128.const i8x16 0 0 255 255 255 255 0 0 0 0 255 255 255 255 0 0
    i8x16.eq
    call $assert_true

    i32.const 0
    call $proc_exit
  )
)
```

* Add first NEON SIMD opcode implementations to fast interpreter (#3859)

Add some implementations of SIMD opcodes using NEON instructions.
Tested using:
```wast
(module
  (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32)))
  (memory (export "memory") 1)

  (func $assert_true (param v128)
    local.get 0
    v128.any_true 
    i32.eqz
    if
      unreachable
    end
  )
  (func $main (export "_start")
    i32.const 0
    i32.const 32
    memory.grow
    drop

    i32.const 0
    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    v128.store

    i32.const 0
    v128.load

    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    i8x16.eq
    call $assert_true

    i32.const 16
    v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    v128.store

    i32.const 16
    v128.load
    v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    i8x16.eq
    call $assert_true

    i32.const 0
    v128.load
    v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    i8x16.eq
    call $assert_true
    drop

    i32.const 0
    i32.const 1
    memory.grow
    drop

    i32.const 0
    i64.const 0x7F80FF017E02FE80
    i64.store

    i32.const 0
    v128.load8x8_s

    v128.const i16x8 127 -128 -1 1 126 2 -2 -128

    i16x8.eq
    call $assert_true

    i32.const 0
    i64.const 0x80FE027E01FF807F
    i64.store

    i32.const 0
    v128.load8x8_u

    v128.const i16x8 128 254 2 126 1 255 128 127

    i16x8.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000FFFE7FFF0001
    i64.store

    i32.const 0
    v128.load16x4_s

    v128.const i32x4 -32768 -2 32767 1

    i32x4.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000FFFE7FFF0001 
    i64.store

    i32.const 0
    v128.load16x4_u

    v128.const i32x4 32768 65534 32767 1   

    i32x4.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000000000000001
    i64.store

    i32.const 0
    v128.load32x2_s

    v128.const i64x2 -2147483648 1 

    i64x2.eq
    call $assert_true

    i32.const 0
    i64.const 0x8000000000000001
    i64.store

    i32.const 0
    v128.load32x2_u

    v128.const i64x2 2147483648 1

    i64x2.eq
    call $assert_true

    call $proc_exit
  )
)
```

* Emit imm for lane extract and replace (#3906)

* Fix replacement value not being correct (#3919)

* Implement load lanes opcodes for wasm (#3942)

* Implement final SIMD opcodes: store lane (#4001)

* Fix load/store (#4054)

* Correctly use unsigned functions (#4055)

* implement local and function calls for v128 in the fast interpreter

* Fix splat opcodes, add V128 handling in preserve_referenced_local and reserve_block_ret

* Fix incorrect memory overflow values + SIMD ifdefs

* Fix load/load_splat macros

* Correct #endif in wasm loader

* Update core/iwasm/interpreter/wasm_opcode.h

* Fix spec tests when WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS is 0

* Resolve merge conflicts arising from main -> dev/simd_for_interp and implement fast interpreter const offset loader support for V128

* Enable SIMDe tests on CI

* Document WAMR_BUILD_LIB_SIMDE

---------

Co-authored-by: James Marsh <mrshnja@amazon.co.uk>
Co-authored-by: jammar1 <108334558+jammar1@users.noreply.github.com>
Co-authored-by: Maks Litskevich <makslit@amazon.com>
Co-authored-by: Marcin Kolny <marcin.kolny@gmail.com>
Co-authored-by: Wenyong Huang <wenyong.huang@intel.com>
This commit is contained in:
Marcin Kolny
2025-03-20 06:23:20 +00:00
committed by GitHub
parent c30e65ba5d
commit efa8019bdb
13 changed files with 2189 additions and 73 deletions

View File

@ -151,7 +151,8 @@ is_valid_value_type(uint8 type)
bool
is_valid_value_type_for_interpreter(uint8 value_type)
{
#if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0)
#if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) \
&& (WASM_ENABLE_FAST_INTERP == 0)
/*
* Note: regardless of WASM_ENABLE_SIMD, our interpreters don't have
* SIMD implemented. It's safer to reject v128, especially for the

View File

@ -37,6 +37,10 @@ extern "C" {
do { \
*(int64 *)(addr) = (int64)(value); \
} while (0)
/* Write the V128 `value` to `addr` with one V128-typed assignment. */
#define PUT_V128_TO_ADDR(addr, value)  \
    do {                               \
        ((V128 *)(addr))[0] = (value); \
    } while (0)
#define PUT_F64_TO_ADDR(addr, value) \
do { \
*(float64 *)(addr) = (float64)(value); \
@ -49,6 +53,7 @@ extern "C" {
#define GET_I64_FROM_ADDR(addr) (*(int64 *)(addr))
#define GET_F64_FROM_ADDR(addr) (*(float64 *)(addr))
#define GET_REF_FROM_ADDR(addr) (*(void **)(addr))
/* Read a V128 from `addr` by dereferencing it as a V128 pointer.
 * NOTE(review): this direct-cast form appears to sit in the branch for
 * targets that allow unaligned access (see the `#else` further down) --
 * confirm. */
#define GET_V128_FROM_ADDR(addr) (*(V128 *)(addr))
/* For STORE opcodes */
#define STORE_I64 PUT_I64_TO_ADDR
@ -68,6 +73,12 @@ STORE_U8(void *addr, uint8_t value)
*(uint8 *)addr = value;
}
/* Store `value` at `addr` using a single V128-typed assignment. */
static inline void
STORE_V128(void *addr, V128 value)
{
    V128 *dst = (V128 *)addr;

    *dst = value;
}
/* For LOAD opcodes */
#define LOAD_I64(addr) (*(int64 *)(addr))
#define LOAD_F64(addr) (*(float64 *)(addr))
@ -75,6 +86,7 @@ STORE_U8(void *addr, uint8_t value)
#define LOAD_U32(addr) (*(uint32 *)(addr))
#define LOAD_I16(addr) (*(int16 *)(addr))
#define LOAD_U16(addr) (*(uint16 *)(addr))
/* Load a V128 from `addr` by dereferencing it as a V128 pointer. */
#define LOAD_V128(addr) (*(V128 *)(addr))
#define STORE_PTR(addr, ptr) \
do { \
@ -83,6 +95,15 @@ STORE_U8(void *addr, uint8_t value)
#else /* WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 */
/*
 * Write the V128 `value` to `addr` as four consecutive uint32 stores
 * (lanes i32x4[0..3] in order).
 *
 * `value` is copied into a local exactly once, so an argument with side
 * effects (e.g. a stack pop or `arr[i++]`) is not re-evaluated for every
 * lane. `addr` is likewise evaluated once, before `value`.
 */
#define PUT_V128_TO_ADDR(addr, value)        \
    do {                                     \
        uint32 *addr_u32 = (uint32 *)(addr); \
        V128 value_v128 = (value);           \
        addr_u32[0] = value_v128.i32x4[0];   \
        addr_u32[1] = value_v128.i32x4[1];   \
        addr_u32[2] = value_v128.i32x4[2];   \
        addr_u32[3] = value_v128.i32x4[3];   \
    } while (0)
#define PUT_I64_TO_ADDR(addr, value) \
do { \
uint32 *addr_u32 = (uint32 *)(addr); \
@ -124,6 +145,17 @@ STORE_U8(void *addr, uint8_t value)
} while (0)
#endif
/*
 * Assemble a V128 from four consecutive uint32 words starting at `addr`;
 * word N becomes lane i32x4[N].
 */
static inline V128
GET_V128_FROM_ADDR(uint32 *addr)
{
    V128 result;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        result.i32x4[lane] = addr[lane];
    }

    return result;
}
static inline int64
GET_I64_FROM_ADDR(uint32 *addr)
{
@ -239,7 +271,94 @@ STORE_U16(void *addr, uint16_t value)
((uint8_t *)(addr))[0] = u.u8[0];
((uint8_t *)(addr))[1] = u.u8[1];
}
/*
 * Store `value` at `addr` using the widest element size that the
 * alignment of `addr` permits:
 *   16-byte aligned -> one V128 store
 *    8-byte aligned -> two uint64 stores
 *    4-byte aligned -> four uint32 stores
 *    2-byte aligned -> eight uint16 stores
 *    otherwise      -> sixteen uint8 stores
 */
static inline void
STORE_V128(void *addr, V128 value)
{
    union {
        V128 val;
        uint64 u64[2];
        uint32 u32[4];
        uint16 u16[8];
        uint8 u8[16];
    } parts;
    const uintptr_t misalign = (uintptr_t)addr & (uintptr_t)15;

    if (misalign == 0) {
        *(V128 *)addr = value;
        return;
    }

    /* Every remaining path writes the value out piecewise via the union. */
    parts.val = value;

    if ((misalign & (uintptr_t)7) == 0) {
        uint64 *dst64 = (uint64 *)addr;
        for (int k = 0; k < 2; k++) {
            dst64[k] = parts.u64[k];
        }
    }
    else if ((misalign & (uintptr_t)3) == 0) {
        uint32 *dst32 = (uint32 *)addr;
        for (int k = 0; k < 4; k++) {
            dst32[k] = parts.u32[k];
        }
    }
    else if ((misalign & (uintptr_t)1) == 0) {
        uint16 *dst16 = (uint16 *)addr;
        for (int k = 0; k < 8; k++) {
            dst16[k] = parts.u16[k];
        }
    }
    else {
        uint8 *dst8 = (uint8 *)addr;
        for (int k = 0; k < 16; k++) {
            dst8[k] = parts.u8[k];
        }
    }
}
/* For LOAD opcodes */
/*
 * Load a V128 from `addr` using the widest element size that the
 * alignment of `addr` permits:
 *   16-byte aligned -> one V128 read
 *    8-byte aligned -> two uint64 reads
 *    4-byte aligned -> four uint32 reads
 *    2-byte aligned -> eight uint16 reads
 *    otherwise      -> sixteen uint8 reads
 */
static inline V128
LOAD_V128(void *addr)
{
    union {
        V128 val;
        uint64 u64[2];
        uint32 u32[4];
        uint16 u16[8];
        uint8 u8[16];
    } parts;
    const uintptr_t misalign = (uintptr_t)addr & (uintptr_t)15;

    if (misalign == 0) {
        return *(V128 *)addr;
    }

    if ((misalign & (uintptr_t)7) == 0) {
        const uint64 *src64 = (uint64 *)addr;
        for (int k = 0; k < 2; k++) {
            parts.u64[k] = src64[k];
        }
    }
    else if ((misalign & (uintptr_t)3) == 0) {
        const uint32 *src32 = (uint32 *)addr;
        for (int k = 0; k < 4; k++) {
            parts.u32[k] = src32[k];
        }
    }
    else if ((misalign & (uintptr_t)1) == 0) {
        const uint16 *src16 = (uint16 *)addr;
        for (int k = 0; k < 8; k++) {
            parts.u16[k] = src16[k];
        }
    }
    else {
        const uint8 *src8 = (uint8 *)addr;
        for (int k = 0; k < 16; k++) {
            parts.u8[k] = src8[k];
        }
    }

    return parts.val;
}
static inline int64
LOAD_I64(void *addr)
{