Merge branch main into dev/wasi-libc-windows
@@ -23,6 +23,7 @@ include(FetchContent)
 set(RATS_BUILD_MODE "sgx"
     CACHE INTERNAL "Select build mode for librats(host|occlum|sgx|wasm)")
 set(RATS_INSTALL_PATH "${CMAKE_BINARY_DIR}/librats" CACHE INTERNAL "")
+set(BUILD_SAMPLES OFF)

 FetchContent_Declare(
     librats
@@ -34,8 +35,17 @@ if (NOT librats_POPULATED)
     message("-- Fetching librats ..")
     FetchContent_Populate(librats)
     include_directories("${librats_SOURCE_DIR}/include")
+
+    # Prevent the propagation of the CMAKE_C_FLAGS of WAMR into librats
+    set(SAVED_CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+    set(CMAKE_C_FLAGS "")
+
+    # Import the building scripts of librats
     add_subdirectory(${librats_SOURCE_DIR} ${librats_BINARY_DIR} EXCLUDE_FROM_ALL)
+
+    # Restore the CMAKE_C_FLAGS of WAMR
+    set(CMAKE_C_FLAGS ${SAVED_CMAKE_C_FLAGS})

 endif()

 file (GLOB source_all ${LIB_RATS_DIR}/*.c)
@@ -9,8 +9,32 @@ set -eo pipefail

 CC=${CC:=/opt/wasi-sdk/bin/clang}
 WAMR_DIR=../../../../..

+show_usage() {
+    echo "Usage: $0 [--sysroot PATH_TO_SYSROOT]"
+    echo "--sysroot PATH_TO_SYSROOT specify to build with custom sysroot for wasi-libc"
+}
+
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        --sysroot)
+            sysroot_path="$2"
+            shift
+            shift
+            ;;
+        --help)
+            show_usage
+            exit
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+done
+
 # Stress tests names
-thread_start_file_exclusions=("spawn_stress_test.wasm" "linear_memory_size_update.wasm")
+thread_start_file_exclusions=("spawn_stress_test.wasm" "linear_memory_size_update.wasm" "stress_test_threads_creation.wasm")

 for test_c in *.c; do
     test_wasm="$(basename $test_c .c).wasm"
@@ -21,9 +45,18 @@ for test_c in *.c; do
         thread_start_file=$WAMR_DIR/samples/wasi-threads/wasm-apps/wasi_thread_start.S
     fi

+    if [[ -n "$sysroot_path" ]]; then
+        if [ ! -d "$sysroot_path" ]; then
+            echo "Directory $sysroot_path doesn't exist. Aborting"
+            exit 1
+        fi
+        sysroot_command="--sysroot $sysroot_path"
+    fi
+
+    echo "Compiling $test_c to $test_wasm"
     $CC \
         -target wasm32-wasi-threads \
         -O2 \
         -pthread -ftls-model=local-exec \
         -z stack-size=32768 \
         -Wl,--export=__heap_base \
@@ -33,6 +66,7 @@ for test_c in *.c; do
         -Wl,--export=malloc \
         -Wl,--export=free \
         -I $WAMR_DIR/samples/wasi-threads/wasm-apps \
+        $sysroot_command \
         $thread_start_file \
         $test_c -o $test_wasm
 done
@@ -1,5 +1,6 @@
 {
     "lib-wasi-threads tests": {
-        "spawn_stress_test": "Stress tests are incompatible with the other part and executed differently"
+        "spawn_stress_test": "Stress tests are incompatible with the other part and executed differently",
+        "stress_test_threads_creation": "Stress tests are incompatible with the other part and executed differently"
    }
}
@@ -18,8 +18,9 @@

 enum CONSTANTS {
     NUM_ITER = 100000,
-    NUM_RETRY = 5,
+    NUM_RETRY = 8,
     MAX_NUM_THREADS = 8,
+    RETRY_SLEEP_TIME_US = 2000,
 };

 unsigned prime_numbers_count = 0;
@@ -62,11 +63,13 @@ void
 spawn_thread(pthread_t *thread, unsigned int *arg)
 {
     int status_code = -1;
+    int timeout_us = RETRY_SLEEP_TIME_US;
     for (int tries = 0; status_code != 0 && tries < NUM_RETRY; ++tries) {
         status_code = pthread_create(thread, NULL, &check_if_prime, arg);
         assert(status_code == 0 || status_code == EAGAIN);
         if (status_code == EAGAIN) {
-            usleep(2000);
+            usleep(timeout_us);
+            timeout_us *= 2;
         }
     }
@@ -95,7 +98,7 @@ main(int argc, char **argv)

         args[thread_num] = factorised_number;

-        usleep(2000);
+        usleep(RETRY_SLEEP_TIME_US);
         spawn_thread(&threads[thread_num], &args[thread_num]);
         assert(threads[thread_num] != 0);
     }
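The hunks above replace a fixed 2 ms retry sleep with exponential backoff: every EAGAIN from pthread_create doubles the wait before the next attempt, so the test yields progressively longer instead of hammering a saturated runtime. A standalone sketch of the same retry pattern, with a hypothetical worker function standing in for the test's check_if_prime:

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <unistd.h>

#define NUM_RETRY 8
#define RETRY_SLEEP_TIME_US 2000

static void *
worker(void *arg)
{
    (void)arg;
    return NULL;
}

/* Retry pthread_create with exponential backoff while it reports EAGAIN
 * (no free thread slot yet); any other error fails the assertion. */
static void
spawn_with_backoff(pthread_t *thread)
{
    int status_code = -1;
    int timeout_us = RETRY_SLEEP_TIME_US;
    for (int tries = 0; status_code != 0 && tries < NUM_RETRY; ++tries) {
        status_code = pthread_create(thread, NULL, worker, NULL);
        assert(status_code == 0 || status_code == EAGAIN);
        if (status_code == EAGAIN) {
            usleep(timeout_us); /* 2 ms, 4 ms, 8 ms, ... */
            timeout_us *= 2;
        }
    }
    assert(status_code == 0 && "thread creation should eventually succeed");
}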
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2023 Amazon.com Inc. or its affiliates. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+
+enum CONSTANTS {
+    NUM_ITER = 200000,
+    NUM_RETRY = 8,
+    MAX_NUM_THREADS = 8,
+    RETRY_SLEEP_TIME_US = 4000,
+    SECOND = 1000 * 1000 * 1000
+};
+
+int threads_executed = 0;
+unsigned int threads_creation_tried = 0;
+unsigned int threads_in_use = 0;
+
+void *
+thread_func(void *arg)
+{
+    (void)(arg);
+    __atomic_fetch_add(&threads_executed, 1, __ATOMIC_RELAXED);
+    __atomic_fetch_sub(&threads_in_use, 1, __ATOMIC_SEQ_CST);
+    return NULL;
+}
+
+void
+spawn_thread(pthread_t *thread)
+{
+    int status_code = -1;
+    int timeout_us = RETRY_SLEEP_TIME_US;
+    for (int tries = 0; status_code != 0 && tries < NUM_RETRY; ++tries) {
+        status_code = pthread_create(thread, NULL, &thread_func, NULL);
+        __atomic_fetch_add(&threads_creation_tried, 1, __ATOMIC_RELAXED);
+
+        assert(status_code == 0 || status_code == EAGAIN);
+        if (status_code == EAGAIN) {
+            usleep(timeout_us);
+            timeout_us *= 2;
+        }
+    }
+
+    assert(status_code == 0 && "Thread creation should succeed");
+}
+
+int
+main(int argc, char **argv)
+{
+    double percentage = 0.1;
+
+    for (int iter = 0; iter < NUM_ITER; ++iter) {
+        if (iter > NUM_ITER * percentage) {
+            fprintf(stderr, "Spawning stress test is %d%% finished\n",
+                    (unsigned int)(percentage * 100));
+            percentage += 0.1;
+        }
+        while (__atomic_load_n(&threads_in_use, __ATOMIC_SEQ_CST)
+               == MAX_NUM_THREADS) {
+            usleep(100);
+        }
+
+        __atomic_fetch_add(&threads_in_use, 1, __ATOMIC_SEQ_CST);
+        pthread_t tmp;
+        spawn_thread(&tmp);
+        pthread_detach(tmp);
+    }
+
+    while ((__atomic_load_n(&threads_in_use, __ATOMIC_SEQ_CST) != 0)) {
+        __builtin_wasm_memory_atomic_wait32(&threads_in_use, 0, SECOND);
+    }
+
+    assert(__atomic_load_n(&threads_in_use, __ATOMIC_SEQ_CST) == 0);
+
+    // Validation
+    assert(threads_creation_tried >= threads_executed
+           && "Test executed more threads than were created");
+    assert((1. * threads_creation_tried) / threads_executed < 2.5
+           && "Ensuring that we're retrying thread creation less than 2.5 "
+              "times on average ");
+
+    fprintf(stderr,
+            "Spawning stress test finished successfully executed %d threads "
+            "with retry ratio %f\n",
+            threads_creation_tried,
+            (1. * threads_creation_tried) / threads_executed);
+    return 0;
+}
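A note on the shutdown loop in this new test: __builtin_wasm_memory_atomic_wait32 is Clang's intrinsic for the memory.atomic.wait32 instruction from the Wasm threads proposal. It blocks until the 32-bit value at the address no longer equals the expected value, a notify wakes the location, or the timeout elapses; the timeout is expressed in nanoseconds, which is why the test defines SECOND as 1000 * 1000 * 1000. A sketch of the wait idiom in isolation (assumes clang targeting wasm32-wasi-threads with atomics enabled; per the proposal the intrinsic returns 0 when woken, 1 when the value already differed, 2 on timeout):

#include <stdint.h>

/* Block until *addr != expected, re-checking after every wakeup
 * or one-second timeout, as the test's main() does. */
static void
wait_until_changed(int *addr, int expected)
{
    const int64_t second_ns = 1000LL * 1000 * 1000;
    while (__atomic_load_n(addr, __ATOMIC_SEQ_CST) == expected) {
        __builtin_wasm_memory_atomic_wait32(addr, expected, second_ns);
    }
}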
@@ -21,7 +21,8 @@ tid_allocator_init(TidAllocator *tid_allocator)
         return false;

     for (int64 i = tid_allocator->pos - 1; i >= 0; i--)
-        tid_allocator->ids[i] = TID_MIN + (tid_allocator->pos - 1 - i);
+        tid_allocator->ids[i] =
+            (uint32)(TID_MIN + (tid_allocator->pos - 1 - i));

     return true;
 }
@@ -54,7 +55,8 @@ tid_allocator_get_tid(TidAllocator *tid_allocator)
         LOG_ERROR("Overflow detected during realloc");
         return -1;
     }
-    int32 *tmp = wasm_runtime_realloc(tid_allocator->ids, realloc_size);
+    int32 *tmp =
+        wasm_runtime_realloc(tid_allocator->ids, (uint32)realloc_size);
     if (tmp == NULL) {
         LOG_ERROR("Thread ID allocator realloc failed");
         return -1;
@@ -64,7 +66,8 @@ tid_allocator_get_tid(TidAllocator *tid_allocator)
         tid_allocator->pos = new_size - old_size;
         tid_allocator->ids = tmp;
         for (int64 i = tid_allocator->pos - 1; i >= 0; i--)
-            tid_allocator->ids[i] = TID_MIN + (tid_allocator->size - 1 - i);
+            tid_allocator->ids[i] =
+                (uint32)(TID_MIN + (tid_allocator->size - 1 - i));
     }

     // Pop available thread identifier from the stack
@@ -77,4 +80,4 @@ tid_allocator_release_tid(TidAllocator *tid_allocator, int32 thread_id)
     // Release thread identifier by pushing it into the stack
     bh_assert(tid_allocator->pos < tid_allocator->size);
     tid_allocator->ids[tid_allocator->pos++] = thread_id;
 }
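For context on these hunks: the allocator keeps free thread IDs in a LIFO stack, where ids holds the free IDs and pos is the number currently stacked; getting an ID pops the stack and releasing one pushes it back. A simplified fixed-capacity sketch of that idea (hypothetical names; the grow-and-refill path the casts above belong to is omitted):

#include <assert.h>
#include <stdint.h>

#define TID_MIN 1
#define CAPACITY 16

typedef struct {
    int32_t ids[CAPACITY]; /* stack of free thread IDs */
    int64_t pos;           /* number of IDs currently on the stack */
} TidPool;

static void
tid_pool_init(TidPool *pool)
{
    pool->pos = CAPACITY;
    /* Store the smallest IDs at the top so they are handed out first. */
    for (int64_t i = pool->pos - 1; i >= 0; i--)
        pool->ids[i] = (int32_t)(TID_MIN + (pool->pos - 1 - i));
}

static int32_t
tid_pool_get(TidPool *pool)
{
    if (pool->pos == 0)
        return -1; /* exhausted; the real allocator reallocs and refills */
    return pool->ids[--pool->pos];
}

static void
tid_pool_release(TidPool *pool, int32_t tid)
{
    assert(pool->pos < CAPACITY);
    pool->ids[pool->pos++] = tid;
}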
@@ -746,10 +746,10 @@ wasm_cluster_dup_c_api_imports(WASMModuleInstanceCommon *module_inst_dst,

 #if WASM_ENABLE_INTERP != 0
     if (module_inst_src->module_type == Wasm_Module_Bytecode) {
-        new_c_api_func_imports =
-            &(((WASMModuleInstance *)module_inst_dst)->e->c_api_func_imports);
+        new_c_api_func_imports = &(((WASMModuleInstance *)module_inst_dst)
+                                       ->e->common.c_api_func_imports);
         c_api_func_imports = ((const WASMModuleInstance *)module_inst_src)
-                                 ->e->c_api_func_imports;
+                                 ->e->common.c_api_func_imports;
         import_func_count =
             ((WASMModule *)(((const WASMModuleInstance *)module_inst_src)
                                 ->module))
@@ -760,10 +760,10 @@ wasm_cluster_dup_c_api_imports(WASMModuleInstanceCommon *module_inst_dst,
     if (module_inst_src->module_type == Wasm_Module_AoT) {
         AOTModuleInstanceExtra *e =
             (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst_dst)->e;
-        new_c_api_func_imports = &(e->c_api_func_imports);
+        new_c_api_func_imports = &(e->common.c_api_func_imports);

         e = (AOTModuleInstanceExtra *)((AOTModuleInstance *)module_inst_src)->e;
-        c_api_func_imports = e->c_api_func_imports;
+        c_api_func_imports = e->common.c_api_func_imports;

         import_func_count =
             ((AOTModule *)(((AOTModuleInstance *)module_inst_src)->module))
core/iwasm/libraries/wasi-nn/.gitignore (vendored, new file)
@@ -0,0 +1,2 @@
+**/*.wasm
+**/*.tflite
@@ -25,6 +25,7 @@ Build the runtime image for your execution target type.

 * `cpu`
 * `nvidia-gpu`
 * `vx-delegate`
+* `tpu`

 ```
 EXECUTION_TYPE=cpu
@@ -64,6 +65,8 @@ docker run \
 ```

 * (NVIDIA) GPU
+  * Requirements:
+    * [NVIDIA docker](https://github.com/NVIDIA/nvidia-docker).

 ```
 docker run \
@@ -76,25 +79,36 @@ docker run \
     /assets/test_tensorflow.wasm
 ```

-* vx-delegate for NPU (x86 simulater)
+* vx-delegate for NPU (x86 simulator)

 ```
 docker run \
-    -v $PWD/core/iwasm/libraries/wasi-nn/test:/assets wasi-nn-vx-delegate \
-    --dir=/assets \
+    -v $PWD/core/iwasm/libraries/wasi-nn/test:/assets \
+    wasi-nn-vx-delegate \
+    --dir=/ \
     --env="TARGET=gpu" \
-    /assets/test_tensorflow.wasm
+    /assets/test_tensorflow_quantized.wasm
 ```

+* (Coral) TPU
+  * Requirements:
+    * [Coral USB](https://coral.ai/products/accelerator/).
-
-Requirements:
-* [NVIDIA docker](https://github.com/NVIDIA/nvidia-docker).
+
+```
+docker run \
+    --privileged \
+    --device=/dev/bus/usb:/dev/bus/usb \
+    -v $PWD/core/iwasm/libraries/wasi-nn/test:/assets \
+    wasi-nn-tpu \
+    --dir=/ \
+    --env="TARGET=tpu" \
+    /assets/test_tensorflow_quantized.wasm
+```

 ## What is missing

 Supported:

 * Graph encoding: `tensorflowlite`.
-* Execution target: `cpu` and `gpu`.
+* Execution target: `cpu`, `gpu` and `tpu`.
 * Tensor type: `fp32`.
@@ -18,12 +18,16 @@ if(NOT EXISTS ${TENSORFLOW_LITE})

     set(TENSORFLOW_SOURCE_DIR "${WAMR_ROOT_DIR}/core/deps/tensorflow-src")

-    if(WASI_NN_ENABLE_GPU EQUAL 1)
+    if(WAMR_BUILD_WASI_NN_ENABLE_GPU EQUAL 1)
         # Tensorflow specific:
         # * https://www.tensorflow.org/lite/guide/build_cmake#available_options_to_build_tensorflow_lite
         set (TFLITE_ENABLE_GPU ON)
     endif()

+    if (CMAKE_SIZEOF_VOID_P EQUAL 4)
+        set (TFLITE_ENABLE_XNNPACK OFF)
+    endif()
+
     add_subdirectory(
         "${TENSORFLOW_SOURCE_DIR}/tensorflow/lite"
         "${CMAKE_CURRENT_BINARY_DIR}/tensorflow-lite"
@@ -16,11 +16,11 @@
 #include <tensorflow/lite/optional_debug_tools.h>
 #include <tensorflow/lite/error_reporter.h>

-#if defined(WASI_NN_ENABLE_GPU)
+#if WASM_ENABLE_WASI_NN_GPU != 0
 #include <tensorflow/lite/delegates/gpu/delegate.h>
 #endif

-#if defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+#if WASM_ENABLE_WASI_NN_EXTERNAL_DELEGATE != 0
 #include <tensorflow/lite/delegates/external/external_delegate.h>
 #endif
@@ -130,8 +130,8 @@ tensorflowlite_load(void *tflite_ctx, graph_builder_array *builder,
         return invalid_argument;
     }

-    if (target != cpu && target != gpu) {
-        NN_ERR_PRINTF("Only CPU and GPU target is supported.");
+    if (target != cpu && target != gpu && target != tpu) {
+        NN_ERR_PRINTF("Only CPU, GPU and TPU target is supported.");
         return invalid_argument;
     }
@@ -195,7 +195,7 @@ tensorflowlite_init_execution_context(void *tflite_ctx, graph g,
     switch (tfl_ctx->models[g].target) {
         case gpu:
         {
-#if defined(WASI_NN_ENABLE_GPU)
+#if WASM_ENABLE_WASI_NN_GPU != 0
             NN_WARN_PRINTF("GPU enabled.");
             // https://www.tensorflow.org/lite/performance/gpu
             TfLiteGpuDelegateOptionsV2 options =
@@ -216,10 +216,19 @@ tensorflowlite_init_execution_context(void *tflite_ctx, graph g,
                 NN_ERR_PRINTF("Error when enabling GPU delegate.");
                 use_default = true;
             }
-#elif defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
+#else
+            NN_WARN_PRINTF("GPU not enabled.");
+            use_default = true;
+#endif
+            break;
+        }
+        case tpu:
+        {
+#if WASM_ENABLE_WASI_NN_EXTERNAL_DELEGATE != 0
             NN_WARN_PRINTF("external delegation enabled.");
             TfLiteExternalDelegateOptions options =
-                TfLiteExternalDelegateOptionsDefault(WASI_NN_EXT_DELEGATE_PATH);
+                TfLiteExternalDelegateOptionsDefault(
+                    WASM_WASI_NN_EXTERNAL_DELEGATE_PATH);
             tfl_ctx->delegate = TfLiteExternalDelegateCreate(&options);
             if (tfl_ctx->delegate == NULL) {
                 NN_ERR_PRINTF("Error when generating External delegate.");
@@ -233,7 +242,7 @@ tensorflowlite_init_execution_context(void *tflite_ctx, graph g,
                 use_default = true;
             }
 #else
-            NN_WARN_PRINTF("GPU not enabled.");
+            NN_WARN_PRINTF("External delegate not enabled.");
             use_default = true;
 #endif
             break;
@@ -285,14 +294,37 @@ tensorflowlite_set_input(void *tflite_ctx, graph_execution_context ctx,
         return invalid_argument;
     }

-    auto *input =
-        tfl_ctx->interpreters[ctx].interpreter->typed_input_tensor<float>(
-            index);
-    if (input == NULL)
-        return missing_memory;
+    if (tensor->quantization.type == kTfLiteNoQuantization) {
+        NN_DBG_PRINTF("No quantization information. Using float as default");
+        float *it =
+            tfl_ctx->interpreters[ctx].interpreter->typed_input_tensor<float>(
+                index);
+
+        int size = model_tensor_size * sizeof(float);
+        bh_memcpy_s(it, size, input_tensor->data, size);
+    }
+    else { // TODO: Assumming uint8 quantized networks.
+        TfLiteAffineQuantization *quant_info =
+            (TfLiteAffineQuantization *)tensor->quantization.params;
+        if (quant_info->scale->size != 1 || quant_info->zero_point->size != 1) {
+            NN_ERR_PRINTF("Quantization per channel is not supported");
+            return runtime_error;
+        }
+        uint8_t *it =
+            tfl_ctx->interpreters[ctx].interpreter->typed_input_tensor<uint8_t>(
+                index);
+
+        float scale = quant_info->scale->data[0];
+        float zero_point = (float)quant_info->zero_point->data[0];
+        NN_DBG_PRINTF("input tensor: (scale, offset) = (%f, %f)", scale,
+                      zero_point);
+
+        float *input_tensor_f = (float *)input_tensor->data;
+        for (uint32_t i = 0; i < model_tensor_size; ++i) {
+            it[i] = (uint8_t)(input_tensor_f[i] / scale + zero_point);
+        }
+    }

-    bh_memcpy_s(input, model_tensor_size * sizeof(float), input_tensor->data,
-                model_tensor_size * sizeof(float));
     return success;
 }
@@ -325,6 +357,7 @@ tensorflowlite_get_output(void *tflite_ctx, graph_execution_context ctx,
+    NN_DBG_PRINTF("Number of tensors (%d)", num_output_tensors);

     if (index + 1 > num_output_tensors) {
         NN_ERR_PRINTF("Index %d is invalid.", index);
         return runtime_error;
     }
@@ -343,15 +376,37 @@ tensorflowlite_get_output(void *tflite_ctx, graph_execution_context ctx,
         return missing_memory;
     }

-    float *tensor_f =
-        tfl_ctx->interpreters[ctx].interpreter->typed_output_tensor<float>(
-            index);
-    for (uint32_t i = 0; i < model_tensor_size; ++i)
-        NN_DBG_PRINTF("output: %f", tensor_f[i]);
+    if (tensor->quantization.type == kTfLiteNoQuantization) {
+        NN_DBG_PRINTF("No quantization information");
+        float *ot =
+            tfl_ctx->interpreters[ctx].interpreter->typed_output_tensor<float>(
+                index);
+
+        int size = model_tensor_size * sizeof(float);
+        bh_memcpy_s(output_tensor, size, ot, size);
+    }
+    else { // TODO: Assumming uint8 quantized networks.
+        TfLiteAffineQuantization *quant_info =
+            (TfLiteAffineQuantization *)tensor->quantization.params;
+        if (quant_info->scale->size != 1 || quant_info->zero_point->size != 1) {
+            NN_ERR_PRINTF("Quantization per channel is not supported");
+            return runtime_error;
+        }
+        uint8_t *ot = tfl_ctx->interpreters[ctx]
+                          .interpreter->typed_output_tensor<uint8_t>(index);
+
+        float scale = quant_info->scale->data[0];
+        float zero_point = (float)quant_info->zero_point->data[0];
+        NN_DBG_PRINTF("output tensor: (scale, offset) = (%f, %f)", scale,
+                      zero_point);
+
+        float *output_tensor_f = (float *)output_tensor;
+        for (uint32_t i = 0; i < model_tensor_size; ++i) {
+            output_tensor_f[i] = (ot[i] - zero_point) * scale;
+        }
+    }

     *output_tensor_size = model_tensor_size;
-    bh_memcpy_s(output_tensor, model_tensor_size * sizeof(float), tensor_f,
-                model_tensor_size * sizeof(float));
     return success;
 }
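The two branches added above implement standard affine quantization: a quantized byte q represents the real value (q - zero_point) * scale, so inputs are encoded as q = real / scale + zero_point and outputs decoded with the inverse, matching the loops in set_input and get_output. A tiny self-contained round trip with hypothetical helper names and made-up parameters:

#include <stdint.h>
#include <stdio.h>

/* Affine quantization: real is approximately (q - zero_point) * scale. */
static uint8_t
quantize(float real, float scale, float zero_point)
{
    return (uint8_t)(real / scale + zero_point); /* truncates, as the diff does */
}

static float
dequantize(uint8_t q, float scale, float zero_point)
{
    return ((float)q - zero_point) * scale;
}

int
main(void)
{
    float scale = 0.5f, zero_point = 10.0f;
    uint8_t q = quantize(2.0f, scale, zero_point); /* 2.0 / 0.5 + 10 = 14 */
    float r = dequantize(q, scale, zero_point);    /* (14 - 10) * 0.5 = 2.0 */
    printf("2.0 -> %u -> %.1f\n", q, r);
    return 0;
}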
@@ -392,19 +447,35 @@ tensorflowlite_destroy(void *tflite_ctx)
     */
     TFLiteContext *tfl_ctx = (TFLiteContext *)tflite_ctx;

-    if (tfl_ctx->delegate != NULL) {
-#if defined(WASI_NN_ENABLE_GPU)
-        TfLiteGpuDelegateV2Delete(tfl_ctx->delegate);
-#elif defined(WASI_NN_ENABLE_EXTERNAL_DELEGATE)
-        TfLiteExternalDelegateDelete(tfl_ctx->delegate);
-#endif
-    }
-
     NN_DBG_PRINTF("Freeing memory.");
     for (int i = 0; i < MAX_GRAPHS_PER_INST; ++i) {
         tfl_ctx->models[i].model.reset();
-        if (tfl_ctx->models[i].model_pointer)
+        if (tfl_ctx->models[i].model_pointer) {
+            if (tfl_ctx->delegate) {
+                switch (tfl_ctx->models[i].target) {
+                    case gpu:
+                    {
+#if WASM_ENABLE_WASI_NN_GPU != 0
+                        TfLiteGpuDelegateV2Delete(tfl_ctx->delegate);
+#else
+                        NN_ERR_PRINTF("GPU delegate delete but not enabled.");
+#endif
+                        break;
+                    }
+                    case tpu:
+                    {
+#if WASM_ENABLE_WASI_NN_EXTERNAL_DELEGATE != 0
+                        TfLiteExternalDelegateDelete(tfl_ctx->delegate);
+#else
+                        NN_ERR_PRINTF(
+                            "External delegate delete but not enabled.");
+#endif
+                        break;
+                    }
+                }
+            }
             wasm_runtime_free(tfl_ctx->models[i].model_pointer);
+        }
         tfl_ctx->models[i].model_pointer = NULL;
     }
     for (int i = 0; i < MAX_GRAPH_EXEC_CONTEXTS_PER_INST; ++i) {
@@ -30,7 +30,6 @@ RUN make -j "$(grep -c ^processor /proc/cpuinfo)"

 FROM ubuntu:22.04

-COPY --from=base /home/wamr/product-mini/platforms/linux/build/libvmlib.so /libvmlib.so
 COPY --from=base /home/wamr/product-mini/platforms/linux/build/iwasm /iwasm

 ENTRYPOINT [ "/iwasm" ]
@@ -24,7 +24,7 @@ RUN apt-get install -y wget ca-certificates --no-install-recommends \

 RUN cmake \
     -DWAMR_BUILD_WASI_NN=1 \
-    -DWASI_NN_ENABLE_GPU=1 \
+    -DWAMR_BUILD_WASI_NN_ENABLE_GPU=1 \
     ..

 RUN make -j "$(grep -c ^processor /proc/cpuinfo)"
@@ -44,7 +44,6 @@ RUN mkdir -p /etc/OpenCL/vendors && \
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

-COPY --from=base /home/wamr/product-mini/platforms/linux/build/libvmlib.so /libvmlib.so
 COPY --from=base /home/wamr/product-mini/platforms/linux/build/iwasm /iwasm

 ENTRYPOINT [ "/iwasm" ]
core/iwasm/libraries/wasi-nn/test/Dockerfile.tpu (new file)
@@ -0,0 +1,37 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+FROM ubuntu:20.04 AS base
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# hadolint ignore=DL3008
+RUN apt-get update && apt-get install -y \
+    cmake build-essential git curl gnupg --no-install-recommends && \
+    rm -rf /var/lib/apt/lists/*
+
+# hadolint ignore=DL3008,DL4006
+RUN echo "deb https://packages.cloud.google.com/apt coral-edgetpu-stable main" | tee /etc/apt/sources.list.d/coral-edgetpu.list && \
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \
+    apt-get update && apt-get install -y libedgetpu1-std --no-install-recommends && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /home/wamr
+
+COPY . .
+
+WORKDIR /home/wamr/product-mini/platforms/linux/build
+
+RUN cmake \
+    -DWAMR_BUILD_WASI_NN=1 \
+    -DWAMR_BUILD_WASI_NN_ENABLE_EXTERNAL_DELEGATE=1 \
+    -DWAMR_BUILD_WASI_NN_EXTERNAL_DELEGATE_PATH="libedgetpu.so.1.0" \
+    -DWAMR_BUILD_WASI_NN_ENABLE_GPU=1 \
+    ..
+
+RUN make -j "$(grep -c ^processor /proc/cpuinfo)" && \
+    cp /home/wamr/product-mini/platforms/linux/build/iwasm /iwasm
+
+WORKDIR /assets
+
+ENTRYPOINT [ "/iwasm" ]
@@ -1,6 +1,10 @@
 #!/bin/sh

+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+CURR_PATH=$(cd $(dirname $0) && pwd -P)

 # WASM application that uses WASI-NN

 /opt/wasi-sdk/bin/clang \
@@ -13,9 +17,25 @@

 # TFLite models to use in the tests

-cd models
+cd ${CURR_PATH}/models
 python3 average.py
 python3 max.py
 python3 mult_dimension.py
 python3 mult_outputs.py
 python3 sum.py

+# Specific tests for TPU
+
+cd ${CURR_PATH}
+/opt/wasi-sdk/bin/clang \
+    -Wl,--allow-undefined \
+    -Wl,--strip-all,--no-entry \
+    --sysroot=/opt/wasi-sdk/share/wasi-sysroot \
+    -I../include -I../src/utils \
+    -o test_tensorflow_quantized.wasm \
+    test_tensorflow_quantized.c utils.c
+
+cd ${CURR_PATH}/models
+python3 quantized.py
+
+cd ${CURR_PATH}
core/iwasm/libraries/wasi-nn/test/models/quantized.py (new file)
@@ -0,0 +1,30 @@
+# Copyright (C) 2019 Intel Corporation. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import tensorflow as tf
+import numpy as np
+import pathlib
+
+model = tf.keras.Sequential([
+    tf.keras.layers.InputLayer(input_shape=[5, 5, 1]),
+    tf.keras.layers.AveragePooling2D(
+        pool_size=(5, 5), strides=None, padding="valid", data_format=None)
+])
+
+def representative_dataset():
+    for _ in range(1000):
+        data = np.random.randint(0, 25, (1, 5, 5, 1))
+        yield [data.astype(np.float32)]
+
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.representative_dataset = representative_dataset
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.uint8  # or tf.int8
+converter.inference_output_type = tf.uint8  # or tf.int8
+tflite_model = converter.convert()
+
+tflite_models_dir = pathlib.Path("./")
+tflite_model_file = tflite_models_dir / "quantized_model.tflite"
+tflite_model_file.write_bytes(tflite_model)
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <math.h>
+
+#include "utils.h"
+#include "logger.h"
+
+#undef EPSILON
+#define EPSILON 1e-2
+
+void
+test_average_quantized(execution_target target)
+{
+    int dims[] = { 1, 5, 5, 1 };
+    input_info input = create_input(dims);
+
+    uint32_t output_size = 0;
+    float *output =
+        run_inference(target, input.input_tensor, input.dim, &output_size,
+                      "./models/quantized_model.tflite", 1);
+
+    NN_INFO_PRINTF("Output size: %d", output_size);
+    NN_INFO_PRINTF("Result: average is %f", output[0]);
+    // NOTE: 11.95 instead of 12 because of errors due quantization
+    assert(fabs(output[0] - 11.95) < EPSILON);
+
+    free(input.dim);
+    free(input.input_tensor);
+    free(output);
+}
+
+int
+main()
+{
+    char *env = getenv("TARGET");
+    if (env == NULL) {
+        NN_INFO_PRINTF("Usage:\n--env=\"TARGET=[cpu|gpu|tpu]\"");
+        return 1;
+    }
+    execution_target target;
+    if (strcmp(env, "cpu") == 0)
+        target = cpu;
+    else if (strcmp(env, "gpu") == 0)
+        target = gpu;
+    else if (strcmp(env, "tpu") == 0)
+        target = tpu;
+    else {
+        NN_ERR_PRINTF("Wrong target!");
+        return 1;
+    }
+    NN_INFO_PRINTF("################### Testing quantized model...");
+    test_average_quantized(target);
+
+    NN_INFO_PRINTF("Tests: passed!");
+    return 0;
+}
@@ -132,8 +132,8 @@ run_inference(execution_target target, float *input, uint32_t *input_size,
         *output_size = MAX_OUTPUT_TENSOR_SIZE - *output_size;
         if (wasm_get_output(ctx, i, &out_tensor[offset], output_size)
             != success) {
-            NN_ERR_PRINTF("Error when getting output.");
-            exit(1);
+            NN_ERR_PRINTF("Error when getting index %d.", i);
+            break;
         }

         offset += *output_size;
@@ -11,7 +11,7 @@
 #include "wasi_nn.h"

 #define MAX_MODEL_SIZE 85000000
-#define MAX_OUTPUT_TENSOR_SIZE 200
+#define MAX_OUTPUT_TENSOR_SIZE 1000000
 #define INPUT_TENSOR_DIMS 4
 #define EPSILON 1e-8