Add wasm tacle-bench targets

2026-06-12 20:06:22 +02:00
parent 30daa8a00c
commit 08c2e9c13d
1122 changed files with 520422 additions and 0 deletions
--- a/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wasm
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wasm
--- a/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wat
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wat
@ -0,0 +1,679 @@
+(module $jfdctint.wasm
+  (type (;0;) (func (param i32 i32)))
+  (type (;1;) (func))
+  (type (;2;) (func (result i32)))
+  (import "__pragma" "loopbound" (func $__pragma_loopbound (type 0)))
+  (func $__wasm_apply_data_relocs (type 1))  ;; empty body; exported under the same name below
+  (func $jfdctint_return (type 2) (result i32)
+    i32.const 64
+    i32.const 64
+    call $__pragma_loopbound
+    i32.const -1
+    i32.const 0
+    i32.const 0
+    i32.load offset=1276
+    i32.const 0
+    i32.load offset=1272
+    i32.const 0
+    i32.load offset=1268
+    i32.const 0
+    i32.load offset=1264
+    i32.const 0
+    i32.load offset=1260
+    i32.const 0
+    i32.load offset=1256
+    i32.const 0
+    i32.load offset=1252
+    i32.const 0
+    i32.load offset=1248
+    i32.const 0
+    i32.load offset=1244
+    i32.const 0
+    i32.load offset=1240
+    i32.const 0
+    i32.load offset=1236
+    i32.const 0
+    i32.load offset=1232
+    i32.const 0
+    i32.load offset=1228
+    i32.const 0
+    i32.load offset=1224
+    i32.const 0
+    i32.load offset=1220
+    i32.const 0
+    i32.load offset=1216
+    i32.const 0
+    i32.load offset=1212
+    i32.const 0
+    i32.load offset=1208
+    i32.const 0
+    i32.load offset=1204
+    i32.const 0
+    i32.load offset=1200
+    i32.const 0
+    i32.load offset=1196
+    i32.const 0
+    i32.load offset=1192
+    i32.const 0
+    i32.load offset=1188
+    i32.const 0
+    i32.load offset=1184
+    i32.const 0
+    i32.load offset=1180
+    i32.const 0
+    i32.load offset=1176
+    i32.const 0
+    i32.load offset=1172
+    i32.const 0
+    i32.load offset=1168
+    i32.const 0
+    i32.load offset=1164
+    i32.const 0
+    i32.load offset=1160
+    i32.const 0
+    i32.load offset=1156
+    i32.const 0
+    i32.load offset=1152
+    i32.const 0
+    i32.load offset=1148
+    i32.const 0
+    i32.load offset=1144
+    i32.const 0
+    i32.load offset=1140
+    i32.const 0
+    i32.load offset=1136
+    i32.const 0
+    i32.load offset=1132
+    i32.const 0
+    i32.load offset=1128
+    i32.const 0
+    i32.load offset=1124
+    i32.const 0
+    i32.load offset=1120
+    i32.const 0
+    i32.load offset=1116
+    i32.const 0
+    i32.load offset=1112
+    i32.const 0
+    i32.load offset=1108
+    i32.const 0
+    i32.load offset=1104
+    i32.const 0
+    i32.load offset=1100
+    i32.const 0
+    i32.load offset=1096
+    i32.const 0
+    i32.load offset=1092
+    i32.const 0
+    i32.load offset=1088
+    i32.const 0
+    i32.load offset=1084
+    i32.const 0
+    i32.load offset=1080
+    i32.const 0
+    i32.load offset=1076
+    i32.const 0
+    i32.load offset=1072
+    i32.const 0
+    i32.load offset=1068
+    i32.const 0
+    i32.load offset=1064
+    i32.const 0
+    i32.load offset=1060
+    i32.const 0
+    i32.load offset=1056
+    i32.const 0
+    i32.load offset=1052
+    i32.const 0
+    i32.load offset=1048
+    i32.const 0
+    i32.load offset=1044
+    i32.const 0
+    i32.load offset=1040
+    i32.const 0
+    i32.load offset=1036
+    i32.const 0
+    i32.load offset=1032
+    i32.const 0
+    i32.load offset=1028
+    i32.const 0
+    i32.load offset=1024
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.const 1668124
+    i32.ne
+    select)
+  (func $jfdctint_jpeg_fdct_islow (type 1)  ;; in-place transform of the 64 i32 words at 1024..1276: a row loop, then a column loop
+    (local i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32)
+    i32.const 8
+    i32.const 8
+    call $__pragma_loopbound  ;; loopbound(8, 8) for the first loop
+    i32.const -256
+    local.set 0  ;; local 0 = negative byte offset; steps by 32 (eight i32 words) from -256 up to 0, i.e. 8 iterations
+    loop  ;; label = @1
+      local.get 0
+      i32.const 1296
+      i32.add
+      local.tee 1
+      local.get 1
+      i32.load
+      local.tee 1
+      local.get 0
+      i32.const 1292
+      i32.add
+      local.tee 2
+      i32.load
+      local.tee 3
+      i32.add
+      local.tee 4
+      local.get 0
+      i32.const 1308
+      i32.add
+      local.tee 5
+      i32.load
+      local.tee 6
+      local.get 0
+      i32.const 1280
+      i32.add
+      local.tee 7
+      i32.load
+      local.tee 8
+      i32.add
+      local.tee 9
+      i32.add
+      local.tee 10
+      local.get 0
+      i32.const 1300
+      i32.add
+      local.tee 11
+      i32.load
+      local.tee 12
+      local.get 0
+      i32.const 1288
+      i32.add
+      local.tee 13
+      i32.load
+      local.tee 14
+      i32.add
+      local.tee 15
+      local.get 0
+      i32.const 1304
+      i32.add
+      local.tee 16
+      i32.load
+      local.tee 17
+      local.get 0
+      i32.const 1284
+      i32.add
+      local.tee 18
+      i32.load
+      local.tee 19
+      i32.add
+      local.tee 20
+      i32.add
+      local.tee 21
+      i32.sub
+      i32.const 2
+      i32.shl
+      i32.store
+      local.get 7
+      local.get 10
+      local.get 21
+      i32.add
+      i32.const 2
+      i32.shl
+      i32.store
+      local.get 5
+      local.get 3
+      local.get 1
+      i32.sub
+      local.tee 1
+      local.get 8
+      local.get 6
+      i32.sub
+      local.tee 3
+      i32.add
+      i32.const -7373
+      i32.mul
+      i32.const 1024
+      i32.add
+      local.tee 7
+      local.get 1
+      i32.const 2446
+      i32.mul
+      i32.add
+      local.get 1
+      local.get 19
+      local.get 17
+      i32.sub
+      local.tee 6
+      i32.add
+      local.tee 8
+      local.get 14
+      local.get 12
+      i32.sub
+      local.tee 1
+      local.get 3
+      i32.add
+      local.tee 10
+      i32.add
+      i32.const 9633
+      i32.mul
+      local.tee 12
+      local.get 8
+      i32.const -16069
+      i32.mul
+      i32.add
+      local.tee 8
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 16
+      local.get 9
+      local.get 4
+      i32.sub
+      local.tee 4
+      local.get 20
+      local.get 15
+      i32.sub
+      local.tee 5
+      i32.add
+      i32.const 4433
+      i32.mul
+      i32.const 1024
+      i32.add
+      local.tee 9
+      local.get 5
+      i32.const -15137
+      i32.mul
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 13
+      local.get 9
+      local.get 4
+      i32.const 6270
+      i32.mul
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 11
+      local.get 1
+      local.get 6
+      i32.add
+      i32.const -20995
+      i32.mul
+      i32.const 1024
+      i32.add
+      local.tee 4
+      local.get 1
+      i32.const 16819
+      i32.mul
+      i32.add
+      local.get 12
+      local.get 10
+      i32.const -3196
+      i32.mul
+      i32.add
+      local.tee 1
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 2
+      local.get 4
+      local.get 6
+      i32.const 25172
+      i32.mul
+      i32.add
+      local.get 8
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 18
+      local.get 7
+      local.get 3
+      i32.const 12299
+      i32.mul
+      i32.add
+      local.get 1
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 0
+      i32.const 32
+      i32.add
+      local.tee 0
+      br_if 0 (;@1;)  ;; repeat while the offset is still non-zero
+    end
+    i32.const 8
+    i32.const 8
+    call $__pragma_loopbound  ;; loopbound(8, 8) for the second loop
+    i32.const -32
+    local.set 0  ;; local 0 = negative byte offset; steps by 4 (one i32 word) from -32 up to 0, i.e. 8 iterations
+    loop  ;; label = @1
+      local.get 0
+      i32.const 1184
+      i32.add
+      local.tee 1
+      local.get 1
+      i32.load
+      local.tee 1
+      local.get 0
+      i32.const 1152
+      i32.add
+      local.tee 2
+      i32.load
+      local.tee 3
+      i32.add
+      local.tee 4
+      local.get 0
+      i32.const 1280
+      i32.add
+      local.tee 5
+      i32.load
+      local.tee 6
+      local.get 0
+      i32.const 1056
+      i32.add
+      local.tee 7
+      i32.load
+      local.tee 8
+      i32.add
+      local.tee 9
+      i32.add
+      local.tee 10
+      local.get 0
+      i32.const 1216
+      i32.add
+      local.tee 11
+      i32.load
+      local.tee 12
+      local.get 0
+      i32.const 1120
+      i32.add
+      local.tee 13
+      i32.load
+      local.tee 14
+      i32.add
+      local.tee 15
+      local.get 0
+      i32.const 1248
+      i32.add
+      local.tee 16
+      i32.load
+      local.tee 17
+      local.get 0
+      i32.const 1088
+      i32.add
+      local.tee 18
+      i32.load
+      local.tee 19
+      i32.add
+      local.tee 20
+      i32.add
+      local.tee 21
+      i32.sub
+      i32.const 2
+      i32.add
+      i32.const 2
+      i32.shr_s
+      i32.store
+      local.get 7
+      local.get 21
+      local.get 10
+      i32.add
+      i32.const 2
+      i32.add
+      i32.const 2
+      i32.shr_s
+      i32.store
+      local.get 5
+      local.get 3
+      local.get 1
+      i32.sub
+      local.tee 1
+      local.get 8
+      local.get 6
+      i32.sub
+      local.tee 3
+      i32.add
+      i32.const -7373
+      i32.mul
+      i32.const 16384
+      i32.add
+      local.tee 7
+      local.get 1
+      i32.const 2446
+      i32.mul
+      i32.add
+      local.get 1
+      local.get 19
+      local.get 17
+      i32.sub
+      local.tee 6
+      i32.add
+      local.tee 8
+      local.get 14
+      local.get 12
+      i32.sub
+      local.tee 1
+      local.get 3
+      i32.add
+      local.tee 10
+      i32.add
+      i32.const 9633
+      i32.mul
+      local.tee 12
+      local.get 8
+      i32.const -16069
+      i32.mul
+      i32.add
+      local.tee 8
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 16
+      local.get 9
+      local.get 4
+      i32.sub
+      local.tee 4
+      local.get 20
+      local.get 15
+      i32.sub
+      local.tee 5
+      i32.add
+      i32.const 4433
+      i32.mul
+      i32.const 16384
+      i32.add
+      local.tee 9
+      local.get 5
+      i32.const -15137
+      i32.mul
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 13
+      local.get 9
+      local.get 4
+      i32.const 6270
+      i32.mul
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 11
+      local.get 1
+      local.get 6
+      i32.add
+      i32.const -20995
+      i32.mul
+      i32.const 16384
+      i32.add
+      local.tee 4
+      local.get 1
+      i32.const 16819
+      i32.mul
+      i32.add
+      local.get 12
+      local.get 10
+      i32.const -3196
+      i32.mul
+      i32.add
+      local.tee 1
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 2
+      local.get 4
+      local.get 6
+      i32.const 25172
+      i32.mul
+      i32.add
+      local.get 8
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 18
+      local.get 7
+      local.get 3
+      i32.const 12299
+      i32.mul
+      i32.add
+      local.get 1
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 0
+      i32.const 4
+      i32.add
+      local.tee 0
+      br_if 0 (;@1;)  ;; repeat while the offset is still non-zero
+    end)
+  (func $jfdctint_main (type 1)  ;; exported as "entrypoint"; only forwards to the DCT routine
+    call $jfdctint_jpeg_fdct_islow)
+  (func $__original_main (type 2) (result i32)  ;; exported as "main": fills words 1024..1276, runs $jfdctint_main, returns $jfdctint_return's result
+    (local i32 i32)
+    i32.const 64
+    i32.const 64
+    call $__pragma_loopbound  ;; loopbound(64, 64). NOTE(review): the loop below stores two words per pass and runs 32 times - confirm the analysis accounts for this
+    i32.const 1
+    local.set 0  ;; local 0 = running seed, starts at 1
+    i32.const -256
+    local.set 1  ;; local 1 = negative byte offset; steps by 8 from -256 up to 0
+    loop  ;; label = @1
+      local.get 1
+      i32.const 1280
+      i32.add
+      local.get 0
+      i32.const 133
+      i32.mul
+      i32.const 81
+      i32.add
+      i32.const 65535
+      i32.rem_s
+      local.tee 0  ;; seed = (seed * 133 + 81) rem 65535
+      i32.store
+      local.get 1
+      i32.const 1284
+      i32.add
+      local.get 0
+      i32.const 133
+      i32.mul
+      i32.const 81
+      i32.add
+      i32.const 65535
+      i32.rem_s
+      local.tee 0  ;; same update again, stored to the next word
+      i32.store
+      local.get 1
+      i32.const 8
+      i32.add
+      local.tee 1
+      br_if 0 (;@1;)  ;; repeat while the offset is still non-zero
+    end
+    call $jfdctint_main
+    call $jfdctint_return)
+  (table (;0;) 1 1 funcref)
+  (memory (;0;) 1)
+  (global $__stack_pointer (mut i32) (i32.const 5376))
+  (global (;1;) i32 (i32.const 1280))
+  (global (;2;) i32 (i32.const 5376))
+  (export "memory" (memory 0))
+  (export "__wasm_apply_data_relocs" (func $__wasm_apply_data_relocs))
+  (export "entrypoint" (func $jfdctint_main))
+  (export "main" (func $__original_main))
+  (export "__data_end" (global 1))
+  (export "__heap_base" (global 2)))
--- a/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/default/jfdctint.c
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/default/jfdctint.c
@ -0,0 +1,314 @@
+/*
+
+  This program is part of the TACLeBench benchmark suite.
+  Version V 1.x
+
+  Name: jfdctint
+
+  Author: Thomas G. Lane, Public domain JPEG source code.
+          Modified by Steven Li at Princeton University.
+
+  Function: JPEG slow-but-accurate integer implementation of the
+            forward DCT (Discrete Cosine Transform) on an 8x8
+            pixel block [from the original file documentation]
+
+   Copyright (C) 1991-1994, Thomas G. Lane.
+   This file is part of the Independent JPEG Group's software.
+   For conditions of distribution and use, see the accompanying README file.
+
+   This file contains a slow-but-accurate integer implementation of the
+   forward DCT (Discrete Cosine Transform).
+
+   A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+   on each column.  Direct algorithms are also available, but they are
+   much more complex and seem not to be any faster when reduced to code.
+
+   This implementation is based on an algorithm described in
+     C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+     Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+     Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+   The primary algorithm described there uses 11 multiplies and 29 adds.
+   We use their alternate method with 12 multiplies and 32 adds.
+   The advantage of this method is that no data path contains more than one
+   multiplication; this allows a very simple and accurate implementation in
+   scaled fixed-point arithmetic, with a minimal number of shifts.
+
+  Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
+          Collected and Modified by S.-S. Lim
+          Real-Time Research Group
+          Seoul National University
+
+  Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
+           added checksum calculation in jfdctint_return()
+
+  License: see README
+
+*/
+
+/*  COMMENTS: Long calculation sequences (i.e., long basic blocks),      */
+/*            single-nested loops.                                       */
+
+/**********************************************************************
+    Functions to be timed
+***********************************************************************/
+
+/* These definitions were added by Steven Li so as to bypass the header
+   files.
+*/
+
+// Wasm loop bounds: function imported from module "__pragma" under the
+// name "loopbound".  Every call in this file is placed directly before a
+// loop and passes that loop's iteration count as both min_bound and
+// max_bound.
+
+__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
+__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);
+
+#define DCTSIZE       8
+#define DESCALE(x, n) (((x) + (((int) 1) << ((n) - 1))) >> (n))
+
+/*
+   The poop on this scaling stuff is as follows:
+
+   Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+   larger than the true DCT outputs.  The final outputs are therefore
+   a factor of N larger than desired; since N=8 this can be cured by
+   a simple right shift at the end of the algorithm.  The advantage of
+   this arrangement is that we save two multiplications per 1-D DCT,
+   because the y0 and y4 outputs need not be divided by sqrt(N).
+   In the IJG code, this factor of 8 is removed by the quantization step
+   (in jcdctmgr.c), NOT in this module.
+
+   We have to do addition and subtraction of the integer inputs, which
+   is no problem, and multiplication by fractional constants, which is
+   a problem to do in integer arithmetic.  We multiply all the constants
+   by CONST_SCALE and convert them to integer constants (thus retaining
+   CONST_BITS (13) bits of precision in the constants).  After doing a
+   multiplication we have to divide the product by CONST_SCALE, with proper
+   rounding, to produce the correct output.  This division can be done
+   cheaply as a right shift of CONST_BITS (13) bits.  We postpone shifting
+   as long as possible so that partial sums can be added together with
+   full fractional precision.
+
+   The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
+   they are represented to better-than-integral precision.  These outputs
+   require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
+   16-bit word with the recommended scaling.  (For 12-bit sample data, the
+   intermediate array is int anyway.)
+
+   To avoid overflow of the 32-bit intermediate results in pass 2, we must
+   have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
+   Error analysis shows that the values given below are the most effective.
+*/
+
+/*
+  Forward declaration of functions
+
+  All of them take no arguments; they are declared with (void) so that
+  each declaration is a real prototype and matches the definitions of
+  jfdctint_main(void) and main(void) below.
+*/
+
+void jfdctint_init(void);
+int jfdctint_return(void);
+__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
+jfdctint_main(void);
+__attribute__((noinline)) __attribute__((export_name("main"))) int main(void);
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+   causing a lot of useless floating-point operations at run time.
+   To get around this we use the following pre-calculated constants.
+   If you change CONST_BITS you may want to add appropriate values.
+   (With a reasonable C compiler, you can just rely on the FIX() macro...)
+*/
+
+#define FIX_0_298631336 ((int) 2446)  /* FIX(0.298631336) */
+#define FIX_0_390180644 ((int) 3196)  /* FIX(0.390180644) */
+#define FIX_0_541196100 ((int) 4433)  /* FIX(0.541196100) */
+#define FIX_0_765366865 ((int) 6270)  /* FIX(0.765366865) */
+#define FIX_0_899976223 ((int) 7373)  /* FIX(0.899976223) */
+#define FIX_1_175875602 ((int) 9633)  /* FIX(1.175875602) */
+#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */
+
+/* Multiply an int variable by an int constant to yield an int result.
+   For 8-bit samples with the recommended scaling, all the variable
+   and constant values involved are no more than 16 bits wide, so a
+   16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+   For 12-bit samples, a full 32-bit multiplication will be needed.
+*/
+
+int jfdctint_data[64];
+
+const int jfdctint_CHECKSUM = 1668124;
+
+/*
+   Fill jfdctint_data with the benchmark's input block.
+
+   Starting from seed = 1, each of the 64 entries receives the next value
+   of the sequence seed = (seed * 133 + 81) % 65535, so the input is the
+   same on every run.  Takes no arguments and returns nothing.
+*/
+void
+jfdctint_init(void) {
+    int i, seed;
+
+    /* Worst case settings */
+    /* Set array to random values */
+    seed = 1;
+
+    /* The loop-bound call must stay directly in front of the loop. */
+    __pragma_loopbound(64, 64);
+    for (i = 0; i < 64; i++) {
+        seed = ((seed * 133) + 81) % 65535;
+        jfdctint_data[i] = seed;
+    }
+}
+
+/*
+   Check the benchmark result.
+
+   Adds up all 64 entries of jfdctint_data and compares the sum with
+   jfdctint_CHECKSUM.
+
+   Returns 0 if the sum equals the checksum, -1 otherwise.
+*/
+int
+jfdctint_return(void) {
+    int checksum = 0;
+    int i;
+    /* The loop-bound call must stay directly in front of the loop. */
+    __pragma_loopbound(64, 64);
+    for (i = 0; i < 64; ++i) {
+        checksum += jfdctint_data[i];
+    }
+    return ((checksum == jfdctint_CHECKSUM) ? 0 : -1);
+}
+
+/*
+   Perform the forward DCT on one block of samples.
+
+   Works in place on the global jfdctint_data array (64 ints), which is
+   treated as DCTSIZE x DCTSIZE (8x8) values stored row by row.  Takes no
+   arguments and returns nothing.
+
+   Pass 1 handles one row per iteration (dataptr advances by DCTSIZE).
+   Sums are shifted left by PASS1_BITS and products are descaled by
+   CONST_BITS - PASS1_BITS, so its outputs carry an extra factor of
+   2**PASS1_BITS.
+
+   Pass 2 handles one column per iteration (elements are DCTSIZE apart,
+   dataptr advances by one).  It descales by PASS1_BITS, or by
+   CONST_BITS + PASS1_BITS for products, which removes the pass-1 factor.
+
+   Each __pragma_loopbound(8, 8) call sits directly in front of a loop
+   that runs DCTSIZE (8) times.
+
+   NOTE(review): << and the >> inside DESCALE are applied to signed ints
+   that can be negative.  ISO C leaves left shifts of negative values
+   undefined and right shifts implementation-defined, so this relies on
+   the compiler using arithmetic shifts - confirm for the toolchain.
+*/
+
+void
+jfdctint_jpeg_fdct_islow(void) {
+    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int tmp10, tmp11, tmp12, tmp13;
+    int z1, z2, z3, z4, z5;
+    int *dataptr;
+    int ctr;
+
+    /* Pass 1: process rows. */
+    /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+    /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+    dataptr = jfdctint_data;
+    __pragma_loopbound(8, 8);
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+
+        tmp0 = dataptr[0] + dataptr[7];
+        tmp7 = dataptr[0] - dataptr[7];
+        tmp1 = dataptr[1] + dataptr[6];
+        tmp6 = dataptr[1] - dataptr[6];
+        tmp2 = dataptr[2] + dataptr[5];
+        tmp5 = dataptr[2] - dataptr[5];
+        tmp3 = dataptr[3] + dataptr[4];
+        tmp4 = dataptr[3] - dataptr[4];
+
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        dataptr[0] = (int) ((tmp10 + tmp11) << PASS1_BITS);
+        dataptr[4] = (int) ((tmp10 - tmp11) << PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100;
+        dataptr[2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                   CONST_BITS - PASS1_BITS);
+        dataptr[6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                   CONST_BITS - PASS1_BITS);
+
+        z1 = tmp4 + tmp7;
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5;
+        z4 += z5;
+
+        dataptr[7] = (int) DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[5] = (int) DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+        dataptr[3] = (int) DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[1] = (int) DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
+
+        dataptr += DCTSIZE; /* advance pointer to next row */
+    }
+
+    dataptr = jfdctint_data; /* Pass 2: process columns. */
+    __pragma_loopbound(8, 8);
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+        tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+        tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+        tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+        tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+        tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+        tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+        tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+        tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        dataptr[DCTSIZE * 0] = (int) DESCALE(tmp10 + tmp11, PASS1_BITS);
+        dataptr[DCTSIZE * 4] = (int) DESCALE(tmp10 - tmp11, PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100;
+        dataptr[DCTSIZE * 2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                             CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                             CONST_BITS + PASS1_BITS);
+
+        z1 = tmp4 + tmp7;
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5;
+        z4 += z5;
+
+        dataptr[DCTSIZE * 7] =
+            (int) DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 5] =
+            (int) DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 3] =
+            (int) DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 1] =
+            (int) DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
+
+        dataptr++; /* advance pointer to next column */
+    }
+}
+
+/* Main function
+   Benchmark entry point, exported under the name "entrypoint".  It only
+   calls jfdctint_jpeg_fdct_islow(), which transforms jfdctint_data in
+   place; it takes no arguments and returns nothing.
+
+   Note carried over from the original source: the execution time of the
+   function is timed using a logic analyzer, which measures the OFF time
+   of an LED on the board.
+
+   The switching latency, including the function call/return time,
+   is measured to be equal to 1.1us (22 clock cycles).
+*/
+__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
+jfdctint_main(void) {
+    jfdctint_jpeg_fdct_islow();
+}
+
+__attribute__((noinline)) __attribute__((export_name("main"))) int
+main(void) { /* full run: initialise the input, transform it, check it */
+    jfdctint_init(); /* fill jfdctint_data with the generated input block */
+    jfdctint_main(); /* run the DCT on jfdctint_data */
+
+    return (jfdctint_return()); /* 0 if the checksum matches, else -1 */
+}
--- a/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/inline/jfdctint.c
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/inline/jfdctint.c
@ -0,0 +1,322 @@
+/*
+
+  This program is part of the TACLeBench benchmark suite.
+  Version V 1.x
+
+  Name: jfdctint
+
+  Author: Thomas G. Lane, Public domain JPEG source code.
+          Modified by Steven Li at Princeton University.
+
+  Function: JPEG slow-but-accurate integer implementation of the
+            forward DCT (Discrete Cosine Transform) on an 8x8
+            pixel block [from the original file documentation]
+
+   Copyright (C) 1991-1994, Thomas G. Lane.
+   This file is part of the Independent JPEG Group's software.
+   For conditions of distribution and use, see the accompanying README file.
+
+   This file contains a slow-but-accurate integer implementation of the
+   forward DCT (Discrete Cosine Transform).
+
+   A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+   on each column.  Direct algorithms are also available, but they are
+   much more complex and seem not to be any faster when reduced to code.
+
+   This implementation is based on an algorithm described in
+     C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+     Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+     Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+   The primary algorithm described there uses 11 multiplies and 29 adds.
+   We use their alternate method with 12 multiplies and 32 adds.
+   The advantage of this method is that no data path contains more than one
+   multiplication; this allows a very simple and accurate implementation in
+   scaled fixed-point arithmetic, with a minimal number of shifts.
+
+  Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
+          Collected and Modified by S.-S. Lim
+          Real-Time Research Group
+          Seoul National University
+
+  Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
+           added checksum calculation in jfdctint_return()
+
+  License: see README
+
+*/
+
+/*  COMMENTS: Long calculation sequences (i.e., long basic blocks),      */
+/*            single-nested loops.                                       */
+
+/**********************************************************************
+    Functions to be timed
+***********************************************************************/
+
+/* These definitions were added by Steven Li so as to bypass the header
+   files.
+*/
+
+// Wasm loop bounds
+// __pragma_loopbound is not defined in this file: it is imported from the
+// "__pragma" module under the name "loopbound". Every call below sits directly
+// in front of a loop and passes that loop's iteration count as both bounds
+// (e.g. 64, 64 ahead of a 64-iteration loop).
+__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
+__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);
+
+#define DCTSIZE       8 /* side length of the square sample block */
+#define DESCALE(x, n) (((x) + (((int) 1) << ((n) - 1))) >> (n)) /* right shift of x by n bits with rounding: adds 2**(n-1) first */
+
+/*
+   The poop on this scaling stuff is as follows:
+
+   Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+   larger than the true DCT outputs.  The final outputs are therefore
+   a factor of N larger than desired; since N=8 this can be cured by
+   a simple right shift at the end of the algorithm.  The advantage of
+   this arrangement is that we save two multiplications per 1-D DCT,
+   because the y0 and y4 outputs need not be divided by sqrt(N).
+   In the IJG code, this factor of 8 is removed by the quantization step
+   (in jcdctmgr.c), NOT in this module.
+
+   We have to do addition and subtraction of the integer inputs, which
+   is no problem, and multiplication by fractional constants, which is
+   a problem to do in integer arithmetic.  We multiply all the constants
+   by CONST_SCALE and convert them to integer constants (thus retaining
+   CONST_BITS (13) bits of precision in the constants).  After doing a
+   multiplication we have to divide the product by CONST_SCALE, with proper
+   rounding, to produce the correct output.  This division can be done
+   cheaply as a right shift of CONST_BITS (13) bits.  We postpone shifting
+   as long as possible so that partial sums can be added together with
+   full fractional precision.
+
+   The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
+   they are represented to better-than-integral precision.  These outputs
+   require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
+   16-bit word with the recommended scaling.  (For 12-bit sample data, the
+   intermediate array is int anyway.)
+
+   To avoid overflow of the 32-bit intermediate results in pass 2, we must
+   have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
+   Error analysis shows that the values given below are the most effective.
+*/
+
+/*
+  Forward declaration of functions
+*/
+
+/* Helpers that are force-inlined into their callers. */
+__attribute__((always_inline)) static inline void jfdctint_init(void);
+__attribute__((always_inline)) static inline int jfdctint_return(void);
+/* Functions exported to the Wasm host; each attribute is stated once. */
+__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
+jfdctint_main(void);
+__attribute__((noinline)) __attribute__((export_name("main"))) int
+main(void);
+
+#define CONST_BITS 13 /* fractional bits kept in the FIX_* constants */
+#define PASS1_BITS 2  /* extra scale-up, in bits, applied to the pass 1 outputs */
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+   causing a lot of useless floating-point operations at run time.
+   To get around this we use the following pre-calculated constants.
+   If you change CONST_BITS you may want to add appropriate values.
+   (With a reasonable C compiler, you can just rely on the FIX() macro...)
+*/
+
+#define FIX_0_298631336 ((int) 2446)  /* FIX(0.298631336) */
+#define FIX_0_390180644 ((int) 3196)  /* FIX(0.390180644) */
+#define FIX_0_541196100 ((int) 4433)  /* FIX(0.541196100) */
+#define FIX_0_765366865 ((int) 6270)  /* FIX(0.765366865) */
+#define FIX_0_899976223 ((int) 7373)  /* FIX(0.899976223) */
+#define FIX_1_175875602 ((int) 9633)  /* FIX(1.175875602) */
+#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */
+
+/* Multiply an int variable by an int constant to yield an int result.
+   For 8-bit samples with the recommended scaling, all the variable
+   and constant values involved are no more than 16 bits wide, so a
+   16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+   For 12-bit samples, a full 32-bit multiplication will be needed.
+   (This file defines no multiply macro: the products below are written
+   directly with the `*` operator.)
+*/
+
+int jfdctint_data[64]; /* the 8x8 block: filled by jfdctint_init(), transformed in place */
+
+const int jfdctint_CHECKSUM = 1668124; /* sum of jfdctint_data[] that jfdctint_return() accepts */
+
+__attribute__((always_inline)) static inline void
+jfdctint_init() {
+    /* Worst case settings: populate every slot of jfdctint_data[] from a
+       fixed recurrence that starts at 1, so each run sees the same input. */
+    int state = 1;
+
+    __pragma_loopbound(64, 64);
+    for (int slot = 0; slot < 64; slot++) {
+        /* Advance the recurrence, then store the new value. */
+        state = (state * 133 + 81) % 65535;
+        jfdctint_data[slot] = state;
+    }
+}
+
+__attribute__((always_inline)) static inline int
+jfdctint_return() {
+    /* Add up all 64 entries of jfdctint_data[]. */
+    int total = 0;
+
+    __pragma_loopbound(64, 64);
+    for (int k = 0; k < 64; ++k) {
+        total += jfdctint_data[k];
+    }
+
+    /* 0 when the sum equals jfdctint_CHECKSUM, -1 otherwise. */
+    if (total != jfdctint_CHECKSUM) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+   Perform the forward DCT on one block of samples.
+
+   Takes no arguments and returns nothing: the transform is applied in place
+   to the global jfdctint_data[], treated as DCTSIZE rows of DCTSIZE ints.
+   Pass 1 walks the rows (dataptr advances by DCTSIZE), pass 2 walks the
+   columns (elements are DCTSIZE apart, dataptr advances by 1).  Both passes
+   use the same sums, differences and FIX_* constants; they differ in
+   addressing and in the shift amounts used to scale the results.
+*/
+
+__attribute__((always_inline)) static inline void
+jfdctint_jpeg_fdct_islow(void) {
+    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int tmp10, tmp11, tmp12, tmp13;
+    int z1, z2, z3, z4, z5;
+    int *dataptr; /* start of the current row (pass 1) or column (pass 2) */
+    int ctr;      /* countdown loop counter only; never used as an index */
+
+    /* Pass 1: process rows. */
+    /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+    /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+    dataptr = jfdctint_data;
+    __pragma_loopbound(8, 8); /* the loop below runs exactly DCTSIZE (8) times */
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+
+        tmp0 = dataptr[0] + dataptr[7]; /* sums and differences of mirrored pairs */
+        tmp7 = dataptr[0] - dataptr[7];
+        tmp1 = dataptr[1] + dataptr[6];
+        tmp6 = dataptr[1] - dataptr[6];
+        tmp2 = dataptr[2] + dataptr[5];
+        tmp5 = dataptr[2] - dataptr[5];
+        tmp3 = dataptr[3] + dataptr[4];
+        tmp4 = dataptr[3] - dataptr[4];
+
+        tmp10 = tmp0 + tmp3; /* tmp10..tmp13 feed the even-indexed outputs 0, 4, 2, 6 */
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        dataptr[0] = (int) ((tmp10 + tmp11) << PASS1_BITS); /* no multiply here, so scale up by a plain shift */
+        dataptr[4] = (int) ((tmp10 - tmp11) << PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100; /* product shared by outputs 2 and 6 */
+        dataptr[2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                   CONST_BITS - PASS1_BITS);
+        dataptr[6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                   CONST_BITS - PASS1_BITS);
+
+        z1 = tmp4 + tmp7; /* from here on: odd-indexed outputs 7, 5, 3, 1 (z1 is reused) */
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5; /* z5 is added to both z3 and z4 */
+        z4 += z5;
+
+        dataptr[7] = (int) DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[5] = (int) DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+        dataptr[3] = (int) DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[1] = (int) DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
+
+        dataptr += DCTSIZE; /* advance pointer to next row */
+    }
+
+    dataptr = jfdctint_data; /* Pass 2: process columns, starting again at the first element */
+    __pragma_loopbound(8, 8); /* the loop below runs exactly DCTSIZE (8) times */
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+        tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7]; /* same pairing as pass 1, elements DCTSIZE apart */
+        tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+        tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+        tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+        tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+        tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+        tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+        tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+        tmp10 = tmp0 + tmp3; /* tmp10..tmp13 feed the even-indexed outputs 0, 4, 2, 6 */
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        dataptr[DCTSIZE * 0] = (int) DESCALE(tmp10 + tmp11, PASS1_BITS); /* shifts out the PASS1_BITS added in pass 1 */
+        dataptr[DCTSIZE * 4] = (int) DESCALE(tmp10 - tmp11, PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100; /* product shared by outputs 2 and 6 */
+        dataptr[DCTSIZE * 2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                             CONST_BITS + PASS1_BITS); /* CONST_BITS for the constant plus PASS1_BITS from pass 1 */
+        dataptr[DCTSIZE * 6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                             CONST_BITS + PASS1_BITS);
+
+        z1 = tmp4 + tmp7; /* from here on: odd-indexed outputs 7, 5, 3, 1 (z1 is reused) */
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5; /* z5 is added to both z3 and z4 */
+        z4 += z5;
+
+        dataptr[DCTSIZE * 7] =
+            (int) DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 5] =
+            (int) DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 3] =
+            (int) DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 1] =
+            (int) DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
+
+        dataptr++; /* advance pointer to next column */
+    }
+}
+
+/* Main function
+   The function's execution time is measured with a logic analyzer,
+   which measures the OFF time of an LED on the board.
+
+   The switching latency, including the function call/return time,
+   was measured to be 1.1us (22 clock cycles).
+*/
+/* Exported to the Wasm host as "entrypoint" and never inlined; each call
+   runs one forward DCT over jfdctint_data[] via jfdctint_jpeg_fdct_islow(). */
+__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
+jfdctint_main(void) {
+    jfdctint_jpeg_fdct_islow();
+}
+
+/* Exported as "main": fills jfdctint_data[] (jfdctint_init), runs the
+   transform (jfdctint_main), then returns the result of jfdctint_return():
+   0 when the data sums to jfdctint_CHECKSUM, -1 otherwise. */
+__attribute__((noinline)) __attribute__((export_name("main"))) int
+main(void) {
+    jfdctint_init();
+    jfdctint_main();
+
+    return (jfdctint_return());
+}