Add wasm tacle-bench targets

This commit is contained in:
2026-06-12 20:06:22 +02:00
parent 30daa8a00c
commit 08c2e9c13d
1122 changed files with 520422 additions and 0 deletions

View File

@ -0,0 +1,679 @@
(module $jfdctint.wasm
(type (;0;) (func (param i32 i32)))
(type (;1;) (func))
(type (;2;) (func (result i32)))
(import "__pragma" "loopbound" (func $__pragma_loopbound (type 0)))
;; Empty function: it has no parameters, no results and no instructions.
;; The module exports it under the name "__wasm_apply_data_relocs".
(func $__wasm_apply_data_relocs (type 1))
(func $jfdctint_return (type 2) (result i32)
i32.const 64
i32.const 64
call $__pragma_loopbound
i32.const -1
i32.const 0
i32.const 0
i32.load offset=1276
i32.const 0
i32.load offset=1272
i32.const 0
i32.load offset=1268
i32.const 0
i32.load offset=1264
i32.const 0
i32.load offset=1260
i32.const 0
i32.load offset=1256
i32.const 0
i32.load offset=1252
i32.const 0
i32.load offset=1248
i32.const 0
i32.load offset=1244
i32.const 0
i32.load offset=1240
i32.const 0
i32.load offset=1236
i32.const 0
i32.load offset=1232
i32.const 0
i32.load offset=1228
i32.const 0
i32.load offset=1224
i32.const 0
i32.load offset=1220
i32.const 0
i32.load offset=1216
i32.const 0
i32.load offset=1212
i32.const 0
i32.load offset=1208
i32.const 0
i32.load offset=1204
i32.const 0
i32.load offset=1200
i32.const 0
i32.load offset=1196
i32.const 0
i32.load offset=1192
i32.const 0
i32.load offset=1188
i32.const 0
i32.load offset=1184
i32.const 0
i32.load offset=1180
i32.const 0
i32.load offset=1176
i32.const 0
i32.load offset=1172
i32.const 0
i32.load offset=1168
i32.const 0
i32.load offset=1164
i32.const 0
i32.load offset=1160
i32.const 0
i32.load offset=1156
i32.const 0
i32.load offset=1152
i32.const 0
i32.load offset=1148
i32.const 0
i32.load offset=1144
i32.const 0
i32.load offset=1140
i32.const 0
i32.load offset=1136
i32.const 0
i32.load offset=1132
i32.const 0
i32.load offset=1128
i32.const 0
i32.load offset=1124
i32.const 0
i32.load offset=1120
i32.const 0
i32.load offset=1116
i32.const 0
i32.load offset=1112
i32.const 0
i32.load offset=1108
i32.const 0
i32.load offset=1104
i32.const 0
i32.load offset=1100
i32.const 0
i32.load offset=1096
i32.const 0
i32.load offset=1092
i32.const 0
i32.load offset=1088
i32.const 0
i32.load offset=1084
i32.const 0
i32.load offset=1080
i32.const 0
i32.load offset=1076
i32.const 0
i32.load offset=1072
i32.const 0
i32.load offset=1068
i32.const 0
i32.load offset=1064
i32.const 0
i32.load offset=1060
i32.const 0
i32.load offset=1056
i32.const 0
i32.load offset=1052
i32.const 0
i32.load offset=1048
i32.const 0
i32.load offset=1044
i32.const 0
i32.load offset=1040
i32.const 0
i32.load offset=1036
i32.const 0
i32.load offset=1032
i32.const 0
i32.load offset=1028
i32.const 0
i32.load offset=1024
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.add
i32.const 1668124
i32.ne
select)
;; In-place two-pass integer transform over the 8x8 block of i32 words that
;; occupies linear-memory addresses 1024..1279.
;; Loop 1 handles eight groups of 8 consecutive words (32-byte stride between
;; groups); loop 2 handles eight groups of 8 words spaced 32 bytes apart
;; (4-byte stride between groups).
;; Local 0 is the loop counter in both loops; locals 1..21 hold addresses and
;; intermediate values and are reused for different purposes within an iteration.
(func $jfdctint_jpeg_fdct_islow (type 1)
(local i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32)
;; Call the imported __pragma.loopbound with (min = 8, max = 8) for loop 1.
i32.const 8
i32.const 8
call $__pragma_loopbound
;; Loop 1: local 0 runs from -256 up to 0 in steps of 32 (8 iterations).
;; The current group starts at address local0 + 1280, i.e. 1024, 1056, ..., 1248.
i32.const -256
local.set 0
loop ;; label = @1
;; Push the address of word 4 of the group (byte offset 16); it is the target of
;; the first store below. The following sequence loads all eight words,
;; remembering each word's address in a local for the later stores.
local.get 0
i32.const 1296
i32.add
local.tee 1
local.get 1
i32.load
local.tee 1
local.get 0
i32.const 1292
i32.add
local.tee 2
i32.load
local.tee 3
i32.add
local.tee 4
local.get 0
i32.const 1308
i32.add
local.tee 5
i32.load
local.tee 6
local.get 0
i32.const 1280
i32.add
local.tee 7
i32.load
local.tee 8
i32.add
local.tee 9
i32.add
local.tee 10
local.get 0
i32.const 1300
i32.add
local.tee 11
i32.load
local.tee 12
local.get 0
i32.const 1288
i32.add
local.tee 13
i32.load
local.tee 14
i32.add
local.tee 15
local.get 0
i32.const 1304
i32.add
local.tee 16
i32.load
local.tee 17
local.get 0
i32.const 1284
i32.add
local.tee 18
i32.load
local.tee 19
i32.add
local.tee 20
i32.add
local.tee 21
;; word 4 = (local10 - local21) << 2
i32.sub
i32.const 2
i32.shl
i32.store
;; word 0 = (local10 + local21) << 2
local.get 7
local.get 10
local.get 21
i32.add
i32.const 2
i32.shl
i32.store
;; word 7 (address in local 5). The constant 1024 added below is 1 << 10,
;; the rounding term for the arithmetic right shift by 11 used in this loop.
local.get 5
local.get 3
local.get 1
i32.sub
local.tee 1
local.get 8
local.get 6
i32.sub
local.tee 3
i32.add
i32.const -7373
i32.mul
i32.const 1024
i32.add
local.tee 7
local.get 1
i32.const 2446
i32.mul
i32.add
local.get 1
local.get 19
local.get 17
i32.sub
local.tee 6
i32.add
local.tee 8
local.get 14
local.get 12
i32.sub
local.tee 1
local.get 3
i32.add
local.tee 10
i32.add
i32.const 9633
i32.mul
local.tee 12
local.get 8
i32.const -16069
i32.mul
i32.add
local.tee 8
i32.add
i32.const 11
i32.shr_s
i32.store
;; word 6 (address in local 16)
local.get 16
local.get 9
local.get 4
i32.sub
local.tee 4
local.get 20
local.get 15
i32.sub
local.tee 5
i32.add
i32.const 4433
i32.mul
i32.const 1024
i32.add
local.tee 9
local.get 5
i32.const -15137
i32.mul
i32.add
i32.const 11
i32.shr_s
i32.store
;; word 2 (address in local 13)
local.get 13
local.get 9
local.get 4
i32.const 6270
i32.mul
i32.add
i32.const 11
i32.shr_s
i32.store
;; word 5 (address in local 11)
local.get 11
local.get 1
local.get 6
i32.add
i32.const -20995
i32.mul
i32.const 1024
i32.add
local.tee 4
local.get 1
i32.const 16819
i32.mul
i32.add
local.get 12
local.get 10
i32.const -3196
i32.mul
i32.add
local.tee 1
i32.add
i32.const 11
i32.shr_s
i32.store
;; word 3 (address in local 2)
local.get 2
local.get 4
local.get 6
i32.const 25172
i32.mul
i32.add
local.get 8
i32.add
i32.const 11
i32.shr_s
i32.store
;; word 1 (address in local 18)
local.get 18
local.get 7
local.get 3
i32.const 12299
i32.mul
i32.add
local.get 1
i32.add
i32.const 11
i32.shr_s
i32.store
;; Advance to the next group (32 bytes further); repeat while the counter is non-zero.
local.get 0
i32.const 32
i32.add
local.tee 0
br_if 0 (;@1;)
end
;; Call the imported __pragma.loopbound with (min = 8, max = 8) for loop 2.
i32.const 8
i32.const 8
call $__pragma_loopbound
;; Loop 2: local 0 runs from -32 up to 0 in steps of 4 (8 iterations).
;; The current group starts at address local0 + 1056, i.e. 1024, 1028, ..., 1052,
;; and its eight words lie 32 bytes apart.
i32.const -32
local.set 0
loop ;; label = @1
;; Push the address of word 4 of the group (byte offset 128), then load all
;; eight words while remembering their addresses, as in loop 1.
local.get 0
i32.const 1184
i32.add
local.tee 1
local.get 1
i32.load
local.tee 1
local.get 0
i32.const 1152
i32.add
local.tee 2
i32.load
local.tee 3
i32.add
local.tee 4
local.get 0
i32.const 1280
i32.add
local.tee 5
i32.load
local.tee 6
local.get 0
i32.const 1056
i32.add
local.tee 7
i32.load
local.tee 8
i32.add
local.tee 9
i32.add
local.tee 10
local.get 0
i32.const 1216
i32.add
local.tee 11
i32.load
local.tee 12
local.get 0
i32.const 1120
i32.add
local.tee 13
i32.load
local.tee 14
i32.add
local.tee 15
local.get 0
i32.const 1248
i32.add
local.tee 16
i32.load
local.tee 17
local.get 0
i32.const 1088
i32.add
local.tee 18
i32.load
local.tee 19
i32.add
local.tee 20
i32.add
local.tee 21
;; word 4 = (local10 - local21 + 2) >> 2 (arithmetic shift; 2 is the rounding term)
i32.sub
i32.const 2
i32.add
i32.const 2
i32.shr_s
i32.store
;; word 0 = (local21 + local10 + 2) >> 2
local.get 7
local.get 21
local.get 10
i32.add
i32.const 2
i32.add
i32.const 2
i32.shr_s
i32.store
;; word 7 (address in local 5). The constant 16384 added below is 1 << 14,
;; the rounding term for the arithmetic right shift by 15 used in this loop.
local.get 5
local.get 3
local.get 1
i32.sub
local.tee 1
local.get 8
local.get 6
i32.sub
local.tee 3
i32.add
i32.const -7373
i32.mul
i32.const 16384
i32.add
local.tee 7
local.get 1
i32.const 2446
i32.mul
i32.add
local.get 1
local.get 19
local.get 17
i32.sub
local.tee 6
i32.add
local.tee 8
local.get 14
local.get 12
i32.sub
local.tee 1
local.get 3
i32.add
local.tee 10
i32.add
i32.const 9633
i32.mul
local.tee 12
local.get 8
i32.const -16069
i32.mul
i32.add
local.tee 8
i32.add
i32.const 15
i32.shr_s
i32.store
;; word 6 (address in local 16)
local.get 16
local.get 9
local.get 4
i32.sub
local.tee 4
local.get 20
local.get 15
i32.sub
local.tee 5
i32.add
i32.const 4433
i32.mul
i32.const 16384
i32.add
local.tee 9
local.get 5
i32.const -15137
i32.mul
i32.add
i32.const 15
i32.shr_s
i32.store
;; word 2 (address in local 13)
local.get 13
local.get 9
local.get 4
i32.const 6270
i32.mul
i32.add
i32.const 15
i32.shr_s
i32.store
;; word 5 (address in local 11)
local.get 11
local.get 1
local.get 6
i32.add
i32.const -20995
i32.mul
i32.const 16384
i32.add
local.tee 4
local.get 1
i32.const 16819
i32.mul
i32.add
local.get 12
local.get 10
i32.const -3196
i32.mul
i32.add
local.tee 1
i32.add
i32.const 15
i32.shr_s
i32.store
;; word 3 (address in local 2)
local.get 2
local.get 4
local.get 6
i32.const 25172
i32.mul
i32.add
local.get 8
i32.add
i32.const 15
i32.shr_s
i32.store
;; word 1 (address in local 18)
local.get 18
local.get 7
local.get 3
i32.const 12299
i32.mul
i32.add
local.get 1
i32.add
i32.const 15
i32.shr_s
i32.store
;; Advance to the next group (4 bytes further); repeat while the counter is non-zero.
local.get 0
i32.const 4
i32.add
local.tee 0
br_if 0 (;@1;)
end)
;; Exported as "entrypoint". Takes no arguments, returns nothing, and only
;; calls $jfdctint_jpeg_fdct_islow.
(func $jfdctint_main (type 1)
call $jfdctint_jpeg_fdct_islow)
;; Exported as "main". Fills the 64 i32 words at addresses 1024..1276 with the
;; sequence seed = (seed * 133 + 81) rem_s 65535, starting from seed = 1, then
;; calls $jfdctint_main and returns the value produced by $jfdctint_return.
;; Local 0 is the running seed; local 1 is the byte counter (-256 up to 0).
(func $__original_main (type 2) (result i32)
(local i32 i32)
;; Call the imported __pragma.loopbound with (min = 64, max = 64).
;; NOTE(review): the loop below stores two words per iteration and therefore
;; iterates 32 times, not 64 - confirm that the announced bound is still what
;; the analysis tool expects.
i32.const 64
i32.const 64
call $__pragma_loopbound
i32.const 1
local.set 0
i32.const -256
local.set 1
loop ;; label = @1
;; First word of the pair: address local1 + 1280.
local.get 1
i32.const 1280
i32.add
local.get 0
i32.const 133
i32.mul
i32.const 81
i32.add
i32.const 65535
i32.rem_s
local.tee 0
i32.store
;; Second word of the pair: address local1 + 1284, using the updated seed.
local.get 1
i32.const 1284
i32.add
local.get 0
i32.const 133
i32.mul
i32.const 81
i32.add
i32.const 65535
i32.rem_s
local.tee 0
i32.store
;; Advance by 8 bytes; repeat while the counter is non-zero.
local.get 1
i32.const 8
i32.add
local.tee 1
br_if 0 (;@1;)
end
call $jfdctint_main
call $jfdctint_return)
(table (;0;) 1 1 funcref)
(memory (;0;) 1)
(global $__stack_pointer (mut i32) (i32.const 5376))
(global (;1;) i32 (i32.const 1280))
(global (;2;) i32 (i32.const 5376))
(export "memory" (memory 0))
(export "__wasm_apply_data_relocs" (func $__wasm_apply_data_relocs))
(export "entrypoint" (func $jfdctint_main))
(export "main" (func $__original_main))
(export "__data_end" (global 1))
(export "__heap_base" (global 2)))

View File

@ -0,0 +1,314 @@
/*
This program is part of the TACLeBench benchmark suite.
Version V 1.x
Name: jfdctint
Author: Thomas G. Lane, Public domain JPEG source code.
Modified by Steven Li at Princeton University.
Function: JPEG slow-but-accurate integer implementation of the
forward DCT (Discrete Cosine Transform) on a 8x8
pixel block [from original file documentations]
Copyright (C) 1991-1994, Thomas G. Lane.
This file is part of the Independent JPEG Group's software.
For conditions of distribution and use, see the accompanying README file.
This file contains a slow-but-accurate integer implementation of the
forward DCT (Discrete Cosine Transform).
A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
on each column. Direct algorithms are also available, but they are
much more complex and seem not to be any faster when reduced to code.
This implementation is based on an algorithm described in
C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
The primary algorithm described there uses 11 multiplies and 29 adds.
We use their alternate method with 12 multiplies and 32 adds.
The advantage of this method is that no data path contains more than one
multiplication; this allows a very simple and accurate implementation in
scaled fixed-point arithmetic, with a minimal number of shifts.
Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
Collected and Modified by S.-S. Lim
Real-Time Research Group
Seoul National University
Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
added checksum calculation in jfdctint_return()
License: see README
*/
/* COMMENTS: Long calculation sequences (i.e., long basic blocks), */
/* single-nested loops. */
/**********************************************************************
Functions to be timed
***********************************************************************/
/* These definitions were added by Steven Li so as to bypass the header
files.
*/
// Wasm loop bounds
__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);
#define DCTSIZE 8
#define DESCALE(x, n) (((x) + (((int) 1) << ((n) - 1))) >> (n))
/*
The poop on this scaling stuff is as follows:
Each 1-D DCT step produces outputs which are a factor of sqrt(N)
larger than the true DCT outputs. The final outputs are therefore
a factor of N larger than desired; since N=8 this can be cured by
a simple right shift at the end of the algorithm. The advantage of
this arrangement is that we save two multiplications per 1-D DCT,
because the y0 and y4 outputs need not be divided by sqrt(N).
In the IJG code, this factor of 8 is removed by the quantization step
(in jcdctmgr.c), NOT in this module.
We have to do addition and subtraction of the integer inputs, which
is no problem, and multiplication by fractional constants, which is
a problem to do in integer arithmetic. We multiply all the constants
by CONST_SCALE and convert them to integer constants (thus retaining
CONST_BITS (13) bits of precision in the constants). After doing a
multiplication we have to divide the product by CONST_SCALE, with proper
rounding, to produce the correct output. This division can be done
cheaply as a right shift of CONST_BITS (13) bits. We postpone shifting
as long as possible so that partial sums can be added together with
full fractional precision.
The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
they are represented to better-than-integral precision. These outputs
require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
16-bit word with the recommended scaling. (For 12-bit sample data, the
intermediate array is int anyway.)
To avoid overflow of the 32-bit intermediate results in pass 2, we must
have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
Error analysis shows that the values given below are the most effective.
*/
/*
Forward declaration of functions
*/
/* Prototypes use (void) so that calls with arguments are rejected at compile
   time; an empty parameter list declares a function without a prototype. */
void jfdctint_init(void);
int jfdctint_return(void);
__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
jfdctint_main(void);
__attribute__((noinline)) __attribute__((export_name("main"))) int main(void);
#define CONST_BITS 13
#define PASS1_BITS 2
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
causing a lot of useless floating-point operations at run time.
To get around this we use the following pre-calculated constants.
If you change CONST_BITS you may want to add appropriate values.
(With a reasonable C compiler, you can just rely on the FIX() macro...)
*/
#define FIX_0_298631336 ((int) 2446) /* FIX(0.298631336) */
#define FIX_0_390180644 ((int) 3196) /* FIX(0.390180644) */
#define FIX_0_541196100 ((int) 4433) /* FIX(0.541196100) */
#define FIX_0_765366865 ((int) 6270) /* FIX(0.765366865) */
#define FIX_0_899976223 ((int) 7373) /* FIX(0.899976223) */
#define FIX_1_175875602 ((int) 9633) /* FIX(1.175875602) */
#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */
#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */
#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */
#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */
#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */
#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */
/* Multiply an int variable by an int constant to yield an int result.
For 8-bit samples with the recommended scaling, all the variable
and constant values involved are no more than 16 bits wide, so a
16x16->32 bit multiply can be used instead of a full 32x32 multiply.
For 12-bit samples, a full 32-bit multiplication will be needed.
*/
int jfdctint_data[64];
const int jfdctint_CHECKSUM = 1668124;
/*
  Fill jfdctint_data with a deterministic pseudo-random sequence:
  seed(0) = 1, seed(k+1) = (seed(k) * 133 + 81) % 65535, and
  jfdctint_data[i] = seed(i+1) for i = 0..63.
  Takes no arguments and returns nothing; it only writes jfdctint_data.
*/
void
jfdctint_init(void) {
  int i, seed;
  /* Worst case settings */
  /* Set array to random values */
  seed = 1;
  /* Announce the bound of the loop that follows: exactly 64 iterations. */
  __pragma_loopbound(64, 64);
  for (i = 0; i < 64; i++) {
    seed = ((seed * 133) + 81) % 65535;
    jfdctint_data[i] = seed;
  }
}
/*
  Verify the benchmark result.
  Sums all 64 entries of jfdctint_data and compares the sum with
  jfdctint_CHECKSUM.
  Returns 0 when the checksum matches and -1 otherwise.
*/
int
jfdctint_return(void) {
  int checksum = 0;
  int i;
  /* Announce the bound of the loop that follows: exactly 64 iterations. */
  __pragma_loopbound(64, 64);
  for (i = 0; i < 64; ++i)
    checksum += jfdctint_data[i];
  return ((checksum == jfdctint_CHECKSUM) ? 0 : -1);
}
/*
Perform the forward DCT on one block of samples.
*/
/*
  Works in place on the global 8x8 block jfdctint_data (row-major, DCTSIZE
  ints per row): pass 1 transforms each row, pass 2 transforms each column.
  Takes no arguments and returns nothing.
*/
void
jfdctint_jpeg_fdct_islow(void) {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5;
  int *dataptr;
  int ctr;
  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  dataptr = jfdctint_data;
  __pragma_loopbound(8, 8);
  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
    tmp0 = dataptr[0] + dataptr[7];
    tmp7 = dataptr[0] - dataptr[7];
    tmp1 = dataptr[1] + dataptr[6];
    tmp6 = dataptr[1] - dataptr[6];
    tmp2 = dataptr[2] + dataptr[5];
    tmp5 = dataptr[2] - dataptr[5];
    tmp3 = dataptr[3] + dataptr[4];
    tmp4 = dataptr[3] - dataptr[4];
    /* Even part */
    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    /* The shift is done on unsigned int: tmp10 - tmp11 can be negative and
       left-shifting a negative int is undefined behaviour in C. Converting
       the result back to int yields the same bit pattern as the signed shift
       on two's-complement targets. */
    dataptr[0] = (int) ((unsigned int) (tmp10 + tmp11) << PASS1_BITS);
    dataptr[4] = (int) ((unsigned int) (tmp10 - tmp11) << PASS1_BITS);
    z1 = (tmp12 + tmp13) * FIX_0_541196100;
    dataptr[2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
                               CONST_BITS - PASS1_BITS);
    dataptr[6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
                               CONST_BITS - PASS1_BITS);
    /* Odd part */
    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
    tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = z1 * (-FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = z2 * (-FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = z3 * (-FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = z4 * (-FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    z3 += z5;
    z4 += z5;
    dataptr[7] = (int) DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
    dataptr[5] = (int) DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
    dataptr[3] = (int) DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
    dataptr[1] = (int) DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
    dataptr += DCTSIZE; /* advance pointer to next row */
  }
  /* Pass 2: process columns. */
  /* The PASS1_BITS scaling applied in pass 1 is removed here by the larger */
  /* DESCALE shift counts. */
  dataptr = jfdctint_data;
  __pragma_loopbound(8, 8);
  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
    /* Even part */
    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    dataptr[DCTSIZE * 0] = (int) DESCALE(tmp10 + tmp11, PASS1_BITS);
    dataptr[DCTSIZE * 4] = (int) DESCALE(tmp10 - tmp11, PASS1_BITS);
    z1 = (tmp12 + tmp13) * FIX_0_541196100;
    dataptr[DCTSIZE * 2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
                                         CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
                                         CONST_BITS + PASS1_BITS);
    /* Odd part */
    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
    tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = z1 * (-FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = z2 * (-FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = z3 * (-FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = z4 * (-FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    z3 += z5;
    z4 += z5;
    dataptr[DCTSIZE * 7] =
      (int) DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 5] =
      (int) DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 3] =
      (int) DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 1] =
      (int) DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
    dataptr++; /* advance pointer to next column */
  }
}
/* Main function
The function's execution time is measured with a logic analyzer,
which measures the OFF time of an LED on the board.
The switching latency, including the function call/return time,
was measured to be 1.1us (22 clock cycles).
*/
/* Benchmark kernel entry point, exported to the Wasm host as "entrypoint".
   Kept out of line (noinline); it only runs the forward DCT on the global
   block. */
__attribute__((noinline, export_name("entrypoint"))) void
jfdctint_main(void) {
  jfdctint_jpeg_fdct_islow();
}
/* Program driver, exported to the Wasm host as "main":
   initialise the input block, run the kernel, then report the checksum
   verdict (0 on match, -1 on mismatch) as the return value. */
__attribute__((noinline, export_name("main"))) int
main(void) {
  int verdict;

  jfdctint_init();
  jfdctint_main();
  verdict = jfdctint_return();
  return verdict;
}

View File

@ -0,0 +1,322 @@
/*
This program is part of the TACLeBench benchmark suite.
Version V 1.x
Name: jfdctint
Author: Thomas G. Lane, Public domain JPEG source code.
Modified by Steven Li at Princeton University.
Function: JPEG slow-but-accurate integer implementation of the
forward DCT (Discrete Cosine Transform) on a 8x8
pixel block [from original file documentations]
Copyright (C) 1991-1994, Thomas G. Lane.
This file is part of the Independent JPEG Group's software.
For conditions of distribution and use, see the accompanying README file.
This file contains a slow-but-accurate integer implementation of the
forward DCT (Discrete Cosine Transform).
A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
on each column. Direct algorithms are also available, but they are
much more complex and seem not to be any faster when reduced to code.
This implementation is based on an algorithm described in
C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
The primary algorithm described there uses 11 multiplies and 29 adds.
We use their alternate method with 12 multiplies and 32 adds.
The advantage of this method is that no data path contains more than one
multiplication; this allows a very simple and accurate implementation in
scaled fixed-point arithmetic, with a minimal number of shifts.
Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
Collected and Modified by S.-S. Lim
Real-Time Research Group
Seoul National University
Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
added checksum calculation in jfdctint_return()
License: see README
*/
/* COMMENTS: Long calculation sequences (i.e., long basic blocks), */
/* single-nested loops. */
/**********************************************************************
Functions to be timed
***********************************************************************/
/* These definitions were added by Steven Li so as to bypass the header
files.
*/
// Wasm loop bounds
__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);
#define DCTSIZE 8
#define DESCALE(x, n) (((x) + (((int) 1) << ((n) - 1))) >> (n))
/*
The poop on this scaling stuff is as follows:
Each 1-D DCT step produces outputs which are a factor of sqrt(N)
larger than the true DCT outputs. The final outputs are therefore
a factor of N larger than desired; since N=8 this can be cured by
a simple right shift at the end of the algorithm. The advantage of
this arrangement is that we save two multiplications per 1-D DCT,
because the y0 and y4 outputs need not be divided by sqrt(N).
In the IJG code, this factor of 8 is removed by the quantization step
(in jcdctmgr.c), NOT in this module.
We have to do addition and subtraction of the integer inputs, which
is no problem, and multiplication by fractional constants, which is
a problem to do in integer arithmetic. We multiply all the constants
by CONST_SCALE and convert them to integer constants (thus retaining
CONST_BITS (13) bits of precision in the constants). After doing a
multiplication we have to divide the product by CONST_SCALE, with proper
rounding, to produce the correct output. This division can be done
cheaply as a right shift of CONST_BITS (13) bits. We postpone shifting
as long as possible so that partial sums can be added together with
full fractional precision.
The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
they are represented to better-than-integral precision. These outputs
require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
16-bit word with the recommended scaling. (For 12-bit sample data, the
intermediate array is int anyway.)
To avoid overflow of the 32-bit intermediate results in pass 2, we must
have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
Error analysis shows that the values given below are the most effective.
*/
/*
Forward declaration of functions
*/
/* Prototypes use (void) so that calls with arguments are rejected at compile
   time. Each exported function carries its noinline/export_name attributes
   exactly once. */
__attribute__((always_inline)) static inline void jfdctint_init(void);
__attribute__((always_inline)) static inline int jfdctint_return(void);
__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
jfdctint_main(void);
__attribute__((noinline)) __attribute__((export_name("main"))) int
main(void);
#define CONST_BITS 13
#define PASS1_BITS 2
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
causing a lot of useless floating-point operations at run time.
To get around this we use the following pre-calculated constants.
If you change CONST_BITS you may want to add appropriate values.
(With a reasonable C compiler, you can just rely on the FIX() macro...)
*/
#define FIX_0_298631336 ((int) 2446) /* FIX(0.298631336) */
#define FIX_0_390180644 ((int) 3196) /* FIX(0.390180644) */
#define FIX_0_541196100 ((int) 4433) /* FIX(0.541196100) */
#define FIX_0_765366865 ((int) 6270) /* FIX(0.765366865) */
#define FIX_0_899976223 ((int) 7373) /* FIX(0.899976223) */
#define FIX_1_175875602 ((int) 9633) /* FIX(1.175875602) */
#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */
#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */
#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */
#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */
#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */
#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */
/* Multiply an int variable by an int constant to yield an int result.
For 8-bit samples with the recommended scaling, all the variable
and constant values involved are no more than 16 bits wide, so a
16x16->32 bit multiply can be used instead of a full 32x32 multiply.
For 12-bit samples, a full 32-bit multiplication will be needed.
*/
int jfdctint_data[64];
const int jfdctint_CHECKSUM = 1668124;
/*
  Fill jfdctint_data with a deterministic pseudo-random sequence:
  seed(0) = 1, seed(k+1) = (seed(k) * 133 + 81) % 65535, and
  jfdctint_data[i] = seed(i+1) for i = 0..63.
  Takes no arguments and returns nothing; it only writes jfdctint_data.
  Marked always_inline, so the body is expanded into its caller.
*/
__attribute__((always_inline)) static inline void
jfdctint_init(void) {
  int i, seed;
  /* Worst case settings */
  /* Set array to random values */
  seed = 1;
  /* Announce the bound of the loop that follows: exactly 64 iterations. */
  __pragma_loopbound(64, 64);
  for (i = 0; i < 64; i++) {
    seed = ((seed * 133) + 81) % 65535;
    jfdctint_data[i] = seed;
  }
}
/*
  Verify the benchmark result.
  Sums all 64 entries of jfdctint_data and compares the sum with
  jfdctint_CHECKSUM.
  Returns 0 when the checksum matches and -1 otherwise.
  Marked always_inline, so the body is expanded into its caller.
*/
__attribute__((always_inline)) static inline int
jfdctint_return(void) {
  int checksum = 0;
  int i;
  /* Announce the bound of the loop that follows: exactly 64 iterations. */
  __pragma_loopbound(64, 64);
  for (i = 0; i < 64; ++i)
    checksum += jfdctint_data[i];
  return ((checksum == jfdctint_CHECKSUM) ? 0 : -1);
}
/*
Perform the forward DCT on one block of samples.
*/
/*
  Works in place on the global 8x8 block jfdctint_data (row-major, DCTSIZE
  ints per row): pass 1 transforms each row, pass 2 transforms each column.
  Takes no arguments and returns nothing.
  Marked always_inline, so the body is expanded into its caller.
*/
__attribute__((always_inline)) static inline void
jfdctint_jpeg_fdct_islow(void) {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5;
  int *dataptr;
  int ctr;
  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */
  dataptr = jfdctint_data;
  __pragma_loopbound(8, 8);
  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
    tmp0 = dataptr[0] + dataptr[7];
    tmp7 = dataptr[0] - dataptr[7];
    tmp1 = dataptr[1] + dataptr[6];
    tmp6 = dataptr[1] - dataptr[6];
    tmp2 = dataptr[2] + dataptr[5];
    tmp5 = dataptr[2] - dataptr[5];
    tmp3 = dataptr[3] + dataptr[4];
    tmp4 = dataptr[3] - dataptr[4];
    /* Even part */
    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    /* The shift is done on unsigned int: tmp10 - tmp11 can be negative and
       left-shifting a negative int is undefined behaviour in C. Converting
       the result back to int yields the same bit pattern as the signed shift
       on two's-complement targets. */
    dataptr[0] = (int) ((unsigned int) (tmp10 + tmp11) << PASS1_BITS);
    dataptr[4] = (int) ((unsigned int) (tmp10 - tmp11) << PASS1_BITS);
    z1 = (tmp12 + tmp13) * FIX_0_541196100;
    dataptr[2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
                               CONST_BITS - PASS1_BITS);
    dataptr[6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
                               CONST_BITS - PASS1_BITS);
    /* Odd part */
    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
    tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = z1 * (-FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = z2 * (-FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = z3 * (-FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = z4 * (-FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    z3 += z5;
    z4 += z5;
    dataptr[7] = (int) DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
    dataptr[5] = (int) DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
    dataptr[3] = (int) DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
    dataptr[1] = (int) DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
    dataptr += DCTSIZE; /* advance pointer to next row */
  }
  /* Pass 2: process columns. */
  /* The PASS1_BITS scaling applied in pass 1 is removed here by the larger */
  /* DESCALE shift counts. */
  dataptr = jfdctint_data;
  __pragma_loopbound(8, 8);
  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
    /* Even part */
    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    dataptr[DCTSIZE * 0] = (int) DESCALE(tmp10 + tmp11, PASS1_BITS);
    dataptr[DCTSIZE * 4] = (int) DESCALE(tmp10 - tmp11, PASS1_BITS);
    z1 = (tmp12 + tmp13) * FIX_0_541196100;
    dataptr[DCTSIZE * 2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
                                         CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
                                         CONST_BITS + PASS1_BITS);
    /* Odd part */
    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
    tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = z1 * (-FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = z2 * (-FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = z3 * (-FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = z4 * (-FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    z3 += z5;
    z4 += z5;
    dataptr[DCTSIZE * 7] =
      (int) DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 5] =
      (int) DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 3] =
      (int) DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
    dataptr[DCTSIZE * 1] =
      (int) DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
    dataptr++; /* advance pointer to next column */
  }
}
/* Main function
The function's execution time is measured with a logic analyzer,
which measures the OFF time of an LED on the board.
The switching latency, including the function call/return time,
was measured to be 1.1us (22 clock cycles).
*/
/* Benchmark kernel entry point, exported to the Wasm host as "entrypoint".
   Kept out of line (noinline); it only runs the forward DCT on the global
   block. The attribute pair is specified once instead of twice. */
__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
jfdctint_main(void) {
  jfdctint_jpeg_fdct_islow();
}
/* Program driver, exported to the Wasm host as "main":
   initialise the input block, run the kernel, then return the checksum
   verdict (0 on match, -1 on mismatch).
   The attribute pair is specified once instead of twice. */
__attribute__((noinline)) __attribute__((export_name("main"))) int
main(void) {
  jfdctint_init();
  jfdctint_main();
  return (jfdctint_return());
}