/*

  This program is part of the TACLeBench benchmark suite.
  Version V 1.x

  Name: jfdctint

  Author: Thomas G. Lane, Public domain JPEG source code.
          Modified by Steven Li at Princeton University.

  Function: JPEG slow-but-accurate integer implementation of the
            forward DCT (Discrete Cosine Transform) on an 8x8
            pixel block [from original file documentation]

            Copyright (C) 1991-1994, Thomas G. Lane.
            This file is part of the Independent JPEG Group's software.
            For conditions of distribution and use, see the accompanying
            README file.

            This file contains a slow-but-accurate integer implementation
            of the forward DCT (Discrete Cosine Transform).

            A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
            on each column. Direct algorithms are also available, but they
            are much more complex and seem not to be any faster when reduced
            to code.

            This implementation is based on an algorithm described in
              C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast
              1-D DCT Algorithms with 11 Multiplications", Proc. Int'l.
              Conf. on Acoustics, Speech, and Signal Processing 1989
              (ICASSP '89), pp. 988-991.
            The primary algorithm described there uses 11 multiplies and
            29 adds. We use their alternate method with 12 multiplies and
            32 adds. The advantage of this method is that no data path
            contains more than one multiplication; this allows a very
            simple and accurate implementation in scaled fixed-point
            arithmetic, with a minimal number of shifts.

  Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
          Collected and Modified by S.-S. Lim
          Real-Time Research Group
          Seoul National University

  Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
           added checksum calculation in jfdctint_return()

  License: see README

*/

/* COMMENTS: Long calculation sequences (i.e., long basic blocks), */
/*           single-nested loops.                                  */

/**********************************************************************
       Functions to be timed
***********************************************************************/

/* These definitions were added by Steven Li so as to bypass the header
   files.
*/

/*
  Wasm loop bounds.

  Declares an external function imported from the module "__pragma" under
  the name "loopbound". In this file it is called immediately before each
  loop, with both arguments set to that loop's iteration count.
  NOTE(review): presumably consumed by the timing-analysis tooling as the
  minimum/maximum iteration bound of the following loop - confirm against
  the toolchain documentation.
*/
__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);

/* Edge length of the square sample block: the transform works on 8x8 data. */
#define DCTSIZE 8

/*
  Divide x by 2**n with rounding: half of the divisor (1 << (n - 1)) is added
  before shifting right by n.

  n must be at least 1. x is evaluated once but n is evaluated twice, so pass
  arguments without side effects. For negative sums the result depends on how
  the implementation right-shifts signed values.
*/
#define DESCALE(x, n) (((x) + (((int) 1) << ((n) - 1))) >> (n))

/*
  The poop on this scaling stuff is as follows:

  Each 1-D DCT step produces outputs which are a factor of sqrt(N)
  larger than the true DCT outputs. The final outputs are therefore
  a factor of N larger than desired; since N=8 this can be cured by
  a simple right shift at the end of the algorithm. The advantage of
  this arrangement is that we save two multiplications per 1-D DCT,
  because the y0 and y4 outputs need not be divided by sqrt(N).
  In the IJG code, this factor of 8 is removed by the quantization step
  (in jcdctmgr.c), NOT in this module.

  We have to do addition and subtraction of the integer inputs, which
  is no problem, and multiplication by fractional constants, which is
  a problem to do in integer arithmetic. We multiply all the constants
  by CONST_SCALE and convert them to integer constants (thus retaining
  CONST_BITS (13) bits of precision in the constants). After doing a
  multiplication we have to divide the product by CONST_SCALE, with proper
  rounding, to produce the correct output. This division can be done
  cheaply as a right shift of CONST_BITS (13) bits. We postpone shifting
  as long as possible so that partial sums can be added together with
  full fractional precision.

  The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
  they are represented to better-than-integral precision. These outputs
  require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
  16-bit word with the recommended scaling. (For 12-bit sample data, the
  intermediate array is int anyway.)

  To avoid overflow of the 32-bit intermediate results in pass 2, we must
  have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
  Error analysis shows that the values given below are the most effective.
*/

/*
  Forward declaration of functions

  All functions in this file take no arguments, so they are declared with
  full (void) prototypes. jfdctint_main and main carry the same noinline /
  export_name attributes as their definitions.
*/

void jfdctint_init(void);
int jfdctint_return(void);
void jfdctint_jpeg_fdct_islow(void);
__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
jfdctint_main(void);
__attribute__((noinline)) __attribute__((export_name("main"))) int main(void);

/* Number of fractional bits carried by the FIX_* fixed-point constants. */
#define CONST_BITS 13
/* Number of extra fractional bits kept in the outputs of the row pass. */
#define PASS1_BITS 2

/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
   causing a lot of useless floating-point operations at run time.
   To get around this we use the following pre-calculated constants.
   If you change CONST_BITS you may want to add appropriate values.
   (With a reasonable C compiler, you can just rely on the FIX() macro...)

   Each value below is the named fraction multiplied by 2**13 (8192) and
   rounded to the nearest integer.
*/

#define FIX_0_298631336 ((int) 2446)  /* FIX(0.298631336) */
#define FIX_0_390180644 ((int) 3196)  /* FIX(0.390180644) */
#define FIX_0_541196100 ((int) 4433)  /* FIX(0.541196100) */
#define FIX_0_765366865 ((int) 6270)  /* FIX(0.765366865) */
#define FIX_0_899976223 ((int) 7373)  /* FIX(0.899976223) */
#define FIX_1_175875602 ((int) 9633)  /* FIX(1.175875602) */
#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */
#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */
#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */
#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */
#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */
#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */

/* Multiply an int variable by an int constant to yield an int result.
   For 8-bit samples with the recommended scaling, all the variable
   and constant values involved are no more than 16 bits wide, so a
   16x16->32 bit multiply can be used instead of a full 32x32 multiply.
   For 12-bit samples, a full 32-bit multiplication will be needed.
*/

/*
  The block of 64 samples, laid out as 8 consecutive rows of 8 values.
  jfdctint_init() fills it and jfdctint_jpeg_fdct_islow() transforms it
  in place.
*/
int jfdctint_data[64];

/*
  Expected sum of all 64 entries of jfdctint_data after the transform;
  compared against the actual sum in jfdctint_return().
*/
const int jfdctint_CHECKSUM = 1668124;

void
|
|
jfdctint_init() {
|
|
int i, seed;
|
|
|
|
/* Worst case settings */
|
|
/* Set array to random values */
|
|
seed = 1;
|
|
|
|
__pragma_loopbound(64, 64);
|
|
for (i = 0; i < 64; i++) {
|
|
seed = ((seed * 133) + 81) % 65535;
|
|
jfdctint_data[i] = seed;
|
|
}
|
|
}
|
|
|
|
int
|
|
jfdctint_return() {
|
|
int checksum = 0;
|
|
int i;
|
|
__pragma_loopbound(64, 64);
|
|
for (i = 0; i < 64; ++i)
|
|
checksum += jfdctint_data[i];
|
|
return ((checksum == jfdctint_CHECKSUM) ? 0 : -1);
|
|
}
|
|
|
|
/*
|
|
Perform the forward DCT on one block of samples.
|
|
*/
|
|
|
|
void
|
|
jfdctint_jpeg_fdct_islow(void) {
|
|
int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
|
int tmp10, tmp11, tmp12, tmp13;
|
|
int z1, z2, z3, z4, z5;
|
|
int *dataptr;
|
|
int ctr;
|
|
|
|
/* Pass 1: process rows. */
|
|
/* Note results are scaled up by sqrt(8) compared to a true DCT; */
|
|
/* furthermore, we scale the results by 2**PASS1_BITS. */
|
|
|
|
dataptr = jfdctint_data;
|
|
__pragma_loopbound(8, 8);
|
|
for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
|
|
|
|
tmp0 = dataptr[0] + dataptr[7];
|
|
tmp7 = dataptr[0] - dataptr[7];
|
|
tmp1 = dataptr[1] + dataptr[6];
|
|
tmp6 = dataptr[1] - dataptr[6];
|
|
tmp2 = dataptr[2] + dataptr[5];
|
|
tmp5 = dataptr[2] - dataptr[5];
|
|
tmp3 = dataptr[3] + dataptr[4];
|
|
tmp4 = dataptr[3] - dataptr[4];
|
|
|
|
tmp10 = tmp0 + tmp3;
|
|
tmp13 = tmp0 - tmp3;
|
|
tmp11 = tmp1 + tmp2;
|
|
tmp12 = tmp1 - tmp2;
|
|
|
|
dataptr[0] = (int) ((tmp10 + tmp11) << PASS1_BITS);
|
|
dataptr[4] = (int) ((tmp10 - tmp11) << PASS1_BITS);
|
|
|
|
z1 = (tmp12 + tmp13) * FIX_0_541196100;
|
|
dataptr[2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
|
|
CONST_BITS - PASS1_BITS);
|
|
dataptr[6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
|
|
CONST_BITS - PASS1_BITS);
|
|
|
|
z1 = tmp4 + tmp7;
|
|
z2 = tmp5 + tmp6;
|
|
z3 = tmp4 + tmp6;
|
|
z4 = tmp5 + tmp7;
|
|
z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
|
|
|
|
tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
|
|
tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
|
|
tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
|
|
tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
|
|
z1 = z1 * (-FIX_0_899976223); /* sqrt(2) * (c7-c3) */
|
|
z2 = z2 * (-FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
|
|
z3 = z3 * (-FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
|
|
z4 = z4 * (-FIX_0_390180644); /* sqrt(2) * (c5-c3) */
|
|
|
|
z3 += z5;
|
|
z4 += z5;
|
|
|
|
dataptr[7] = (int) DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
|
|
dataptr[5] = (int) DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
|
|
dataptr[3] = (int) DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
|
|
dataptr[1] = (int) DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
|
|
|
|
dataptr += DCTSIZE; /* advance pointer to next row */
|
|
}
|
|
|
|
dataptr = jfdctint_data;
|
|
__pragma_loopbound(8, 8);
|
|
for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
|
|
tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
|
|
tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
|
|
tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
|
|
tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
|
|
tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
|
|
tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
|
|
tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
|
|
tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
|
|
|
|
tmp10 = tmp0 + tmp3;
|
|
tmp13 = tmp0 - tmp3;
|
|
tmp11 = tmp1 + tmp2;
|
|
tmp12 = tmp1 - tmp2;
|
|
|
|
dataptr[DCTSIZE * 0] = (int) DESCALE(tmp10 + tmp11, PASS1_BITS);
|
|
dataptr[DCTSIZE * 4] = (int) DESCALE(tmp10 - tmp11, PASS1_BITS);
|
|
|
|
z1 = (tmp12 + tmp13) * FIX_0_541196100;
|
|
dataptr[DCTSIZE * 2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
|
|
CONST_BITS + PASS1_BITS);
|
|
dataptr[DCTSIZE * 6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
|
|
CONST_BITS + PASS1_BITS);
|
|
|
|
z1 = tmp4 + tmp7;
|
|
z2 = tmp5 + tmp6;
|
|
z3 = tmp4 + tmp6;
|
|
z4 = tmp5 + tmp7;
|
|
z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
|
|
|
|
tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
|
|
tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
|
|
tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
|
|
tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
|
|
z1 = z1 * (-FIX_0_899976223); /* sqrt(2) * (c7-c3) */
|
|
z2 = z2 * (-FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
|
|
z3 = z3 * (-FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
|
|
z4 = z4 * (-FIX_0_390180644); /* sqrt(2) * (c5-c3) */
|
|
|
|
z3 += z5;
|
|
z4 += z5;
|
|
|
|
dataptr[DCTSIZE * 7] =
|
|
(int) DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
|
|
dataptr[DCTSIZE * 5] =
|
|
(int) DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
|
|
dataptr[DCTSIZE * 3] =
|
|
(int) DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
|
|
dataptr[DCTSIZE * 1] =
|
|
(int) DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
|
|
|
|
dataptr++; /* advance pointer to next column */
|
|
}
|
|
}
|
|
|
|
/* Main function
   The execution time of this function is measured with a logic analyzer,
   which measures the OFF time of an LED on the board.

   The switching latency, including the function call/return time,
   was measured to be 1.1us (22 clock cycles).

   The function runs jfdctint_jpeg_fdct_islow() once on the block in
   jfdctint_data. It is marked noinline and exported under the name
   "entrypoint".
*/
__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
jfdctint_main(void) {
    jfdctint_jpeg_fdct_islow();
}

/*
  Program entry point, exported under the name "main".

  Fills the sample block, runs the transform once, and returns the result of
  the checksum comparison: 0 when the checksum matches, -1 otherwise.
*/
__attribute__((noinline)) __attribute__((export_name("main"))) int
main(void) {
    jfdctint_init();
    jfdctint_main();

    return (jfdctint_return());
}