Add wasm tacle-bench targets

2026-06-12 20:06:22 +02:00
parent 30daa8a00c
commit 08c2e9c13d
1122 changed files with 520422 additions and 0 deletions
--- a/targets/wasm-tacle/kernel/jfdctint/CMakeLists.txt
+++ b/targets/wasm-tacle/kernel/jfdctint/CMakeLists.txt
@ -0,0 +1,25 @@
+# ~~~
+# SPDX-License-Identifier: MIT
+# SPDX-FileCopyrightText: 2026, Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU)
+# ~~~
+
+cmake_minimum_required(VERSION 3.20)
+
+project(jfdctint)
+
+# Directory anchors, expressed relative to the directory containing this file.
+set(TACLEBENCH_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../..")
+set(REPOSITORY_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../../..")
+
+# Name the target after this project. PROJECT_NAME is used instead of
+# CMAKE_PROJECT_NAME so the name stays "jfdctint" even when this directory is
+# pulled into a larger build whose top-level project has a different name.
+set(APP_TARGET_NAME "${PROJECT_NAME}")
+
+# TACLEBENCH_VARIANT=inline selects the inline source variant; an unset
+# variable or any other value falls back to the default variant.
+if(DEFINED TACLEBENCH_VARIANT AND "${TACLEBENCH_VARIANT}" STREQUAL "inline")
+    set(APP_SOURCE_FILE_PATH
+        "generated/modified_sources/inline/jfdctint.c")
+else()
+    set(APP_SOURCE_FILE_PATH
+        "generated/modified_sources/default/jfdctint.c")
+endif()
+
+# Shared build logic; presumably consumes the variables set above (the module
+# is not visible here -- confirm).
+# The path is quoted so a checkout location containing spaces or semicolons is
+# passed to include() as a single argument.
+include("${REPOSITORY_ROOT_PATH}/cmake/taclebench_wasm.cmake")
+
+
--- a/targets/wasm-tacle/kernel/jfdctint/ChangeLog.txt
+++ b/targets/wasm-tacle/kernel/jfdctint/ChangeLog.txt
@ -0,0 +1,70 @@
+File: jfdctint.c
+Original provenience: SNU-RT Benchmark Suite for Worst Case Timing Analysis
+
+2016-02-01:
+- Added generic TACLeBench header.
+- Removed old file header (keep some information in TACLeBench header).
+- Renamed global variable data to jfdctint_data.
+- Renamed main to jfdctint_main.
+- Moved initialisation code to jfdctint_init
+- Implemented new main function according to TACLeBench guidelines.
+- Implemented new function jfdctint_return, calculates checksum over
+  all data.
+- Applied code formatting according to the following rules
+  - Lines shall not be wider than 80 characters; whenever possible, appropriate
+    line breaks shall be inserted to keep lines below 80 characters
+  - Indentation is done using whitespaces only, no tabs. Code is indented by
+    two whitespaces
+  - Two empty lines are put between any two functions
+  - In non-empty lists or index expressions, opening '(' and '[' are followed by
+    one whitespace, closing ')' and ']' are preceded by one whitespace
+  - In comma- or colon-separated argument lists, one whitespace is put after
+    each comma/colon
+  - Names of functions and global variables all start with a benchmark-specific
+    prefix (here: jfdctint_) followed by a lowercase letter (e.g., jfdctint_main)
+  - For pointer types, one whitespace is put before the '*'
+  - Operators within expressions shall be preceded and followed by one
+    whitespace
+  - Code of then- and else-parts of if-then-else statements shall be put in
+    separate lines, not in the same lines as the if-condition or the keyword
+    "else"
+  - Opening braces '{' denoting the beginning of code for some if-else or loop
+    body shall be put at the end of the same line where the keywords "if",
+    "else", "for", "while" etc. occur
+  - In non-empty lists or index expressions, opening '(' and '[' are followed by
+    one whitespace, closing ')' and ']' are preceded by one whitespace
+  - Operators within expressions shall be preceded and followed by one
+    whitespace
+
+2016-02-03:
+- Removed all PROFILINGs.
+- Macro types replaced by actual types:
+  - Replaced INT32 with int.
+  - Replaced DCTELEM with int.
+- Removed macros:
+  - GLOBAL (useless)
+  - Unused "FIX_... FIX(..)" definitions (unused)
+  - BITS_IN_JSAMPLE (used in #ifdef...#else..., keep only #if part)
+  - SHIFT_TEMPS (empty)
+  - JPEG_INTERNALS (unused)
+  - MULTIPLY (simply multiply *)
+  - ONE (used only once)
+  - RIGHT_SHIFT (used only once)
+  
+2016-04-05:
+- Return '0' on success
+
+2016-04-06:
+- Fixed generation of return value
+
+2016-04-21:
+- Fixed checksum value
+- Fixed license
+
+2016-06-01:
+- Changed all prefixes to lower-case
+- Changed return type of jfdctint_main
+
+2016-06-08:
+- Prefix
+- removed return from jfdctint_main
--- a/targets/wasm-tacle/kernel/jfdctint/README
+++ b/targets/wasm-tacle/kernel/jfdctint/README
@ -0,0 +1,383 @@
+The Independent JPEG Group's JPEG software
+==========================================
+
+README for release 6a of 7-Feb-96
+=================================
+
+This distribution contains the sixth public release of the Independent JPEG
+Group's free JPEG software.  You are welcome to redistribute this software and
+to use it for any purpose, subject to the conditions under LEGAL ISSUES, below.
+
+Serious users of this software (particularly those incorporating it into
+larger programs) should contact IJG at jpeg-info@uunet.uu.net to be added to
+our electronic mailing list.  Mailing list members are notified of updates
+and have a chance to participate in technical discussions, etc.
+
+This software is the work of Tom Lane, Philip Gladstone, Luis Ortiz, Jim
+Boucher, Lee Crocker, Julian Minguillon, George Phillips, Davide Rossi,
+Ge' Weijers, and other members of the Independent JPEG Group.
+
+IJG is not affiliated with the official ISO JPEG standards committee.
+
+
+DOCUMENTATION ROADMAP
+=====================
+
+This file contains the following sections:
+
+OVERVIEW            General description of JPEG and the IJG software.
+LEGAL ISSUES        Copyright, lack of warranty, terms of distribution.
+REFERENCES          Where to learn more about JPEG.
+ARCHIVE LOCATIONS   Where to find newer versions of this software.
+RELATED SOFTWARE    Other stuff you should get.
+FILE FORMAT WARS    Software *not* to get.
+TO DO               Plans for future IJG releases.
+
+Other documentation files in the distribution are:
+
+User documentation:
+  install.doc       How to configure and install the IJG software.
+  usage.doc         Usage instructions for cjpeg, djpeg, jpegtran,
+                    rdjpgcom, and wrjpgcom.
+  *.1               Unix-style man pages for programs (same info as usage.doc).
+  wizard.doc        Advanced usage instructions for JPEG wizards only.
+  change.log        Version-to-version change highlights.
+Programmer and internal documentation:
+  libjpeg.doc       How to use the JPEG library in your own programs.
+  example.c         Sample code for calling the JPEG library.
+  structure.doc     Overview of the JPEG library's internal structure.
+  filelist.doc      Road map of IJG files.
+  coderules.doc     Coding style rules --- please read if you contribute code.
+
+Please read at least the files install.doc and usage.doc.  Useful information
+can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
+ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.
+
+If you want to understand how the JPEG code works, we suggest reading one or
+more of the REFERENCES, then looking at the documentation files (in roughly
+the order listed) before diving into the code.
+
+
+OVERVIEW
+========
+
+This package contains C software to implement JPEG image compression and
+decompression.  JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and gray-scale images.  JPEG is intended for compressing
+"real-world" scenes; line drawings, cartoons and other non-realistic images
+are not its strong suit.  JPEG is lossy, meaning that the output image is not
+exactly identical to the input image.  Hence you must not use JPEG if you
+have to have identical output bits.  However, on typical photographic images,
+very good compression levels can be obtained with no visible change, and
+remarkably high compression levels are possible if you can tolerate a
+low-quality image.  For more details, see the references, or just experiment
+with various compression settings.
+
+This software implements JPEG baseline, extended-sequential, and progressive
+compression processes.  Provision is made for supporting all variants of these
+processes, although some uncommon parameter settings aren't implemented yet.
+For legal reasons, we are not distributing code for the arithmetic-coding
+variants of JPEG; see LEGAL ISSUES.  We have made no provision for supporting
+the hierarchical or lossless processes defined in the standard.
+
+We provide a set of library routines for reading and writing JPEG image files,
+plus two sample applications "cjpeg" and "djpeg", which use the library to
+perform conversion between JPEG and some other popular image file formats.
+The library is intended to be reused in other applications.
+
+In order to support file conversion and viewing software, we have included
+considerable functionality beyond the bare JPEG coding/decoding capability;
+for example, the color quantization modules are not strictly part of JPEG
+decoding, but they are essential for output to colormapped file formats or
+colormapped displays.  These extra functions can be compiled out of the
+library if not required for a particular application.  We have also included
+"jpegtran", a utility for lossless transcoding between different JPEG
+processes, and "rdjpgcom" and "wrjpgcom", two simple applications for
+inserting and extracting textual comments in JFIF files.
+
+The emphasis in designing this software has been on achieving portability and
+flexibility, while also making it fast enough to be useful.  In particular,
+the software is not intended to be read as a tutorial on JPEG.  (See the
+REFERENCES section for introductory material.)  Rather, it is intended to
+be reliable, portable, industrial-strength code.  We do not claim to have
+achieved that goal in every aspect of the software, but we strive for it.
+
+We welcome the use of this software as a component of commercial products.
+No royalty is required, but we do ask for an acknowledgement in product
+documentation, as described under LEGAL ISSUES.
+
+
+LEGAL ISSUES
+============
+
+In plain English:
+
+1. We don't promise that this software works.  (But if you find any bugs,
+   please let us know!)
+2. You can use this software for whatever you want.  You don't have to pay us.
+3. You may not pretend that you wrote this software.  If you use it in a
+   program, you must acknowledge somewhere in your documentation that
+   you've used the IJG code.
+
+In legalese:
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose.  This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
+
+This software is copyright (C) 1991-1996, Thomas G. Lane.
+All Rights Reserved except as specified below.
+
+Permission is hereby granted to use, copy, modify, and distribute this
+software (or portions thereof) for any purpose, without fee, subject to these
+conditions:
+(1) If any part of the source code for this software is distributed, then this
+README file must be included, with this copyright and no-warranty notice
+unaltered; and any additions, deletions, or changes to the original files
+must be clearly indicated in accompanying documentation.
+(2) If only executable code is distributed, then the accompanying
+documentation must state that "this software is based in part on the work of
+the Independent JPEG Group".
+(3) Permission for use of this software is granted only if the user accepts
+full responsibility for any undesirable consequences; the authors accept
+NO LIABILITY for damages of any kind.
+
+These conditions apply to any software derived from or based on the IJG code,
+not just to the unmodified library.  If you use our work, you ought to
+acknowledge us.
+
+Permission is NOT granted for the use of any IJG author's name or company name
+in advertising or publicity relating to this software or products derived from
+it.  This software may be referred to only as "the Independent JPEG Group's
+software".
+
+We specifically permit and encourage the use of this software as the basis of
+commercial products, provided that all warranty or liability claims are
+assumed by the product vendor.
+
+
+ansi2knr.c is included in this distribution by permission of L. Peter Deutsch,
+sole proprietor of its copyright holder, Aladdin Enterprises of Menlo Park, CA.
+ansi2knr.c is NOT covered by the above copyright and conditions, but instead
+by the usual distribution terms of the Free Software Foundation; principally,
+that you must include source code if you redistribute it.  (See the file
+ansi2knr.c for full details.)  However, since ansi2knr.c is not needed as part
+of any program generated from the IJG code, this does not limit you more than
+the foregoing paragraphs do.
+
+The configuration script "configure" was produced with GNU Autoconf.  It
+is copyright by the Free Software Foundation but is freely distributable.
+
+It appears that the arithmetic coding option of the JPEG spec is covered by
+patents owned by IBM, AT&T, and Mitsubishi.  Hence arithmetic coding cannot
+legally be used without obtaining one or more licenses.  For this reason,
+support for arithmetic coding has been removed from the free JPEG software.
+(Since arithmetic coding provides only a marginal gain over the unpatented
+Huffman mode, it is unlikely that very many implementations will support it.)
+So far as we are aware, there are no patent restrictions on the remaining
+code.
+
+WARNING: Unisys has begun to enforce their patent on LZW compression against
+GIF encoders and decoders.  You will need a license from Unisys to use the
+included rdgif.c or wrgif.c files in a commercial or shareware application.
+At this time, Unisys is not enforcing their patent against freeware, so
+distribution of this package remains legal.  However, we intend to remove
+GIF support from the IJG package as soon as a suitable replacement format
+becomes reasonably popular.
+
+We are required to state that
+    "The Graphics Interchange Format(c) is the Copyright property of
+    CompuServe Incorporated.  GIF(sm) is a Service Mark property of
+    CompuServe Incorporated."
+
+
+REFERENCES
+==========
+
+We highly recommend reading one or more of these references before trying to
+understand the innards of the JPEG software.
+
+The best short technical introduction to the JPEG compression algorithm is
+	Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+	Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
+(Adjacent articles in that issue discuss MPEG motion picture compression,
+applications of JPEG, and related topics.)  If you don't have the CACM issue
+handy, a PostScript file containing a revised version of Wallace's article
+is available at ftp.uu.net, graphics/jpeg/wallace.ps.gz.  The file (actually
+a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
+omits the sample images that appeared in CACM, but it includes corrections
+and some added material.  Note: the Wallace article is copyright ACM and
+IEEE, and it may not be used for commercial purposes.
+
+A somewhat less technical, more leisurely introduction to JPEG can be found in
+"The Data Compression Book" by Mark Nelson, published by M&T Books (Redwood
+City, CA), 1991, ISBN 1-55851-216-0.  This book provides good explanations and
+example C code for a multitude of compression methods including JPEG.  It is
+an excellent source if you are comfortable reading C code but don't know much
+about data compression in general.  The book's JPEG sample code is far from
+industrial-strength, but when you are ready to look at a full implementation,
+you've got one here...
+
+The best full description of JPEG is the textbook "JPEG Still Image Data
+Compression Standard" by William B. Pennebaker and Joan L. Mitchell, published
+by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.  Price US$59.95, 638 pp.
+The book includes the complete text of the ISO JPEG standards (DIS 10918-1
+and draft DIS 10918-2).  This is by far the most complete exposition of JPEG
+in existence, and we highly recommend it.
+
+The JPEG standard itself is not available electronically; you must order a
+paper copy through ISO or ITU.  (Unless you feel a need to own a certified
+official copy, we recommend buying the Pennebaker and Mitchell book instead;
+it's much cheaper and includes a great deal of useful explanatory material.)
+In the USA, copies of the standard may be ordered from ANSI Sales at (212)
+642-4900, or from Global Engineering Documents at (800) 854-7179.  (ANSI
+doesn't take credit card orders, but Global does.)  It's not cheap: as of
+1992, ANSI was charging $95 for Part 1 and $47 for Part 2, plus 7%
+shipping/handling.  The standard is divided into two parts, Part 1 being the
+actual specification, while Part 2 covers compliance testing methods.  Part 1
+is titled "Digital Compression and Coding of Continuous-tone Still Images,
+Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
+10918-1, ITU-T T.81.  Part 2 is titled "Digital Compression and Coding of
+Continuous-tone Still Images, Part 2: Compliance testing" and has document
+numbers ISO/IEC IS 10918-2, ITU-T T.83.
+
+Extensions to the original JPEG standard are defined in JPEG Part 3, a new ISO
+document.  Part 3 is undergoing ISO balloting and is expected to be approved
+by the end of 1995; it will have document numbers ISO/IEC IS 10918-3, ITU-T
+T.84.  IJG currently does not support any Part 3 extensions.
+
+The JPEG standard does not specify all details of an interchangeable file
+format.  For the omitted details we follow the "JFIF" conventions, revision
+1.02.  A copy of the JFIF spec is available from:
+	Literature Department
+	C-Cube Microsystems, Inc.
+	1778 McCarthy Blvd.
+	Milpitas, CA 95035
+	phone (408) 944-6300,  fax (408) 944-6314
+A PostScript version of this document is available at ftp.uu.net, file
+graphics/jpeg/jfif.ps.gz.  It can also be obtained by e-mail from the C-Cube
+mail server, netlib@c3.pla.ca.us.  Send the message "send jfif_ps from jpeg"
+to the server to obtain the JFIF document; send the message "help" if you have
+trouble.
+
+The TIFF 6.0 file format specification can be obtained by FTP from sgi.com
+(192.48.153.1), file graphics/tiff/TIFF6.ps.Z; or you can order a printed
+copy from Aldus Corp. at (206) 628-6593.  The JPEG incorporation scheme
+found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
+IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
+Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
+(Compression tag 7).  Copies of this Note can be obtained from sgi.com or
+from ftp.uu.net:/graphics/jpeg/.  It is expected that the next revision of
+the TIFF spec will replace the 6.0 JPEG design with the Note's design.
+Although IJG's own code does not support TIFF/JPEG, the free libtiff library
+uses our library to implement TIFF/JPEG per the Note.  libtiff is available
+from sgi.com:/graphics/tiff/.
+
+
+ARCHIVE LOCATIONS
+=================
+
+The "official" archive site for this software is ftp.uu.net (Internet
+address 192.48.96.9).  The most recent released version can always be found
+there in directory graphics/jpeg.  This particular version will be archived
+as graphics/jpeg/jpegsrc.v6a.tar.gz.  If you are on the Internet, you
+can retrieve files from ftp.uu.net by standard anonymous FTP.  If you don't
+have FTP access, UUNET's archives are also available via UUCP; contact
+help@uunet.uu.net for information on retrieving files that way.
+
+Numerous Internet sites maintain copies of the UUNET files.  However, only
+ftp.uu.net is guaranteed to have the latest official version.
+
+You can also obtain this software in DOS-compatible "zip" archive format from
+the SimTel archives (ftp.coast.net:/SimTel/msdos/graphics/), or on CompuServe
+in the Graphics Support forum (GO CIS:GRAPHSUP), library 12 "JPEG Tools".
+Again, these versions may sometimes lag behind the ftp.uu.net release.
+
+The JPEG FAQ (Frequently Asked Questions) article is a useful source of
+general information about JPEG.  It is updated constantly and therefore is
+not included in this distribution.  The FAQ is posted every two weeks to
+Usenet newsgroups comp.graphics.misc, news.answers, and other groups.
+You can always obtain the latest version from the news.answers archive at
+rtfm.mit.edu.  By FTP, fetch /pub/usenet/news.answers/jpeg-faq/part1 and
+.../part2.  If you don't have FTP, send e-mail to mail-server@rtfm.mit.edu
+with body
+	send usenet/news.answers/jpeg-faq/part1
+	send usenet/news.answers/jpeg-faq/part2
+
+
+RELATED SOFTWARE
+================
+
+Numerous viewing and image manipulation programs now support JPEG.  (Quite a
+few of them use this library to do so.)  The JPEG FAQ described above lists
+some of the more popular free and shareware viewers, and tells where to
+obtain them on Internet.
+
+If you are on a Unix machine, we highly recommend Jef Poskanzer's free
+PBMPLUS image software, which provides many useful operations on PPM-format
+image files.  In particular, it can convert PPM images to and from a wide
+range of other formats.  You can obtain this package by FTP from ftp.x.org
+(contrib/pbmplus*.tar.Z) or ftp.ee.lbl.gov (pbmplus*.tar.Z).  There is also
+a newer update of this package called NETPBM, available from
+wuarchive.wustl.edu under directory /graphics/graphics/packages/NetPBM/.
+Unfortunately PBMPLUS/NETPBM is not nearly as portable as the IJG software
+is; you are likely to have difficulty making it work on any non-Unix machine.
+
+A different free JPEG implementation, written by the PVRG group at Stanford,
+is available from havefun.stanford.edu in directory pub/jpeg.  This program
+is designed for research and experimentation rather than production use;
+it is slower, harder to use, and less portable than the IJG code, but it
+is easier to read and modify.  Also, the PVRG code supports lossless JPEG,
+which we do not.
+
+
+FILE FORMAT WARS
+================
+
+Some JPEG programs produce files that are not compatible with our library.
+The root of the problem is that the ISO JPEG committee failed to specify a
+concrete file format.  Some vendors "filled in the blanks" on their own,
+creating proprietary formats that no one else could read.  (For example, none
+of the early commercial JPEG implementations for the Macintosh were able to
+exchange compressed files.)
+
+The file format we have adopted is called JFIF (see REFERENCES).  This format
+has been agreed to by a number of major commercial JPEG vendors, and it has
+become the de facto standard.  JFIF is a minimal or "low end" representation.
+We recommend the use of TIFF/JPEG (TIFF revision 6.0 as modified by TIFF
+Technical Note #2) for "high end" applications that need to record a lot of
+additional data about an image.  TIFF/JPEG is fairly new and not yet widely
+supported, unfortunately.
+
+The upcoming JPEG Part 3 standard defines a file format called SPIFF.
+SPIFF is interoperable with JFIF, in the sense that most JFIF decoders should
+be able to read the most common variant of SPIFF.  SPIFF has some technical
+advantages over JFIF, but its major claim to fame is simply that it is an
+official standard rather than an informal one.  At this point it is unclear
+whether SPIFF will supersede JFIF or whether JFIF will remain the de-facto
+standard.  IJG intends to support SPIFF once the standard is frozen, but we
+have not decided whether it should become our default output format or not.
+(In any case, our decoder will remain capable of reading JFIF indefinitely.)
+
+Various proprietary file formats incorporating JPEG compression also exist.
+We have little or no sympathy for the existence of these formats.  Indeed,
+one of the original reasons for developing this free software was to help
+force convergence on common, open format standards for JPEG files.  Don't
+use a proprietary file format!
+
+
+TO DO
+=====
+
+In future versions, we are considering supporting some of the upcoming JPEG
+Part 3 extensions --- principally, variable quantization and the SPIFF file
+format.
+
+Tuning the software for better behavior at low quality/high compression
+settings is also of interest.  The current method for scaling the
+quantization tables is known not to be very good at low Q values.
+
+As always, speeding things up is high on our priority list.
+
+Please send bug reports, offers of help, etc. to jpeg-info@uunet.uu.net.
--- a/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wasm
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wasm
--- a/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wat
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/default/jfdctint.wat
@ -0,0 +1,679 @@
+(module $jfdctint.wasm
+  (type (;0;) (func (param i32 i32)))
+  (type (;1;) (func))
+  (type (;2;) (func (result i32)))
+  (import "__pragma" "loopbound" (func $__pragma_loopbound (type 0)))
+  ;; Function of type 1 (no parameters, no results) with an empty body.
+  ;; NOTE(review): the name suggests a linker-emitted data-relocation hook that
+  ;; has nothing to do in this module -- confirm against the toolchain.
+  (func $__wasm_apply_data_relocs (type 1))
+  (func $jfdctint_return (type 2) (result i32)
+    i32.const 64
+    i32.const 64
+    call $__pragma_loopbound
+    i32.const -1
+    i32.const 0
+    i32.const 0
+    i32.load offset=1276
+    i32.const 0
+    i32.load offset=1272
+    i32.const 0
+    i32.load offset=1268
+    i32.const 0
+    i32.load offset=1264
+    i32.const 0
+    i32.load offset=1260
+    i32.const 0
+    i32.load offset=1256
+    i32.const 0
+    i32.load offset=1252
+    i32.const 0
+    i32.load offset=1248
+    i32.const 0
+    i32.load offset=1244
+    i32.const 0
+    i32.load offset=1240
+    i32.const 0
+    i32.load offset=1236
+    i32.const 0
+    i32.load offset=1232
+    i32.const 0
+    i32.load offset=1228
+    i32.const 0
+    i32.load offset=1224
+    i32.const 0
+    i32.load offset=1220
+    i32.const 0
+    i32.load offset=1216
+    i32.const 0
+    i32.load offset=1212
+    i32.const 0
+    i32.load offset=1208
+    i32.const 0
+    i32.load offset=1204
+    i32.const 0
+    i32.load offset=1200
+    i32.const 0
+    i32.load offset=1196
+    i32.const 0
+    i32.load offset=1192
+    i32.const 0
+    i32.load offset=1188
+    i32.const 0
+    i32.load offset=1184
+    i32.const 0
+    i32.load offset=1180
+    i32.const 0
+    i32.load offset=1176
+    i32.const 0
+    i32.load offset=1172
+    i32.const 0
+    i32.load offset=1168
+    i32.const 0
+    i32.load offset=1164
+    i32.const 0
+    i32.load offset=1160
+    i32.const 0
+    i32.load offset=1156
+    i32.const 0
+    i32.load offset=1152
+    i32.const 0
+    i32.load offset=1148
+    i32.const 0
+    i32.load offset=1144
+    i32.const 0
+    i32.load offset=1140
+    i32.const 0
+    i32.load offset=1136
+    i32.const 0
+    i32.load offset=1132
+    i32.const 0
+    i32.load offset=1128
+    i32.const 0
+    i32.load offset=1124
+    i32.const 0
+    i32.load offset=1120
+    i32.const 0
+    i32.load offset=1116
+    i32.const 0
+    i32.load offset=1112
+    i32.const 0
+    i32.load offset=1108
+    i32.const 0
+    i32.load offset=1104
+    i32.const 0
+    i32.load offset=1100
+    i32.const 0
+    i32.load offset=1096
+    i32.const 0
+    i32.load offset=1092
+    i32.const 0
+    i32.load offset=1088
+    i32.const 0
+    i32.load offset=1084
+    i32.const 0
+    i32.load offset=1080
+    i32.const 0
+    i32.load offset=1076
+    i32.const 0
+    i32.load offset=1072
+    i32.const 0
+    i32.load offset=1068
+    i32.const 0
+    i32.load offset=1064
+    i32.const 0
+    i32.load offset=1060
+    i32.const 0
+    i32.load offset=1056
+    i32.const 0
+    i32.load offset=1052
+    i32.const 0
+    i32.load offset=1048
+    i32.const 0
+    i32.load offset=1044
+    i32.const 0
+    i32.load offset=1040
+    i32.const 0
+    i32.load offset=1036
+    i32.const 0
+    i32.load offset=1032
+    i32.const 0
+    i32.load offset=1028
+    i32.const 0
+    i32.load offset=1024
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.add
+    i32.const 1668124
+    i32.ne
+    select)
+  (func $jfdctint_jpeg_fdct_islow (type 1)
+    (local i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32 i32)
+    i32.const 8
+    i32.const 8
+    call $__pragma_loopbound
+    i32.const -256
+    local.set 0
+    loop  ;; label = @1
+      local.get 0
+      i32.const 1296
+      i32.add
+      local.tee 1
+      local.get 1
+      i32.load
+      local.tee 1
+      local.get 0
+      i32.const 1292
+      i32.add
+      local.tee 2
+      i32.load
+      local.tee 3
+      i32.add
+      local.tee 4
+      local.get 0
+      i32.const 1308
+      i32.add
+      local.tee 5
+      i32.load
+      local.tee 6
+      local.get 0
+      i32.const 1280
+      i32.add
+      local.tee 7
+      i32.load
+      local.tee 8
+      i32.add
+      local.tee 9
+      i32.add
+      local.tee 10
+      local.get 0
+      i32.const 1300
+      i32.add
+      local.tee 11
+      i32.load
+      local.tee 12
+      local.get 0
+      i32.const 1288
+      i32.add
+      local.tee 13
+      i32.load
+      local.tee 14
+      i32.add
+      local.tee 15
+      local.get 0
+      i32.const 1304
+      i32.add
+      local.tee 16
+      i32.load
+      local.tee 17
+      local.get 0
+      i32.const 1284
+      i32.add
+      local.tee 18
+      i32.load
+      local.tee 19
+      i32.add
+      local.tee 20
+      i32.add
+      local.tee 21
+      i32.sub
+      i32.const 2
+      i32.shl
+      i32.store
+      local.get 7
+      local.get 10
+      local.get 21
+      i32.add
+      i32.const 2
+      i32.shl
+      i32.store
+      local.get 5
+      local.get 3
+      local.get 1
+      i32.sub
+      local.tee 1
+      local.get 8
+      local.get 6
+      i32.sub
+      local.tee 3
+      i32.add
+      i32.const -7373
+      i32.mul
+      i32.const 1024
+      i32.add
+      local.tee 7
+      local.get 1
+      i32.const 2446
+      i32.mul
+      i32.add
+      local.get 1
+      local.get 19
+      local.get 17
+      i32.sub
+      local.tee 6
+      i32.add
+      local.tee 8
+      local.get 14
+      local.get 12
+      i32.sub
+      local.tee 1
+      local.get 3
+      i32.add
+      local.tee 10
+      i32.add
+      i32.const 9633
+      i32.mul
+      local.tee 12
+      local.get 8
+      i32.const -16069
+      i32.mul
+      i32.add
+      local.tee 8
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 16
+      local.get 9
+      local.get 4
+      i32.sub
+      local.tee 4
+      local.get 20
+      local.get 15
+      i32.sub
+      local.tee 5
+      i32.add
+      i32.const 4433
+      i32.mul
+      i32.const 1024
+      i32.add
+      local.tee 9
+      local.get 5
+      i32.const -15137
+      i32.mul
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 13
+      local.get 9
+      local.get 4
+      i32.const 6270
+      i32.mul
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 11
+      local.get 1
+      local.get 6
+      i32.add
+      i32.const -20995
+      i32.mul
+      i32.const 1024
+      i32.add
+      local.tee 4
+      local.get 1
+      i32.const 16819
+      i32.mul
+      i32.add
+      local.get 12
+      local.get 10
+      i32.const -3196
+      i32.mul
+      i32.add
+      local.tee 1
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 2
+      local.get 4
+      local.get 6
+      i32.const 25172
+      i32.mul
+      i32.add
+      local.get 8
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 18
+      local.get 7
+      local.get 3
+      i32.const 12299
+      i32.mul
+      i32.add
+      local.get 1
+      i32.add
+      i32.const 11
+      i32.shr_s
+      i32.store
+      local.get 0
+      i32.const 32
+      i32.add
+      local.tee 0
+      br_if 0 (;@1;)
+    end
+    i32.const 8
+    i32.const 8
+    call $__pragma_loopbound
+    i32.const -32
+    local.set 0
+    loop  ;; label = @1
+      local.get 0
+      i32.const 1184
+      i32.add
+      local.tee 1
+      local.get 1
+      i32.load
+      local.tee 1
+      local.get 0
+      i32.const 1152
+      i32.add
+      local.tee 2
+      i32.load
+      local.tee 3
+      i32.add
+      local.tee 4
+      local.get 0
+      i32.const 1280
+      i32.add
+      local.tee 5
+      i32.load
+      local.tee 6
+      local.get 0
+      i32.const 1056
+      i32.add
+      local.tee 7
+      i32.load
+      local.tee 8
+      i32.add
+      local.tee 9
+      i32.add
+      local.tee 10
+      local.get 0
+      i32.const 1216
+      i32.add
+      local.tee 11
+      i32.load
+      local.tee 12
+      local.get 0
+      i32.const 1120
+      i32.add
+      local.tee 13
+      i32.load
+      local.tee 14
+      i32.add
+      local.tee 15
+      local.get 0
+      i32.const 1248
+      i32.add
+      local.tee 16
+      i32.load
+      local.tee 17
+      local.get 0
+      i32.const 1088
+      i32.add
+      local.tee 18
+      i32.load
+      local.tee 19
+      i32.add
+      local.tee 20
+      i32.add
+      local.tee 21
+      i32.sub
+      i32.const 2
+      i32.add
+      i32.const 2
+      i32.shr_s
+      i32.store
+      local.get 7
+      local.get 21
+      local.get 10
+      i32.add
+      i32.const 2
+      i32.add
+      i32.const 2
+      i32.shr_s
+      i32.store
+      local.get 5
+      local.get 3
+      local.get 1
+      i32.sub
+      local.tee 1
+      local.get 8
+      local.get 6
+      i32.sub
+      local.tee 3
+      i32.add
+      i32.const -7373
+      i32.mul
+      i32.const 16384
+      i32.add
+      local.tee 7
+      local.get 1
+      i32.const 2446
+      i32.mul
+      i32.add
+      local.get 1
+      local.get 19
+      local.get 17
+      i32.sub
+      local.tee 6
+      i32.add
+      local.tee 8
+      local.get 14
+      local.get 12
+      i32.sub
+      local.tee 1
+      local.get 3
+      i32.add
+      local.tee 10
+      i32.add
+      i32.const 9633
+      i32.mul
+      local.tee 12
+      local.get 8
+      i32.const -16069
+      i32.mul
+      i32.add
+      local.tee 8
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 16
+      local.get 9
+      local.get 4
+      i32.sub
+      local.tee 4
+      local.get 20
+      local.get 15
+      i32.sub
+      local.tee 5
+      i32.add
+      i32.const 4433
+      i32.mul
+      i32.const 16384
+      i32.add
+      local.tee 9
+      local.get 5
+      i32.const -15137
+      i32.mul
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 13
+      local.get 9
+      local.get 4
+      i32.const 6270
+      i32.mul
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 11
+      local.get 1
+      local.get 6
+      i32.add
+      i32.const -20995
+      i32.mul
+      i32.const 16384
+      i32.add
+      local.tee 4
+      local.get 1
+      i32.const 16819
+      i32.mul
+      i32.add
+      local.get 12
+      local.get 10
+      i32.const -3196
+      i32.mul
+      i32.add
+      local.tee 1
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 2
+      local.get 4
+      local.get 6
+      i32.const 25172
+      i32.mul
+      i32.add
+      local.get 8
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 18
+      local.get 7
+      local.get 3
+      i32.const 12299
+      i32.mul
+      i32.add
+      local.get 1
+      i32.add
+      i32.const 15
+      i32.shr_s
+      i32.store
+      local.get 0
+      i32.const 4
+      i32.add
+      local.tee 0
+      br_if 0 (;@1;)
+    end)
+  ;; Exported as "entrypoint": forwards straight to the forward-DCT routine.
+  (func $jfdctint_main (type 1)
+    (call $jfdctint_jpeg_fdct_islow))
+  ;; Exported as "main": fills the i32 words at addresses 1024..1279 with a
+  ;; pseudo-random sequence, runs $jfdctint_main, and returns whatever
+  ;; $jfdctint_return leaves on the stack.
+  (func $__original_main (type 2) (result i32)
+    ;; local 0: running seed; local 1: negative byte offset into the data.
+    (local i32 i32)
+    i32.const 64
+    i32.const 64
+    call $__pragma_loopbound
+    ;; seed starts at 1
+    i32.const 1
+    local.set 0
+    ;; offset runs from -256 up to 0; adding 1280 yields the store address
+    i32.const -256
+    local.set 1
+    ;; NOTE(review): each iteration stores two words and advances by 8 bytes,
+    ;; so this loop executes 32 times although the bound call above passes
+    ;; 64, 64 - confirm how the analysis tooling treats the unrolled loop.
+    loop  ;; label = @1
+      ;; first word: mem[offset + 1280] = seed = (seed * 133 + 81) rem 65535
+      local.get 1
+      i32.const 1280
+      i32.add
+      local.get 0
+      i32.const 133
+      i32.mul
+      i32.const 81
+      i32.add
+      i32.const 65535
+      i32.rem_s
+      local.tee 0
+      i32.store
+      ;; second word: same recurrence, stored 4 bytes further on
+      local.get 1
+      i32.const 1284
+      i32.add
+      local.get 0
+      i32.const 133
+      i32.mul
+      i32.const 81
+      i32.add
+      i32.const 65535
+      i32.rem_s
+      local.tee 0
+      i32.store
+      ;; offset += 8; keep looping while the offset is still non-zero
+      local.get 1
+      i32.const 8
+      i32.add
+      local.tee 1
+      br_if 0 (;@1;)
+    end
+    ;; run the kernel, then leave the result of $jfdctint_return as return value
+    call $jfdctint_main
+    call $jfdctint_return)
+  ;; Function table with exactly one funcref slot (minimum 1, maximum 1).
+  (table (;0;) 1 1 funcref)
+  ;; Linear memory with an initial size of one page.
+  (memory (;0;) 1)
+  ;; Mutable stack pointer, initialised to 5376.
+  (global $__stack_pointer (mut i32) (i32.const 5376))
+  ;; Immutable globals, exported below as __data_end (1280) and
+  ;; __heap_base (5376).
+  (global (;1;) i32 (i32.const 1280))
+  (global (;2;) i32 (i32.const 5376))
+  (export "memory" (memory 0))
+  (export "__wasm_apply_data_relocs" (func $__wasm_apply_data_relocs))
+  ;; "entrypoint" runs only the kernel; "main" also initialises the data and
+  ;; calls $jfdctint_return.
+  (export "entrypoint" (func $jfdctint_main))
+  (export "main" (func $__original_main))
+  (export "__data_end" (global 1))
+  (export "__heap_base" (global 2)))
--- a/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/default/jfdctint.c
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/default/jfdctint.c
@ -0,0 +1,314 @@
+/*
+
+  This program is part of the TACLeBench benchmark suite.
+  Version V 1.x
+
+  Name: jfdctint
+
+  Author: Thomas G. Lane, Public domain JPEG source code.
+          Modified by Steven Li at Princeton University.
+
+  Function: JPEG slow-but-accurate integer implementation of the
+            forward  DCT (Discrete Cosine Transform) on a 8x8
+            pixel block [from original file documentations]
+
+   Copyright (C) 1991-1994, Thomas G. Lane.
+   This file is part of the Independent JPEG Group's software.
+   For conditions of distribution and use, see the accompanying README file.
+
+   This file contains a slow-but-accurate integer implementation of the
+   forward DCT (Discrete Cosine Transform).
+
+   A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+   on each column.  Direct algorithms are also available, but they are
+   much more complex and seem not to be any faster when reduced to code.
+
+   This implementation is based on an algorithm described in
+     C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+     Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+     Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+   The primary algorithm described there uses 11 multiplies and 29 adds.
+   We use their alternate method with 12 multiplies and 32 adds.
+   The advantage of this method is that no data path contains more than one
+   multiplication; this allows a very simple and accurate implementation in
+   scaled fixed-point arithmetic, with a minimal number of shifts.
+
+  Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
+          Collected and Modified by S.-S. Lim
+          Real-Time Research Group
+          Seoul National University
+
+  Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
+           added checksum calculation in jfdctint_return()
+
+  License: see README
+
+*/
+
+/*  COMMENTS: Long calculation sequences (i.e., long basic blocks),      */
+/*            single-nested loops.                                       */
+
+/**********************************************************************
+    Functions to be timed
+***********************************************************************/
+
+/* These definitions were added by Steven Li so as to bypass the header
+   files.
+*/
+
+// Wasm loop bounds
+
+/* Imported from the host module "__pragma" under the name "loopbound".
+   In this file it is called directly before each loop, passing that loop's
+   minimum and maximum iteration count. */
+__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
+__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);
+
+/* Edge length of the square sample block; also the row stride of the data. */
+#define DCTSIZE       8
+/* Rounded right shift: adds 2**(n-1) to x, then shifts right by n bits. */
+#define DESCALE(x, n) (((x) + (((int) 1) << ((n) - 1))) >> (n))
+
+/*
+   The poop on this scaling stuff is as follows:
+
+   Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+   larger than the true DCT outputs.  The final outputs are therefore
+   a factor of N larger than desired; since N=8 this can be cured by
+   a simple right shift at the end of the algorithm.  The advantage of
+   this arrangement is that we save two multiplications per 1-D DCT,
+   because the y0 and y4 outputs need not be divided by sqrt(N).
+   In the IJG code, this factor of 8 is removed by the quantization step
+   (in jcdctmgr.c), NOT in this module.
+
+   We have to do addition and subtraction of the integer inputs, which
+   is no problem, and multiplication by fractional constants, which is
+   a problem to do in integer arithmetic.  We multiply all the constants
+   by CONST_SCALE and convert them to integer constants (thus retaining
+   CONST_BITS (13) bits of precision in the constants).  After doing a
+   multiplication we have to divide the product by CONST_SCALE, with proper
+   rounding, to produce the correct output.  This division can be done
+   cheaply as a right shift of CONST_BITS (13) bits.  We postpone shifting
+   as long as possible so that partial sums can be added together with
+   full fractional precision.
+
+   The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
+   they are represented to better-than-integral precision.  These outputs
+   require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
+   16-bit word with the recommended scaling.  (For 12-bit sample data, the
+   intermediate array is int anyway.)
+
+   To avoid overflow of the 32-bit intermediate results in pass 2, we must
+   have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
+   Error analysis shows that the values given below are the most effective.
+*/
+
+/*
+  Forward declaration of functions
+*/
+
+/* Declared with (void) so that each declaration is a real prototype and
+   calls passing arguments are diagnosed by the compiler. */
+void jfdctint_init(void);
+int jfdctint_return(void);
+__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
+jfdctint_main(void);
+__attribute__((noinline)) __attribute__((export_name("main"))) int main(void);
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+   causing a lot of useless floating-point operations at run time.
+   To get around this we use the following pre-calculated constants.
+   If you change CONST_BITS you may want to add appropriate values.
+   (With a reasonable C compiler, you can just rely on the FIX() macro...)
+*/
+
+#define FIX_0_298631336 ((int) 2446)  /* FIX(0.298631336) */
+#define FIX_0_390180644 ((int) 3196)  /* FIX(0.390180644) */
+#define FIX_0_541196100 ((int) 4433)  /* FIX(0.541196100) */
+#define FIX_0_765366865 ((int) 6270)  /* FIX(0.765366865) */
+#define FIX_0_899976223 ((int) 7373)  /* FIX(0.899976223) */
+#define FIX_1_175875602 ((int) 9633)  /* FIX(1.175875602) */
+#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */
+
+/* Multiply an int variable by an int constant to yield an int result.
+   For 8-bit samples with the recommended scaling, all the variable
+   and constant values involved are no more than 16 bits wide, so a
+   16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+   For 12-bit samples, a full 32-bit multiplication will be needed.
+*/
+
+/* The sample block; filled by jfdctint_init() and transformed in place. */
+int jfdctint_data[64];
+
+/* Value jfdctint_return() compares the sum of all 64 elements against. */
+const int jfdctint_CHECKSUM = 1668124;
+
+/*
+   Fill jfdctint_data with a deterministic pseudo-random sequence.
+
+   Starting from seed 1, every element receives the next value of
+   seed = (seed * 133 + 81) % 65535, so each run sees the same input.
+*/
+void
+jfdctint_init(void) {
+    int i, seed;
+
+    /* Worst case settings */
+    /* Set array to random values */
+    seed = 1;
+
+    __pragma_loopbound(64, 64);
+    for (i = 0; i < 64; i++) {
+        seed = ((seed * 133) + 81) % 65535;
+        jfdctint_data[i] = seed;
+    }
+}
+
+/*
+   Check the contents of jfdctint_data.
+
+   Returns 0 when the sum of all 64 elements equals jfdctint_CHECKSUM,
+   and -1 otherwise.
+*/
+int
+jfdctint_return(void) {
+    int checksum = 0;
+    int i;
+    __pragma_loopbound(64, 64);
+    for (i = 0; i < 64; ++i)
+        checksum += jfdctint_data[i];
+    return ((checksum == jfdctint_CHECKSUM) ? 0 : -1);
+}
+
+/*
+   Perform the forward DCT on one block of samples.
+
+   The 8x8 block in jfdctint_data is transformed in place: pass 1 runs a
+   1-D DCT over each row, pass 2 runs a 1-D DCT over each column.
+*/
+
+void
+jfdctint_jpeg_fdct_islow(void) {
+    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int tmp10, tmp11, tmp12, tmp13;
+    int z1, z2, z3, z4, z5;
+    int *dataptr;
+    int ctr;
+
+    /* Pass 1: process rows. */
+    /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+    /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+    dataptr = jfdctint_data;
+    __pragma_loopbound(8, 8);
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+
+        tmp0 = dataptr[0] + dataptr[7];
+        tmp7 = dataptr[0] - dataptr[7];
+        tmp1 = dataptr[1] + dataptr[6];
+        tmp6 = dataptr[1] - dataptr[6];
+        tmp2 = dataptr[2] + dataptr[5];
+        tmp5 = dataptr[2] - dataptr[5];
+        tmp3 = dataptr[3] + dataptr[4];
+        tmp4 = dataptr[3] - dataptr[4];
+
+        /* Even-indexed outputs (0, 2, 4, 6) are built from tmp10..tmp13. */
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        /* Shift in unsigned arithmetic: left-shifting a negative int is
+           undefined behaviour in C, and tmp10 - tmp11 can be negative. */
+        dataptr[0] = (int) ((unsigned int) (tmp10 + tmp11) << PASS1_BITS);
+        dataptr[4] = (int) ((unsigned int) (tmp10 - tmp11) << PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100;
+        dataptr[2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                   CONST_BITS - PASS1_BITS);
+        dataptr[6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                   CONST_BITS - PASS1_BITS);
+
+        /* Odd-indexed outputs (1, 3, 5, 7) are built from tmp4..tmp7. */
+        z1 = tmp4 + tmp7;
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5;
+        z4 += z5;
+
+        dataptr[7] = (int) DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[5] = (int) DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+        dataptr[3] = (int) DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[1] = (int) DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
+
+        dataptr += DCTSIZE; /* advance pointer to next row */
+    }
+
+    /* Pass 2: process columns. */
+    /* The PASS1_BITS scaling applied in pass 1 is removed again here by */
+    /* descaling with PASS1_BITS resp. CONST_BITS + PASS1_BITS. */
+
+    dataptr = jfdctint_data;
+    __pragma_loopbound(8, 8);
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+        tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+        tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+        tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+        tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+        tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+        tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+        tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+        tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+        /* Even-indexed outputs (rows 0, 2, 4, 6 of this column). */
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        dataptr[DCTSIZE * 0] = (int) DESCALE(tmp10 + tmp11, PASS1_BITS);
+        dataptr[DCTSIZE * 4] = (int) DESCALE(tmp10 - tmp11, PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100;
+        dataptr[DCTSIZE * 2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                             CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                             CONST_BITS + PASS1_BITS);
+
+        /* Odd-indexed outputs (rows 1, 3, 5, 7 of this column). */
+        z1 = tmp4 + tmp7;
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5;
+        z4 += z5;
+
+        dataptr[DCTSIZE * 7] =
+            (int) DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 5] =
+            (int) DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 3] =
+            (int) DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 1] =
+            (int) DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
+
+        dataptr++; /* advance pointer to next column */
+    }
+}
+
+/* Main function
+   The function execution time is measured with a logic analyzer,
+   which measures the OFF time of an LED on the board.
+
+   The switching latency, including the function call/return time,
+   is measured to be equal to 1.1us (22 clock cycles).
+*/
+__attribute__((noinline, export_name("entrypoint"))) void
+jfdctint_main(void) {
+    /* The benchmark kernel is the only work done by the entry point. */
+    jfdctint_jpeg_fdct_islow();
+}
+
+__attribute__((noinline, export_name("main"))) int
+main(void) {
+    int status;
+
+    /* Prepare the input block, run the kernel, then check the result. */
+    jfdctint_init();
+    jfdctint_main();
+    status = jfdctint_return();
+
+    return status;
+}
--- a/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/inline/jfdctint.c
+++ b/targets/wasm-tacle/kernel/jfdctint/generated/modified_sources/inline/jfdctint.c
@ -0,0 +1,322 @@
+/*
+
+  This program is part of the TACLeBench benchmark suite.
+  Version V 1.x
+
+  Name: jfdctint
+
+  Author: Thomas G. Lane, Public domain JPEG source code.
+          Modified by Steven Li at Princeton University.
+
+  Function: JPEG slow-but-accurate integer implementation of the
+            forward  DCT (Discrete Cosine Transform) on a 8x8
+            pixel block [from original file documentations]
+
+   Copyright (C) 1991-1994, Thomas G. Lane.
+   This file is part of the Independent JPEG Group's software.
+   For conditions of distribution and use, see the accompanying README file.
+
+   This file contains a slow-but-accurate integer implementation of the
+   forward DCT (Discrete Cosine Transform).
+
+   A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+   on each column.  Direct algorithms are also available, but they are
+   much more complex and seem not to be any faster when reduced to code.
+
+   This implementation is based on an algorithm described in
+     C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+     Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+     Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+   The primary algorithm described there uses 11 multiplies and 29 adds.
+   We use their alternate method with 12 multiplies and 32 adds.
+   The advantage of this method is that no data path contains more than one
+   multiplication; this allows a very simple and accurate implementation in
+   scaled fixed-point arithmetic, with a minimal number of shifts.
+
+  Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
+          Collected and Modified by S.-S. Lim
+          Real-Time Research Group
+          Seoul National University
+
+  Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
+           added checksum calculation in jfdctint_return()
+
+  License: see README
+
+*/
+
+/*  COMMENTS: Long calculation sequences (i.e., long basic blocks),      */
+/*            single-nested loops.                                       */
+
+/**********************************************************************
+    Functions to be timed
+***********************************************************************/
+
+/* These definitions were added by Steven Li so as to bypass the header
+   files.
+*/
+
+// Wasm loop bounds
+
+
+
+
+/* Imported from the host module "__pragma" under the name "loopbound".
+   In this file it is called directly before each loop, passing that loop's
+   minimum and maximum iteration count. */
+__attribute__((import_module("__pragma"), import_name("loopbound"))) extern void
+__pragma_loopbound(unsigned int min_bound, unsigned int max_bound);
+
+/* Edge length of the square sample block; also the row stride of the data. */
+#define DCTSIZE       8
+/* Rounded right shift: adds 2**(n-1) to x, then shifts right by n bits. */
+#define DESCALE(x, n) (((x) + (((int) 1) << ((n) - 1))) >> (n))
+
+/*
+   The poop on this scaling stuff is as follows:
+
+   Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+   larger than the true DCT outputs.  The final outputs are therefore
+   a factor of N larger than desired; since N=8 this can be cured by
+   a simple right shift at the end of the algorithm.  The advantage of
+   this arrangement is that we save two multiplications per 1-D DCT,
+   because the y0 and y4 outputs need not be divided by sqrt(N).
+   In the IJG code, this factor of 8 is removed by the quantization step
+   (in jcdctmgr.c), NOT in this module.
+
+   We have to do addition and subtraction of the integer inputs, which
+   is no problem, and multiplication by fractional constants, which is
+   a problem to do in integer arithmetic.  We multiply all the constants
+   by CONST_SCALE and convert them to integer constants (thus retaining
+   CONST_BITS (13) bits of precision in the constants).  After doing a
+   multiplication we have to divide the product by CONST_SCALE, with proper
+   rounding, to produce the correct output.  This division can be done
+   cheaply as a right shift of CONST_BITS (13) bits.  We postpone shifting
+   as long as possible so that partial sums can be added together with
+   full fractional precision.
+
+   The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
+   they are represented to better-than-integral precision.  These outputs
+   require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
+   16-bit word with the recommended scaling.  (For 12-bit sample data, the
+   intermediate array is int anyway.)
+
+   To avoid overflow of the 32-bit intermediate results in pass 2, we must
+   have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
+   Error analysis shows that the values given below are the most effective.
+*/
+
+/*
+  Forward declaration of functions
+*/
+
+/* Declared with (void) so that each declaration is a real prototype; the
+   noinline/export_name attributes are listed once per function. */
+__attribute__((always_inline)) static inline void jfdctint_init(void);
+__attribute__((always_inline)) static inline int jfdctint_return(void);
+__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
+jfdctint_main(void);
+__attribute__((noinline)) __attribute__((export_name("main"))) int
+main(void);
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+   causing a lot of useless floating-point operations at run time.
+   To get around this we use the following pre-calculated constants.
+   If you change CONST_BITS you may want to add appropriate values.
+   (With a reasonable C compiler, you can just rely on the FIX() macro...)
+*/
+
+#define FIX_0_298631336 ((int) 2446)  /* FIX(0.298631336) */
+#define FIX_0_390180644 ((int) 3196)  /* FIX(0.390180644) */
+#define FIX_0_541196100 ((int) 4433)  /* FIX(0.541196100) */
+#define FIX_0_765366865 ((int) 6270)  /* FIX(0.765366865) */
+#define FIX_0_899976223 ((int) 7373)  /* FIX(0.899976223) */
+#define FIX_1_175875602 ((int) 9633)  /* FIX(1.175875602) */
+#define FIX_1_501321110 ((int) 12299) /* FIX(1.501321110) */
+#define FIX_1_847759065 ((int) 15137) /* FIX(1.847759065) */
+#define FIX_1_961570560 ((int) 16069) /* FIX(1.961570560) */
+#define FIX_2_053119869 ((int) 16819) /* FIX(2.053119869) */
+#define FIX_2_562915447 ((int) 20995) /* FIX(2.562915447) */
+#define FIX_3_072711026 ((int) 25172) /* FIX(3.072711026) */
+
+/* Multiply an int variable by an int constant to yield an int result.
+   For 8-bit samples with the recommended scaling, all the variable
+   and constant values involved are no more than 16 bits wide, so a
+   16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+   For 12-bit samples, a full 32-bit multiplication will be needed.
+*/
+
+/* The sample block; filled by jfdctint_init() and transformed in place. */
+int jfdctint_data[64];
+
+/* Value jfdctint_return() compares the sum of all 64 elements against. */
+const int jfdctint_CHECKSUM = 1668124;
+
+/*
+   Fill jfdctint_data with a deterministic pseudo-random sequence.
+
+   Starting from seed 1, every element receives the next value of
+   seed = (seed * 133 + 81) % 65535, so each run sees the same input.
+*/
+__attribute__((always_inline)) static inline void
+jfdctint_init(void) {
+    int i, seed;
+
+    /* Worst case settings */
+    /* Set array to random values */
+    seed = 1;
+
+    __pragma_loopbound(64, 64);
+    for (i = 0; i < 64; i++) {
+        seed = ((seed * 133) + 81) % 65535;
+        jfdctint_data[i] = seed;
+    }
+}
+
+/*
+   Check the contents of jfdctint_data.
+
+   Returns 0 when the sum of all 64 elements equals jfdctint_CHECKSUM,
+   and -1 otherwise.
+*/
+__attribute__((always_inline)) static inline int
+jfdctint_return(void) {
+    int checksum = 0;
+    int i;
+    __pragma_loopbound(64, 64);
+    for (i = 0; i < 64; ++i)
+        checksum += jfdctint_data[i];
+    return ((checksum == jfdctint_CHECKSUM) ? 0 : -1);
+}
+
+/*
+   Perform the forward DCT on one block of samples.
+
+   The 8x8 block in jfdctint_data is transformed in place: pass 1 runs a
+   1-D DCT over each row, pass 2 runs a 1-D DCT over each column.
+*/
+
+__attribute__((always_inline)) static inline void
+jfdctint_jpeg_fdct_islow(void) {
+    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int tmp10, tmp11, tmp12, tmp13;
+    int z1, z2, z3, z4, z5;
+    int *dataptr;
+    int ctr;
+
+    /* Pass 1: process rows. */
+    /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+    /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+    dataptr = jfdctint_data;
+    __pragma_loopbound(8, 8);
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+
+        tmp0 = dataptr[0] + dataptr[7];
+        tmp7 = dataptr[0] - dataptr[7];
+        tmp1 = dataptr[1] + dataptr[6];
+        tmp6 = dataptr[1] - dataptr[6];
+        tmp2 = dataptr[2] + dataptr[5];
+        tmp5 = dataptr[2] - dataptr[5];
+        tmp3 = dataptr[3] + dataptr[4];
+        tmp4 = dataptr[3] - dataptr[4];
+
+        /* Even-indexed outputs (0, 2, 4, 6) are built from tmp10..tmp13. */
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        /* Shift in unsigned arithmetic: left-shifting a negative int is
+           undefined behaviour in C, and tmp10 - tmp11 can be negative. */
+        dataptr[0] = (int) ((unsigned int) (tmp10 + tmp11) << PASS1_BITS);
+        dataptr[4] = (int) ((unsigned int) (tmp10 - tmp11) << PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100;
+        dataptr[2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                   CONST_BITS - PASS1_BITS);
+        dataptr[6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                   CONST_BITS - PASS1_BITS);
+
+        /* Odd-indexed outputs (1, 3, 5, 7) are built from tmp4..tmp7. */
+        z1 = tmp4 + tmp7;
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5;
+        z4 += z5;
+
+        dataptr[7] = (int) DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[5] = (int) DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+        dataptr[3] = (int) DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+        dataptr[1] = (int) DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
+
+        dataptr += DCTSIZE; /* advance pointer to next row */
+    }
+
+    /* Pass 2: process columns. */
+    /* The PASS1_BITS scaling applied in pass 1 is removed again here by */
+    /* descaling with PASS1_BITS resp. CONST_BITS + PASS1_BITS. */
+
+    dataptr = jfdctint_data;
+    __pragma_loopbound(8, 8);
+    for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+        tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+        tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+        tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+        tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+        tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+        tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+        tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+        tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
+
+        /* Even-indexed outputs (rows 0, 2, 4, 6 of this column). */
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+
+        dataptr[DCTSIZE * 0] = (int) DESCALE(tmp10 + tmp11, PASS1_BITS);
+        dataptr[DCTSIZE * 4] = (int) DESCALE(tmp10 - tmp11, PASS1_BITS);
+
+        z1 = (tmp12 + tmp13) * FIX_0_541196100;
+        dataptr[DCTSIZE * 2] = (int) DESCALE(z1 + tmp13 * FIX_0_765366865,
+                                             CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 6] = (int) DESCALE(z1 + tmp12 * (-FIX_1_847759065),
+                                             CONST_BITS + PASS1_BITS);
+
+        /* Odd-indexed outputs (rows 1, 3, 5, 7 of this column). */
+        z1 = tmp4 + tmp7;
+        z2 = tmp5 + tmp6;
+        z3 = tmp4 + tmp6;
+        z4 = tmp5 + tmp7;
+        z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+        tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+        tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+        tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+        tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+        z1 = z1 * (-FIX_0_899976223);  /* sqrt(2) * (c7-c3) */
+        z2 = z2 * (-FIX_2_562915447);  /* sqrt(2) * (-c1-c3) */
+        z3 = z3 * (-FIX_1_961570560);  /* sqrt(2) * (-c3-c5) */
+        z4 = z4 * (-FIX_0_390180644);  /* sqrt(2) * (c5-c3) */
+
+        z3 += z5;
+        z4 += z5;
+
+        dataptr[DCTSIZE * 7] =
+            (int) DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 5] =
+            (int) DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 3] =
+            (int) DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
+        dataptr[DCTSIZE * 1] =
+            (int) DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
+
+        dataptr++; /* advance pointer to next column */
+    }
+}
+
+/* Main function
+   The function execution time is measured with a logic analyzer,
+   which measures the OFF time of an LED on the board.
+
+   The switching latency, including the function call/return time,
+   is measured to be equal to 1.1us (22 clock cycles).
+*/
+__attribute__((noinline)) __attribute__((export_name("entrypoint"))) void
+jfdctint_main(void) {
+    /* The benchmark kernel is the only work done by the entry point. */
+    jfdctint_jpeg_fdct_islow();
+}
+
+__attribute__((noinline)) __attribute__((export_name("main"))) int
+main(void) {
+    /* Prepare the input block, run the kernel, then return the checksum
+       verdict of jfdctint_return() (0 on match, -1 otherwise). */
+    jfdctint_init();
+    jfdctint_main();
+
+    return (jfdctint_return());
+}
--- a/targets/wasm-tacle/kernel/jfdctint/jfdctint.c
+++ b/targets/wasm-tacle/kernel/jfdctint/jfdctint.c
@ -0,0 +1,319 @@
+/*
+
+  This program is part of the TACLeBench benchmark suite.
+  Version V 1.x
+
+  Name: jfdctint
+
+  Author: Thomas G. Lane, Public domain JPEG source code.
+          Modified by Steven Li at Princeton University.
+
+  Function: JPEG slow-but-accurate integer implementation of the
+            forward DCT (Discrete Cosine Transform) on an 8x8
+            pixel block [from original file documentation]
+
+   Copyright (C) 1991-1994, Thomas G. Lane.
+   This file is part of the Independent JPEG Group's software.
+   For conditions of distribution and use, see the accompanying README file.
+
+   This file contains a slow-but-accurate integer implementation of the
+   forward DCT (Discrete Cosine Transform).
+
+   A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
+   on each column.  Direct algorithms are also available, but they are
+   much more complex and seem not to be any faster when reduced to code.
+
+   This implementation is based on an algorithm described in
+     C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
+     Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
+     Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
+   The primary algorithm described there uses 11 multiplies and 29 adds.
+   We use their alternate method with 12 multiplies and 32 adds.
+   The advantage of this method is that no data path contains more than one
+   multiplication; this allows a very simple and accurate implementation in
+   scaled fixed-point arithmetic, with a minimal number of shifts.
+
+  Source: SNU-RT Benchmark Suite for Worst Case Timing Analysis
+          Collected and Modified by S.-S. Lim
+          Real-Time Research Group
+          Seoul National University
+
+  Changes: Moved initialisation code from jfdctint_main() to jfdctint_init(),
+           added checksum calculation in jfdctint_return()
+
+  License: see README
+
+*/
+
+
+/*  COMMENTS: Long calculation sequences (i.e., long basic blocks),      */
+/*            single-nested loops.                                       */
+
+/**********************************************************************
+    Functions to be timed
+***********************************************************************/
+
+/* These definitions were added by Steven Li so as to bypass the header
+   files.
+*/
+
+/* Number of rows and of columns in the sample block; both DCT passes
+   below iterate DCTSIZE times over DCTSIZE values each. */
+#define DCTSIZE 8
+/* Scale x down by 2**n with rounding: adds 2**(n-1), then shifts right
+   by n bits. n must be at least 1 and is expanded more than once, so it
+   should be free of side effects. For negative x the result relies on
+   the compiler's (implementation-defined) signed right shift. */
+#define DESCALE(x,n)  (((x) + (((int)1) << ((n)-1))) >> (n))
+
+/*
+   The poop on this scaling stuff is as follows:
+
+   Each 1-D DCT step produces outputs which are a factor of sqrt(N)
+   larger than the true DCT outputs.  The final outputs are therefore
+   a factor of N larger than desired; since N=8 this can be cured by
+   a simple right shift at the end of the algorithm.  The advantage of
+   this arrangement is that we save two multiplications per 1-D DCT,
+   because the y0 and y4 outputs need not be divided by sqrt(N).
+   In the IJG code, this factor of 8 is removed by the quantization step
+   (in jcdctmgr.c), NOT in this module.
+
+   We have to do addition and subtraction of the integer inputs, which
+   is no problem, and multiplication by fractional constants, which is
+   a problem to do in integer arithmetic.  We multiply all the constants
+   by CONST_SCALE and convert them to integer constants (thus retaining
+   CONST_BITS (13) bits of precision in the constants).  After doing a
+   multiplication we have to divide the product by CONST_SCALE, with proper
+   rounding, to produce the correct output.  This division can be done
+   cheaply as a right shift of CONST_BITS (13) bits.  We postpone shifting
+   as long as possible so that partial sums can be added together with
+   full fractional precision.
+
+   The outputs of the first pass are scaled up by PASS1_BITS (2) bits so that
+   they are represented to better-than-integral precision.  These outputs
+   require BITS_IN_JSAMPLE (8) + PASS1_BITS (2) + 3 bits; this fits in a
+   16-bit word with the recommended scaling.  (For 12-bit sample data, the
+   intermediate array is int anyway.)
+
+   To avoid overflow of the 32-bit intermediate results in pass 2, we must
+   have BITS_IN_JSAMPLE (8) + CONST_BITS (13) + PASS1_BITS (2) <= 26.
+   Error analysis shows that the values given below are the most effective.
+*/
+
+/*
+  Forward declaration of functions
+*/
+
+/* All declarations use an explicit ( void ) parameter list so that they
+   are real prototypes and calls with arguments are rejected. */
+void jfdctint_init( void );
+int jfdctint_return( void );
+void jfdctint_jpeg_fdct_islow( void );
+void jfdctint_main( void );
+int main( void );
+
+
+/* Number of fractional bits carried by the FIX_* constants below. */
+#define CONST_BITS  13
+/* Number of extra bits by which the pass-1 (row) results are scaled up. */
+#define PASS1_BITS  2
+
+/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
+   causing a lot of useless floating-point operations at run time.
+   To get around this we use the following pre-calculated constants.
+   If you change CONST_BITS you may want to add appropriate values.
+   (With a reasonable C compiler, you can just rely on the FIX() macro...)
+
+   Each value below equals the fraction named in its comment multiplied
+   by 2**CONST_BITS (8192) and rounded to the nearest integer, e.g.
+   0.298631336 * 8192 = 2446.39 -> 2446. No FIX() macro is defined in
+   this file; only the precomputed values are used.
+*/
+
+#define FIX_0_298631336  ((int)  2446)  /* FIX(0.298631336) */
+#define FIX_0_390180644  ((int)  3196)  /* FIX(0.390180644) */
+#define FIX_0_541196100  ((int)  4433)  /* FIX(0.541196100) */
+#define FIX_0_765366865  ((int)  6270)  /* FIX(0.765366865) */
+#define FIX_0_899976223  ((int)  7373)  /* FIX(0.899976223) */
+#define FIX_1_175875602  ((int)  9633)  /* FIX(1.175875602) */
+#define FIX_1_501321110  ((int)  12299) /* FIX(1.501321110) */
+#define FIX_1_847759065  ((int)  15137) /* FIX(1.847759065) */
+#define FIX_1_961570560  ((int)  16069) /* FIX(1.961570560) */
+#define FIX_2_053119869  ((int)  16819) /* FIX(2.053119869) */
+#define FIX_2_562915447  ((int)  20995) /* FIX(2.562915447) */
+#define FIX_3_072711026  ((int)  25172) /* FIX(3.072711026) */
+
+
+/* Multiply an int variable by an int constant to yield an int result.
+   For 8-bit samples with the recommended scaling, all the variable
+   and constant values involved are no more than 16 bits wide, so a
+   16x16->32 bit multiply can be used instead of a full 32x32 multiply.
+   For 12-bit samples, a full 32-bit multiplication will be needed.
+*/
+
+
+/* Sample block of 64 values: filled by jfdctint_init(), transformed in
+   place by jfdctint_jpeg_fdct_islow() and summed by jfdctint_return(). */
+int jfdctint_data[ 64 ];
+
+
+/* Sum over all jfdctint_data entries that jfdctint_return() accepts as
+   the correct benchmark result. */
+const int jfdctint_CHECKSUM = 1668124;
+
+/*
+  Fill jfdctint_data with the fixed pseudo-random input block.
+
+  The values come from the recurrence value = (value * 133 + 81) % 65535,
+  started at 1, so every run of the benchmark sees the same 64 inputs.
+*/
+void jfdctint_init()
+{
+  int idx;
+  int value = 1;  /* generator state; the first stored element is 214 */
+
+  _Pragma( "loopbound min 64 max 64" )
+  for ( idx = 0; idx < 64; ++idx ) {
+    value = ( value * 133 + 81 ) % 65535;
+    jfdctint_data[ idx ] = value;
+  }
+}
+
+
+/*
+  Verify the benchmark result.
+
+  Adds up all 64 entries of jfdctint_data and compares the total with
+  jfdctint_CHECKSUM. Returns 0 when they match and -1 otherwise.
+*/
+int jfdctint_return()
+{
+  int sum = 0;
+  int idx;
+
+  _Pragma( "loopbound min 64 max 64" )
+  for ( idx = 0; idx < 64; idx++ ) {
+    sum += jfdctint_data[ idx ];
+  }
+
+  if ( sum != jfdctint_CHECKSUM )
+    return ( -1 );
+
+  return ( 0 );
+}
+
+
+/*
+   Perform the forward DCT on one block of samples.
+
+   Works in place on the global jfdctint_data array, which is treated as
+   DCTSIZE rows of DCTSIZE consecutive values. Pass 1 transforms every
+   row, pass 2 transforms every column of the pass-1 result. The function
+   takes no arguments and returns nothing; the transformed block is left
+   in jfdctint_data.
+*/
+
+void jfdctint_jpeg_fdct_islow( void )
+{
+  /* tmp0..tmp3 hold sums and tmp4..tmp7 differences of mirrored inputs */
+  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  /* second butterfly stage, built from tmp0..tmp3 only */
+  int tmp10, tmp11, tmp12, tmp13;
+  /* scratch values for the products with the FIX_* constants */
+  int z1, z2, z3, z4, z5;
+  /* start of the row (pass 1) or column (pass 2) being transformed */
+  int *dataptr;
+  /* pure iteration counter; it is never used as an index */
+  int ctr;
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  dataptr = jfdctint_data;
+  _Pragma( "loopbound min 8 max 8" )
+  for ( ctr = DCTSIZE - 1; ctr >= 0; ctr-- ) {
+
+    /* First stage: combine each element k with its mirror element 7-k. */
+    tmp0 = dataptr[ 0 ] + dataptr[ 7 ];
+    tmp7 = dataptr[ 0 ] - dataptr[ 7 ];
+    tmp1 = dataptr[ 1 ] + dataptr[ 6 ];
+    tmp6 = dataptr[ 1 ] - dataptr[ 6 ];
+    tmp2 = dataptr[ 2 ] + dataptr[ 5 ];
+    tmp5 = dataptr[ 2 ] - dataptr[ 5 ];
+    tmp3 = dataptr[ 3 ] + dataptr[ 4 ];
+    tmp4 = dataptr[ 3 ] - dataptr[ 4 ];
+
+    /* Even-indexed outputs (0, 2, 4, 6) are derived from the sums. */
+    tmp10 = tmp0 + tmp3;
+    tmp13 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp1 - tmp2;
+
+    /* No constant is involved here, so only the PASS1_BITS up-scaling
+       is applied. */
+    dataptr[ 0 ] = ( int ) ( ( tmp10 + tmp11 ) << PASS1_BITS );
+    dataptr[ 4 ] = ( int ) ( ( tmp10 - tmp11 ) << PASS1_BITS );
+
+    /* Products with FIX_* constants carry CONST_BITS fractional bits;
+       descaling by CONST_BITS - PASS1_BITS keeps PASS1_BITS of them. */
+    z1 = ( tmp12 + tmp13 ) * FIX_0_541196100;
+    dataptr[ 2 ] = ( int ) DESCALE( z1 + tmp13 * FIX_0_765366865,
+                                    CONST_BITS - PASS1_BITS );
+    dataptr[ 6 ] = ( int ) DESCALE( z1 + tmp12 * ( - FIX_1_847759065 ),
+                                    CONST_BITS - PASS1_BITS );
+
+    /* Odd-indexed outputs (1, 3, 5, 7) are derived from the differences. */
+    z1 = tmp4 + tmp7;
+    z2 = tmp5 + tmp6;
+    z3 = tmp4 + tmp6;
+    z4 = tmp5 + tmp7;
+    z5 = ( z3 + z4 ) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+    tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = z1 * ( - FIX_0_899976223 ); /* sqrt(2) * (c7-c3) */
+    z2 = z2 * ( - FIX_2_562915447 ); /* sqrt(2) * (-c1-c3) */
+    z3 = z3 * ( - FIX_1_961570560 ); /* sqrt(2) * (-c3-c5) */
+    z4 = z4 * ( - FIX_0_390180644 ); /* sqrt(2) * (c5-c3) */
+
+    /* z5 is shared by both z3 and z4, so it is multiplied only once. */
+    z3 += z5;
+    z4 += z5;
+
+    dataptr[ 7 ] = ( int ) DESCALE( tmp4 + z1 + z3, CONST_BITS - PASS1_BITS );
+    dataptr[ 5 ] = ( int ) DESCALE( tmp5 + z2 + z4, CONST_BITS - PASS1_BITS );
+    dataptr[ 3 ] = ( int ) DESCALE( tmp6 + z2 + z3, CONST_BITS - PASS1_BITS );
+    dataptr[ 1 ] = ( int ) DESCALE( tmp7 + z1 + z4, CONST_BITS - PASS1_BITS );
+
+    dataptr += DCTSIZE;   /* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns. */
+  /* Same computation as pass 1, but the elements of a column are */
+  /* DCTSIZE entries apart and the descaling additionally removes the */
+  /* PASS1_BITS scaling introduced by pass 1. */
+
+  dataptr = jfdctint_data;
+  _Pragma( "loopbound min 8 max 8" )
+  for ( ctr = DCTSIZE - 1; ctr >= 0; ctr-- ) {
+    /* First stage: combine each row k of this column with row 7-k. */
+    tmp0 = dataptr[ DCTSIZE * 0 ] + dataptr[ DCTSIZE * 7 ];
+    tmp7 = dataptr[ DCTSIZE * 0 ] - dataptr[ DCTSIZE * 7 ];
+    tmp1 = dataptr[ DCTSIZE * 1 ] + dataptr[ DCTSIZE * 6 ];
+    tmp6 = dataptr[ DCTSIZE * 1 ] - dataptr[ DCTSIZE * 6 ];
+    tmp2 = dataptr[ DCTSIZE * 2 ] + dataptr[ DCTSIZE * 5 ];
+    tmp5 = dataptr[ DCTSIZE * 2 ] - dataptr[ DCTSIZE * 5 ];
+    tmp3 = dataptr[ DCTSIZE * 3 ] + dataptr[ DCTSIZE * 4 ];
+    tmp4 = dataptr[ DCTSIZE * 3 ] - dataptr[ DCTSIZE * 4 ];
+
+    /* Even-indexed outputs (0, 2, 4, 6) are derived from the sums. */
+    tmp10 = tmp0 + tmp3;
+    tmp13 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp1 - tmp2;
+
+    /* No constant is involved here: only the pass-1 scaling is removed. */
+    dataptr[ DCTSIZE * 0 ] = ( int ) DESCALE( tmp10 + tmp11, PASS1_BITS );
+    dataptr[ DCTSIZE * 4 ] = ( int ) DESCALE( tmp10 - tmp11, PASS1_BITS );
+
+    /* Products with FIX_* constants: remove CONST_BITS and PASS1_BITS. */
+    z1 = ( tmp12 + tmp13 ) * FIX_0_541196100;
+    dataptr[ DCTSIZE * 2 ] = ( int ) DESCALE( z1 + tmp13 * FIX_0_765366865,
+                             CONST_BITS + PASS1_BITS );
+    dataptr[ DCTSIZE * 6 ] = ( int ) DESCALE( z1
+                             + tmp12 * ( - FIX_1_847759065 ),
+                             CONST_BITS + PASS1_BITS );
+
+    /* Odd-indexed outputs (1, 3, 5, 7) are derived from the differences. */
+    z1 = tmp4 + tmp7;
+    z2 = tmp5 + tmp6;
+    z3 = tmp4 + tmp6;
+    z4 = tmp5 + tmp7;
+    z5 = ( z3 + z4 ) * FIX_1_175875602; /* sqrt(2) * c3 */
+
+    tmp4 = tmp4 * FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = tmp5 * FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = tmp6 * FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = tmp7 * FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = z1 * ( - FIX_0_899976223 ); /* sqrt(2) * (c7-c3) */
+    z2 = z2 * ( - FIX_2_562915447 ); /* sqrt(2) * (-c1-c3) */
+    z3 = z3 * ( - FIX_1_961570560 ); /* sqrt(2) * (-c3-c5) */
+    z4 = z4 * ( - FIX_0_390180644 ); /* sqrt(2) * (c5-c3) */
+
+    /* z5 is shared by both z3 and z4, so it is multiplied only once. */
+    z3 += z5;
+    z4 += z5;
+
+    dataptr[ DCTSIZE * 7 ] = ( int ) DESCALE( tmp4 + z1 + z3,
+                             CONST_BITS + PASS1_BITS );
+    dataptr[ DCTSIZE * 5 ] = ( int ) DESCALE( tmp5 + z2 + z4,
+                             CONST_BITS + PASS1_BITS );
+    dataptr[ DCTSIZE * 3 ] = ( int ) DESCALE( tmp6 + z2 + z3,
+                             CONST_BITS + PASS1_BITS );
+    dataptr[ DCTSIZE * 1 ] = ( int ) DESCALE( tmp7 + z1 + z4,
+                             CONST_BITS + PASS1_BITS );
+
+    dataptr++;      /* advance pointer to next column */
+  }
+
+}
+
+/* Main function
+   Benchmark entry point, marked with the "entrypoint" pragma. Calls
+   jfdctint_jpeg_fdct_islow() exactly once.
+
+   Historical note from the original benchmark: the execution time was
+   measured with a logic analyzer observing the OFF time of an on-board LED.
+
+   The switching latency, including the function call/return time,
+   was measured to be equal to 1.1us (22 clock cycles).
+*/
+void _Pragma ( "entrypoint" ) jfdctint_main( void )
+{
+  jfdctint_jpeg_fdct_islow();
+}
+
+
+/*
+  Program driver: initialise the input block, run the benchmark entry
+  point once and report the checksum verdict as the exit status
+  (0 on success, -1 on mismatch, as returned by jfdctint_return()).
+*/
+int main( void )
+{
+  int status;
+
+  jfdctint_init();
+  jfdctint_main();
+  status = jfdctint_return();
+
+  return ( status );
+}