diff --git a/src/experiments/perf-test/CMakeLists.txt b/src/experiments/perf-test/CMakeLists.txt new file mode 100644 index 00000000..fd32857f --- /dev/null +++ b/src/experiments/perf-test/CMakeLists.txt @@ -0,0 +1,17 @@ +set(EXPERIMENT_NAME perf-test) +set(EXPERIMENT_TYPE PerfTestExperiment) +configure_file(../instantiate-experiment.ah.in + ${CMAKE_CURRENT_BINARY_DIR}/instantiate-${EXPERIMENT_NAME}.ah @ONLY +) + +#experiment sources +set(MY_EXPERIMENT_SRCS + experiment.hpp + experiment.cc +) + +#### include directories #### +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +## build library +add_library(fail-${EXPERIMENT_NAME} ${MY_EXPERIMENT_SRCS}) diff --git a/src/experiments/perf-test/experiment.cc b/src/experiments/perf-test/experiment.cc new file mode 100644 index 00000000..0671f2d2 --- /dev/null +++ b/src/experiments/perf-test/experiment.cc @@ -0,0 +1,59 @@ +#include "util/Logger.hpp" +#include "util/WallclockTimer.hpp" + +#include "experiment.hpp" +#include "sal/SALInst.hpp" +#include "sal/Listener.hpp" +#include "config/FailConfig.hpp" + +// Check if configuration dependencies are satisfied: +#if !defined(CONFIG_EVENT_BREAKPOINTS) + #error This experiment just needs breakpoints. You may want to enable Fast-Breakpoints as well. +#endif + +using namespace std; +using namespace fail; + +/** Registers the listeners of the case selected by the preprocessor switch below, resumes the simulator, logs the wall-clock time measured until control returns, terminates the simulator and returns true. */ +bool PerfTestExperiment::run() +{ + Logger log("PERF", false); + log << "Experiment started (measuring ellapsed time using wallclock timer)..." << endl; + + // Performance tests: + WallclockTimer tm; + tm.startTimer(); +#if 1 + log << "Activated: CASE A (Best-Case)..." << endl; + // Case A): A lot of non-BP listeners and only one (or no) BP listener: + const unsigned NON_BP_COUNT = 50; + log << "Adding " << NON_BP_COUNT << " non-BP listeners..." 
<< endl; + MemReadListener mrl[NON_BP_COUNT]; + for (unsigned i = 0; i < NON_BP_COUNT; ++i) { + mrl[i].setWatchAddress(static_cast<address_t>(-1)); /* NOTE(review): the cast's target type was missing in this patch text; address_t is assumed -- confirm against setWatchAddress() */ + simulator.addListener(&mrl[i]); + } + log << "Adding one breakpoint listener and returning to simulator..." << endl; + BPSingleListener bp(0x00003c34); + simulator.addListenerAndResume(&bp); +#else + log << "Activated: CASE B (Worst-Case)..." << endl; + // Case B): n (non matching) BP listeners and no other listener types + const unsigned BP_COUNT = 50; + log << "Adding " << BP_COUNT << " BPSingleListeners..." << endl; + BPSingleListener bsl[BP_COUNT]; + for (unsigned i = 0; i < BP_COUNT; ++i) { + bsl[i].setWatchInstructionPointer(0xFFFFFFF); // we do not want them to trigger... + simulator.addListener(&bsl[i]); + } + log << "Adding final BPSingleListener and continuing simulation..." << endl; + // This is required to terminate the experiment: + BPSingleListener final(0x00003c34); + simulator.addListenerAndResume(&final); +#endif + + tm.stopTimer(); + log << "Time elapsed: " << tm << "s. Done, Bye!" 
<< endl; + simulator.terminate(); + return true; +} diff --git a/src/experiments/perf-test/experiment.hpp b/src/experiments/perf-test/experiment.hpp new file mode 100644 index 00000000..3c54098a --- /dev/null +++ b/src/experiments/perf-test/experiment.hpp @@ -0,0 +1,13 @@ +#ifndef __PERF_TEST_EXPERIMENT_HPP__ + #define __PERF_TEST_EXPERIMENT_HPP__ + +#include "efw/ExperimentFlow.hpp" + +class PerfTestExperiment : public fail::ExperimentFlow { +public: + PerfTestExperiment() { } + + bool run(); +}; + +#endif // __PERF_TEST_EXPERIMENT_HPP__ diff --git a/src/experiments/perf-test/results.txt b/src/experiments/perf-test/results.txt new file mode 100644 index 00000000..7775548e --- /dev/null +++ b/src/experiments/perf-test/results.txt @@ -0,0 +1,103 @@ +**************************************************************************************************** + RESULTS: +**************************************************************************************************** + (A) WITH FAST_BREAKPOINTS (Default mode): + +hsc-simple (r1636) - phase 1: +real 1m8.604s +user 1m8.384s +sys 0m0.132s + +hsc-simple (r1636) - phase 2: +real 0m0.591s +user 0m0.064s +sys 0m0.076s + +perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50): +Case A: 511.46s (= ~9min, around 5,6 times faster than (B).a) +Case B: 4731.53s (= ~79min, around 1,1 times slower than (B).b) + +---------------------------------------------- + + (B) WITHOUT FAST_BREAKPOINTS (Default mode): + +hsc-simple (r1636) - phase 1: +real 0m34.712s +user 0m34.246s +sys 0m00.148s + +hsc-simple (r1636) - phase 2: +real 0m0.429s +user 0m0.048s +sys 0m0.084s + +perf-test (r1745): Best- vs. 
Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50): +Case A: 2853.63s (= 47min) +Case B: 4214.03s (= 70min) + +---------------------------------------------- + + (C) WITH FAST_BREAKPOINTS (Release mode): + +hsc-simple (r1636) - phase 1: +real 0m13.341s +user 0m12.377s +sys 0m00.168s + +hsc-simple (r1636) - phase 2: +real 0m0.506s +user 0m0.032s +sys 0m0.100s + +perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50): +Case A: 43.0115s (< 1min, around 7,5 times faster than (D).a) +Case B: 385.547s (= ~6min, around 1,5 times faster than (D).b) + +---------------------------------------------- + + (D) WITHOUT FAST_BREAKPOINTS (Release mode): + +hsc-simple (r1636) - phase 1: +real 0m28.806s +user 0m28.214s +sys 0m00.160s + +hsc-simple (r1636) - phase 2: +real 0m0.565s +user 0m0.052s +sys 0m0.084s + +perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50): +Case A: 321.594s (= ~5min) +Case B: 587.698s (= ~9min) + +**************************************************************************************************** + EVALUATION: +**************************************************************************************************** +Note: These are just exemplary results based on the observed values (see above). + +- The (former) BufferCaches enabled a speedup of up to 2,5x (according to Martin Unzer). +- hsc-simple: Fast-Breakpoints are only faster if compiled in Release mode (yields a + speedup of up to 2x). +- hsc-simple: Unfortunately, they are also slower by a factor of 2, if compiled in + Default-Mode (and probably in Debug mode, too). +- perf-test: Except for case B in Default mode, Fast-Breakpoints enable a speedup that + ranges from 1,5 to 7,5! For case B (in Default mode -> no optimization), the Fast- + Breakpoint implementation slows down the overall execution speed by a factor of (only) + 1,1. 
However, for case A (Best-Case) we assume that the overall speedup (compared to + the corresponding case where Fast-Breakpoints are switched off) will tend to rise + when the experiment parameter NON_BP_COUNT is increased. + +**************************************************************************************************** + POSSIBLE OPTIMIZATIONS: +**************************************************************************************************** +Note: The following observations and conjectures are partly derived from the analysis of the +callgrind profile (using kcachegrind). + + (i) gather() should be inlined. (At the moment, it is not inlined in order to avoid an include cycle.) + (ii) Bypass the construction of a ResultSet object (the bypass would avoid an additional iteration + over the elements stored in the ResultSet itself), by calling makeActive in gather(). + (iii) Complete the implementation of the PerfVecSortedSingleBP class (uses binary search in IPs) + + => (i) won't affect the speed in Default and Debug mode. (ii) should enable a speedup in all + cases. (iii) will only improve the speed when many *BPSingleListeners* are in use.