perf-test: best- and worst-case tests for evaluating fast-breakpoint performance (+ results).

git-svn-id: https://www4.informatik.uni-erlangen.de/i4svn/danceos/trunk/devel/fail@1745 8c4709b5-6ec9-48aa-a5cd-a96041d1645a
2012-10-16 13:07:11 +00:00
parent 5f0212aced
commit ab4cdcc6e0
4 changed files with 191 additions and 0 deletions
--- a/src/experiments/perf-test/CMakeLists.txt
+++ b/src/experiments/perf-test/CMakeLists.txt
@ -0,0 +1,17 @@
 # Name of this experiment; reused below for the generated aspect header
 # and for the library target name.
 set(EXPERIMENT_NAME perf-test)
 # Name of the experiment class (declared in experiment.hpp).
 set(EXPERIMENT_TYPE PerfTestExperiment)
 # Generate instantiate-<name>.ah in the build directory from the shared
 # template one directory up. @ONLY limits substitution to @VAR@ references.
 configure_file(../instantiate-experiment.ah.in
                ${CMAKE_CURRENT_BINARY_DIR}/instantiate-${EXPERIMENT_NAME}.ah @ONLY
 )
 # Source files that make up this experiment.
 set(MY_EXPERIMENT_SRCS
 	experiment.hpp
 	experiment.cc
 )
 # Add the build directory (where the generated .ah file is written) to the
 # include search path.
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 # Build the experiment as the library target fail-<name>.
 add_library(fail-${EXPERIMENT_NAME} ${MY_EXPERIMENT_SRCS})
--- a/src/experiments/perf-test/experiment.cc
+++ b/src/experiments/perf-test/experiment.cc
@ -0,0 +1,58 @@
 #include "util/Logger.hpp"
 #include "util/WallclockTimer.hpp"
 #include "experiment.hpp"
 #include "sal/SALInst.hpp"
 #include "sal/Listener.hpp"
 #include "config/FailConfig.hpp"
 // Check if configuration dependencies are satisfied:
 #if !defined(CONFIG_EVENT_BREAKPOINTS)
  #error This experiment just needs breakpoints. You may want to enable Fast-Breakpoints as well.
 #endif
 using namespace std;
 using namespace fail;
 bool PerfTestExperiment::run()
 {
 	Logger log("PERF", false);
 	log << "Experiment started (measuring ellapsed time using wallclock timer)..." << endl;
 	// Performance tests:
 	WallclockTimer tm;
 	tm.startTimer();
 #if 1
 	log << "Activated: CASE A (Best-Case)..." << endl;
 	// Case A): A lot of non-BP listeners a only one (or none) BPs:
 	const unsigned NON_BP_COUNT = 50;
 	log << "Adding " << NON_BP_COUNT << " non-BP listeners..." << endl;
 	MemReadListener mrl[NON_BP_COUNT];
 	for (unsigned i = 0; i < NON_BP_COUNT; ++i) {
 		mrl[i].setWatchAddress(static_cast<address_t>(-1));
 		simulator.addListener(&mrl[i]);
 	}
 	log << "Adding one breakpoint listener and returning to simulator..." << endl;
 	BPSingleListener bp(0x00003c34);
 	simulator.addListenerAndResume(&bp);
 #else
 	log << "Activated: CASE B (Worst-Case)..." << endl;
 	// Case B): n (non matching) BP listeners and no other listener types
 	const unsigned BP_COUNT = 50;
 	log << "Adding " << BP_COUNT << " BPSingleListeners..." << endl;
 	BPSingleListener bsl[BP_COUNT];
 	for (unsigned i = 0; i < BP_COUNT; ++i) {
 		bsl[i].setWatchInstructionPointer(0xFFFFFFF); // we do not want them to trigger...
 		simulator.addListener(&bsl[i]);
 	}
 	log << "Adding final BPSingleListener and continuing simulation..." << endl;
 	// This is required to terminate the experiment:
 	BPSingleListener final(0x00003c34);
 	simulator.addListenerAndResume(&final);
 #endif
 	tm.stopTimer();
 	log << "Time elapsed: " << tm << "s. Done, Bye!" << endl;
 	simulator.terminate();
 	return true;
 }
--- a/src/experiments/perf-test/experiment.hpp
+++ b/src/experiments/perf-test/experiment.hpp
@ -0,0 +1,13 @@
 #ifndef __PERF_TEST_EXPERIMENT_HPP__
  #define __PERF_TEST_EXPERIMENT_HPP__
 #include "efw/ExperimentFlow.hpp"
 class PerfTestExperiment : public fail::ExperimentFlow {
 public:
 	PerfTestExperiment() { }
 	bool run();
 };
 #endif // __PERF_TEST_EXPERIMENT_HPP__
--- a/src/experiments/perf-test/results.txt
+++ b/src/experiments/perf-test/results.txt
@ -0,0 +1,103 @@
 ****************************************************************************************************
  RESULTS:
 ****************************************************************************************************
 (A) WITH FAST_BREAKPOINTS (Default mode):
 hsc-simple (r1636) - phase 1:
 real	1m8.604s
 user	1m8.384s
 sys		0m0.132s
 hsc-simple (r1636) - phase 2:
 real	0m0.591s
 user	0m0.064s
 sys		0m0.076s
 perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
 Case A:  511.46s (= ~9min, around 5,6 times faster than (B).a)
 Case B: 4731.53s (= ~79min, around 1,1 times slower than (B).b)
 ----------------------------------------------
 (B) WITHOUT FAST_BREAKPOINTS (Default mode):
 hsc-simple (r1636) - phase 1:
 real	0m34.712s
 user	0m34.246s
 sys		0m00.148s
 hsc-simple (r1636) - phase 2:
 real	0m0.429s
 user	0m0.048s
 sys		0m0.084s
 perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
 Case A: 2853.63s (= 47min)
 Case B: 4214.03s (= 70min)
 ----------------------------------------------
 (C) WITH FAST_BREAKPOINTS (Release mode):
 hsc-simple (r1636) - phase 1:
 real	0m13.341s
 user	0m12.377s
 sys		0m00.168s
 hsc-simple (r1636) - phase 2:
 real	0m0.506s
 user	0m0.032s
 sys		0m0.100s
 perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
 Case A: 43.0115s (< 1min, around 7,5 times faster than (D).a)
 Case B: 385.547s (= ~6min, around 1,5 times faster than (D).b)
 ----------------------------------------------
 (D) WITHOUT FAST_BREAKPOINTS (Release mode):
 hsc-simple (r1636) - phase 1:
 real	0m28.806s
 user	0m28.214s
 sys		0m00.160s
 hsc-simple (r1636) - phase 2:
 real	0m0.565s
 user	0m0.052s
 sys		0m0.084s
 perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
 Case A: 321.594s (= ~5min)
 Case B: 587.698s (= ~9min)
 ****************************************************************************************************
  EVALUATION:
 ****************************************************************************************************
 Note: These are just exemplary results based on the observed values (see above).
 - The (former) BufferCache enabled a speedup of up to 2,5x (according to Martin Unzer).
 - hsc-simple: Fast-Breakpoints are only faster if compiled in Release mode (yields a
  speedup up to 2x).
 - hsc-simple: Unfortunately, they are also slower by a factor of 2, if compiled in
  Default-Mode (and probably in Debug mode, too).
 - perf-test: Except for case B in Default mode, Fast-Breakpoints enable a speedup that
  ranges from 1,5 to 7,5! For case B (in Default mode -> no optimization), the Fast-
  Breakpoint implementation slows down the overall execution speed by a factor of (only)
  1,1. However, for case A (Best-Case) we assume that the overall speedup (compared to
  the corresponding case where Fast-Breakpoints are switched off) will tend to rise
  when the experiment parameter NON_BP_COUNT is increased.
 ****************************************************************************************************
  POSSIBLE OPTIMIZATIONS:
 ****************************************************************************************************
 Note: The following observations and conjectures are partly derived from the analysis of the
 callgrind profile (using kcachegrind).
   (i) gather() should be inlined. (At the moment it is not inlined, because this avoids an include cycle.)
  (ii) Bypass the construction of a ResultSet object (the bypass would avoid an additional iteration
       over the elements stored in the ResultSet itself), by calling makeActive in gather()
 (iii) Complete the implementation of the PerfVecSortedSingleBP class (uses binary search in IPs)
 => (i) won't affect the speed in Default and Debug mode. (ii) should enable a speedup in all
    cases. (iii) will only improve the speed when many *BPSingleListeners* are in use.