perf-test: best- and worst-case tests for evaluating fast-breakpoint performance (+ results).

git-svn-id: https://www4.informatik.uni-erlangen.de/i4svn/danceos/trunk/devel/fail@1745 8c4709b5-6ec9-48aa-a5cd-a96041d1645a
2012-10-16 13:07:11 +00:00
parent 5f0212aced
commit ab4cdcc6e0
4 changed files with 191 additions and 0 deletions
--- a/src/experiments/perf-test/CMakeLists.txt
+++ b/src/experiments/perf-test/CMakeLists.txt
@ -0,0 +1,17 @@
+set(EXPERIMENT_NAME perf-test) # used below for the generated .ah file name and the library target name
+set(EXPERIMENT_TYPE PerfTestExperiment) # presumably substituted into the .ah template by configure_file -- TODO confirm
+configure_file(../instantiate-experiment.ah.in
+               ${CMAKE_CURRENT_BINARY_DIR}/instantiate-${EXPERIMENT_NAME}.ah @ONLY
+)
+
+# Experiment sources that are compiled into the library below
+set(MY_EXPERIMENT_SRCS
+	experiment.hpp
+	experiment.cc
+)
+
+#### Include directories: the binary dir receives the instantiate-*.ah file generated above ####
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+## Build the experiment library (target name: fail-<EXPERIMENT_NAME>)
+add_library(fail-${EXPERIMENT_NAME} ${MY_EXPERIMENT_SRCS})
--- a/src/experiments/perf-test/experiment.cc
+++ b/src/experiments/perf-test/experiment.cc
@ -0,0 +1,58 @@
+#include "util/Logger.hpp"
+#include "util/WallclockTimer.hpp"
+
+#include "experiment.hpp"
+#include "sal/SALInst.hpp"
+#include "sal/Listener.hpp"
+#include "config/FailConfig.hpp"
+
+// Check if configuration dependencies are satisfied:
+#if !defined(CONFIG_EVENT_BREAKPOINTS)
+  #error This experiment just needs breakpoints. You may want to enable Fast-Breakpoints as well.
+#endif
+
+using namespace std;
+using namespace fail;
+
+bool PerfTestExperiment::run() // Registers the listeners of the selected test case, resumes the simulator and logs the elapsed time.
+{
+	Logger log("PERF", false);
+	log << "Experiment started (measuring ellapsed time using wallclock timer)..." << endl; // NOTE(review): "ellapsed" (sic) is part of the emitted message
+
+	// Performance tests: the wallclock timer brackets both the listener setup and the resumed simulation.
+	WallclockTimer tm;
+	tm.startTimer();
+#if 1 // 1 = compile CASE A (best case); change to 0 to compile CASE B (worst case) instead
+	log << "Activated: CASE A (Best-Case)..." << endl;
+	// Case A): A lot of non-BP listeners and only one (or no) BP listener:
+	const unsigned NON_BP_COUNT = 50;
+	log << "Adding " << NON_BP_COUNT << " non-BP listeners..." << endl;
+	MemReadListener mrl[NON_BP_COUNT];
+	for (unsigned i = 0; i < NON_BP_COUNT; ++i) {
+		mrl[i].setWatchAddress(static_cast<address_t>(-1)); // presumably an address that is never read, so these never fire -- TODO confirm
+		simulator.addListener(&mrl[i]);
+	}
+	log << "Adding one breakpoint listener and returning to simulator..." << endl;
+	BPSingleListener bp(0x00003c34); // NOTE(review): hard-coded address, presumably specific to the guest program under test
+	simulator.addListenerAndResume(&bp);
+#else // CASE B
+	log << "Activated: CASE B (Worst-Case)..." << endl;
+	// Case B): n (non-matching) BP listeners and no other listener types:
+	const unsigned BP_COUNT = 50;
+	log << "Adding " << BP_COUNT << " BPSingleListeners..." << endl;
+	BPSingleListener bsl[BP_COUNT];
+	for (unsigned i = 0; i < BP_COUNT; ++i) {
+		bsl[i].setWatchInstructionPointer(0xFFFFFFF); // we do not want them to trigger... NOTE(review): 7 hex digits (0x0FFFFFFF), not 0xFFFFFFFF -- confirm this is intended
+		simulator.addListener(&bsl[i]);
+	}
+	log << "Adding final BPSingleListener and continuing simulation..." << endl;
+	// This listener is required to terminate the experiment (the ones above are not meant to trigger):
+	BPSingleListener final(0x00003c34);
+	simulator.addListenerAndResume(&final);
+#endif // CASE A / CASE B selection
+
+	tm.stopTimer();
+	log << "Time elapsed: " << tm << "s. Done, Bye!" << endl;
+	simulator.terminate();
+	return true;
+}
--- a/src/experiments/perf-test/experiment.hpp
+++ b/src/experiments/perf-test/experiment.hpp
@ -0,0 +1,13 @@
+#ifndef __PERF_TEST_EXPERIMENT_HPP__
+  #define __PERF_TEST_EXPERIMENT_HPP__
+
+#include "efw/ExperimentFlow.hpp"
+
+class PerfTestExperiment : public fail::ExperimentFlow { // Experiment flow of the breakpoint performance test (best-/worst-case)
+public:
+	PerfTestExperiment() { }
+
+	bool run(); // Defined in experiment.cc: sets up the listeners, resumes the simulator, logs the elapsed wallclock time, returns true.
+};
+
+#endif // __PERF_TEST_EXPERIMENT_HPP__
--- a/src/experiments/perf-test/results.txt
+++ b/src/experiments/perf-test/results.txt
@ -0,0 +1,103 @@
+****************************************************************************************************
+  RESULTS:
+****************************************************************************************************
+ (A) WITH FAST_BREAKPOINTS (Default mode):
+ 
+hsc-simple (r1636) - phase 1:
+real	1m8.604s
+user	1m8.384s
+sys		0m0.132s
+
+hsc-simple (r1636) - phase 2:
+real	0m0.591s
+user	0m0.064s
+sys		0m0.076s
+
+perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
+Case A:  511.46s (= ~9min, around 5,6 times faster than (B).a)
+Case B: 4731.53s (= ~79min, around 1,1 times slower than (B).b)
+
+----------------------------------------------
+
+ (B) WITHOUT FAST_BREAKPOINTS (Default mode):
+
+hsc-simple (r1636) - phase 1:
+real	0m34.712s
+user	0m34.246s
+sys		0m00.148s
+
+hsc-simple (r1636) - phase 2:
+real	0m0.429s
+user	0m0.048s
+sys		0m0.084s
+
+perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
+Case A: 2853.63s (= 47min)
+Case B: 4214.03s (= 70min)
+
+----------------------------------------------
+
+ (C) WITH FAST_BREAKPOINTS (Release mode):
+ 
+hsc-simple (r1636) - phase 1:
+real	0m13.341s
+user	0m12.377s
+sys		0m00.168s
+
+hsc-simple (r1636) - phase 2:
+real	0m0.506s
+user	0m0.032s
+sys		0m0.100s
+
+perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
+Case A: 43.0115s (< 1min, around 7,5 times faster than (D).a)
+Case B: 385.547s (= ~6min, around 1,5 times faster than (D).b)
+
+----------------------------------------------
+
+ (D) WITHOUT FAST_BREAKPOINTS (Release mode):
+ 
+hsc-simple (r1636) - phase 1:
+real	0m28.806s
+user	0m28.214s
+sys		0m00.160s
+
+hsc-simple (r1636) - phase 2:
+real	0m0.565s
+user	0m0.052s
+sys		0m0.084s
+
+perf-test (r1745): Best- vs. Worst-Case with Wallclock-Timer (NON_BP_COUNT = 50 and BP_COUNT = 50):
+Case A: 321.594s (= ~5min)
+Case B: 587.698s (= ~9min)
+
+****************************************************************************************************
+  EVALUATION:
+****************************************************************************************************
+Note: These are just exemplary results based on the observed values (see above).
+
+- The (former) BufferCaches enabled a speedup of up to 2,5x (according to Martin Unzer).
+- hsc-simple: Fast-Breakpoints are only faster if compiled in Release mode (yields a
+  speedup up to 2x).
+- hsc-simple: Unfortunately, they are also slower by a factor of 2, if compiled in
+  Default-Mode (and probably in Debug mode, too).
+- perf-test: Except for case B in Default mode, Fast-Breakpoints enable a speedup that
+  ranges from 1,5 to 7,5! For case B (in Default mode -> no optimization), the Fast-
+  Breakpoint implementation slows down the overall execution speed by a factor of (only)
+  1,1. However, for case A (Best-Case) we assume that the overall speedup (compared to
+  the corresponding case where Fast-Breakpoints are switched off) will tend to rise
+  when the experiment parameter NON_BP_COUNT is increased.
+
+****************************************************************************************************
+  POSSIBLE OPTIMIZATIONS:
+****************************************************************************************************
+Note: The following observations and conjectures are partly derived from the analysis of the
+callgrind profile (using kcachegrind).
+
+   (i) gather() should be inlined. (At the moment, it is not inlined because that avoids an include cycle.)
+  (ii) Bypass the construction of a ResultSet object (the bypass would avoid an additional iteration
+       over the elements stored in the ResultSet itself), by calling makeActive in gather()
+ (iii) Complete the implementation of the PerfVecSortedSingleBP class (uses binary search in IPs)
+
+ => (i) won't affect the speed in Default and Debug mode. (ii) should enable a speedup in all
+    cases. (iii) will only improve the speed when many *BPSingleListeners* are in use.