From a1e3b31cd53f39b2de78eba725227d3279ac003b Mon Sep 17 00:00:00 2001
From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Date: Fri, 16 May 2014 16:17:54 +0200
Subject: [PATCH 01/10] prune-trace: sample from known results

The --use-known-results switch simulates sampling with fault expansion
(FESamplingPruner) by reusing results from a previous campaign that
covered the full fault space using the "basic" pruner.  In this mode,
the pruner only creates entries in the "fspgroup" table; these refer
to already existing pilots and their corresponding results.

This switch is not for normal Fail* use, but only for experimenting
with the FESamplingPruner.

Change-Id: I1bf561d93f55918d243c5306551a1c6b48027198
---
 tools/prune-trace/FESamplingPruner.cc  | 192 ++++++++++++++++---------
 tools/prune-trace/FESamplingPruner.hpp |   4 +-
 2 files changed, 131 insertions(+), 65 deletions(-)

diff --git a/tools/prune-trace/FESamplingPruner.cc b/tools/prune-trace/FESamplingPruner.cc
index fb4f925b..c0740de8 100644
--- a/tools/prune-trace/FESamplingPruner.cc
+++ b/tools/prune-trace/FESamplingPruner.cc
@@ -14,7 +14,10 @@ struct Pilot {
 	uint64_t duration;
 
 	uint32_t instr2;
+	union {
 	uint32_t instr2_absolute;
+	uint32_t id;
+	};
 	uint32_t data_address;
 
 	typedef uint64_t size_type;
@@ -26,6 +29,9 @@ bool FESamplingPruner::commandline_init()
 	fail::CommandLine &cmd = fail::CommandLine::Inst();
 	SAMPLESIZE = cmd.addOption("", "samplesize", Arg::Required,
 		"--samplesize N \tNumber of samples to take (per variant)");
+	USE_KNOWN_RESULTS = cmd.addOption("", "use-known-results", Arg::None,
+		"--use-known-results \tReuse known results from a campaign with the 'basic' pruner "
+		"(abuses the DB layout to a certain degree, use with caution)");
 	return true;
 }
 
@@ -38,6 +44,10 @@ bool FESamplingPruner::prune_all()
 	}
 	m_samplesize = strtoul(cmd[SAMPLESIZE].first()->arg, 0, 10);
 
+	if (cmd[USE_KNOWN_RESULTS]) {
+		m_use_known_results = true;
+	}
+
 	// for each variant:
 	for (std::vector<fail::Database::Variant>::const_iterator it = m_variants.begin();
 		it != m_variants.end(); ++it) {
@@ -76,78 +86,132 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 	MYSQL_RES *res;
 	MYSQL_ROW row;
 
-	LOG << "loading trace entries for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
+	unsigned pilotcount = 0, samplerows;
 
-	unsigned pilotcount = 0;
+	if (!m_use_known_results) {
+		LOG << "loading trace entries for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
 
-	// load trace entries
-	ss << "SELECT instr2, instr2_absolute, data_address, time2-time1+1 AS duration"
-		<< " FROM trace"
-		<< " WHERE variant_id = " << variant.id
-		<< " AND accesstype = 'R'"
-		<< " ORDER BY duration DESC"; // speeds up sampling, but query may be slow
-	res = db->query_stream(ss.str().c_str());
-	ss.str("");
-	if (!res) return false;
-	while ((row = mysql_fetch_row(res))) {
-		Pilot p;
-		p.instr2 = strtoul(row[0], 0, 10);
-		p.instr2_absolute = strtoul(row[1], 0, 10);
-		p.data_address = strtoul(row[2], 0, 10);
-		p.duration = strtoull(row[3], 0, 10);
-		pop.add(p);
-		++pilotcount;
+		// load trace entries
+		ss << "SELECT instr2, instr2_absolute, data_address, time2-time1+1 AS duration"
+			<< " FROM trace"
+			<< " WHERE variant_id = " << variant.id
+			<< " AND accesstype = 'R'"
+			<< " ORDER BY duration DESC"; // speeds up sampling, but query may be slow
+		res = db->query_stream(ss.str().c_str());
+		ss.str("");
+		if (!res) return false;
+		while ((row = mysql_fetch_row(res))) {
+			Pilot p;
+			p.instr2 = strtoul(row[0], 0, 10);
+			p.instr2_absolute = strtoul(row[1], 0, 10);
+			p.data_address = strtoul(row[2], 0, 10);
+			p.duration = strtoull(row[3], 0, 10);
+			pop.add(p);
+			++pilotcount;
+		}
+		mysql_free_result(res);
+
+		samplerows = std::min(pilotcount, m_samplesize);
+	} else {
+		LOG << "loading pilots for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
+
+		// load fsppilot entries
+		ss << "SELECT p.id, p.instr2, p.data_address, t.time2 - t.time1 + 1 AS duration"
+			<< " FROM fsppilot p"
+			<< " JOIN trace t"
+			<< " ON t.variant_id = p.variant_id AND t.data_address = p.data_address AND t.instr2 = p.instr2"
+			<< " WHERE p.fspmethod_id = " << db->get_fspmethod_id("basic")
+			<< " AND p.variant_id = " << variant.id
+			<< " AND p.known_outcome = 0"
+			<< " ORDER BY duration DESC"; // speeds up sampling, but query may be slow
+		res = db->query_stream(ss.str().c_str());
+		ss.str("");
+		if (!res) return false;
+		while ((row = mysql_fetch_row(res))) {
+			Pilot p;
+			p.id = strtoul(row[0], 0, 10);
+			p.instr2 = strtoul(row[1], 0, 10);
+			p.data_address = strtoul(row[2], 0, 10);
+			p.duration = strtoull(row[3], 0, 10);
+			pop.add(p);
+			++pilotcount;
+		}
+		mysql_free_result(res);
+
+		samplerows = std::min(pilotcount, m_samplesize);
 	}
-	mysql_free_result(res);
-
-	unsigned samplerows = std::min(pilotcount, m_samplesize);
 
 	LOG << "loaded " << pilotcount << " entries, sampling "
 		<< samplerows << " entries with fault expansion ..." << endl;
 
-	// FIXME: change strategy when trace entries have IDs, insert into fspgroup first
-	ss << "INSERT INTO fsppilot (known_outcome, variant_id, instr2, injection_instr, "
-		<< "injection_instr_absolute, data_address, data_width, fspmethod_id) VALUES ";
-	std::string insert_sql(ss.str());
-	ss.str("");
+	unsigned num_fspgroup_entries = 0;
+	uint32_t known_pilot_method_id = m_method_id;
 
-	for (unsigned i = 0; i < samplerows; ++i) {
-		uint64_t pos = my_rand(pop.get_size() - 1);
-		Pilot p = pop.get(pos);
-		ss << "(0," << variant.id << "," << p.instr2 << "," << p.instr2
-			<< "," << p.instr2_absolute << "," << p.data_address
-			<< ",1," << m_method_id << ")";
-		db->insert_multiple(insert_sql.c_str(), ss.str().c_str());
+	if (!m_use_known_results) {
+		// FIXME: change strategy when trace entries have IDs, insert into fspgroup first
+		ss << "INSERT INTO fsppilot (known_outcome, variant_id, instr2, injection_instr, "
+			<< "injection_instr_absolute, data_address, data_width, fspmethod_id) VALUES ";
+		std::string insert_sql(ss.str());
 		ss.str("");
+
+		for (unsigned i = 0; i < samplerows; ++i) {
+			uint64_t pos = my_rand(pop.get_size() - 1);
+			Pilot p = pop.get(pos);
+			ss << "(0," << variant.id << "," << p.instr2 << "," << p.instr2
+				<< "," << p.instr2_absolute << "," << p.data_address
+				<< ",1," << m_method_id << ")";
+			db->insert_multiple(insert_sql.c_str(), ss.str().c_str());
+			ss.str("");
+		}
+		db->insert_multiple();
+
+		unsigned num_fsppilot_entries = samplerows;
+
+		// single entry for known outcome (write access)
+		ss << "INSERT INTO fsppilot (known_outcome, variant_id, instr2, injection_instr, injection_instr_absolute, data_address, data_width, fspmethod_id) "
+			  "SELECT 1, variant_id, instr2, instr2, instr2_absolute, "
+			  "  data_address, width, " << m_method_id << " "
+			  "FROM trace "
+			  "WHERE variant_id = " << variant.id << " AND accesstype = 'W' "
+			  "ORDER BY instr2 ASC "
+			  "LIMIT 1";
+		if (!db->query(ss.str().c_str())) return false;
+		ss.str("");
+		num_fsppilot_entries += db->affected_rows();
+		assert(num_fsppilot_entries == (samplerows + 1));
+
+		LOG << "created " << num_fsppilot_entries << " fsppilot entries" << std::endl;
+
+		// fspgroup entries for sampled trace entries
+		ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id) "
+		   << "SELECT p.variant_id, p.instr2, p.data_address, p.fspmethod_id, p.id "
+		   << "FROM fsppilot p "
+		   << "WHERE known_outcome = 0 AND p.fspmethod_id = " << m_method_id << " "
+		   << "AND p.variant_id = " << variant.id;
+
+		if (!db->query(ss.str().c_str())) return false;
+		ss.str("");
+		num_fspgroup_entries = db->affected_rows();
+	} else {
+		ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id) VALUES ";
+		std::string insert_sql(ss.str());
+		ss.str("");
+
+		for (unsigned i = 0; i < samplerows; ++i) {
+			uint64_t pos = my_rand(pop.get_size() - 1);
+			Pilot p = pop.get(pos);
+			ss << "(" << variant.id << "," << p.instr2
+				<< "," << p.data_address << "," << m_method_id
+				<< "," << p.id << ")";
+			db->insert_multiple(insert_sql.c_str(), ss.str().c_str());
+			ss.str("");
+		}
+		db->insert_multiple();
+		num_fspgroup_entries = samplerows;
+
+		// the known_outcome=1 pilot has been determined with the "basic" method
+		known_pilot_method_id = db->get_fspmethod_id("basic");
 	}
-	db->insert_multiple();
-	unsigned num_fsppilot_entries = samplerows;
-
-	// single entry for known outcome (write access)
-	ss << "INSERT INTO fsppilot (known_outcome, variant_id, instr2, injection_instr, injection_instr_absolute, data_address, data_width, fspmethod_id) "
-		  "SELECT 1, variant_id, instr2, instr2, instr2_absolute, "
-		  "  data_address, width, " << m_method_id << " "
-		  "FROM trace "
-		  "WHERE variant_id = " << variant.id << " AND accesstype = 'W' "
-		  "ORDER BY instr2 ASC "
-		  "LIMIT 1";
-	if (!db->query(ss.str().c_str())) return false;
-	ss.str("");
-	num_fsppilot_entries += db->affected_rows();
-	assert(num_fsppilot_entries == (samplerows + 1));
-
-	LOG << "created " << num_fsppilot_entries << " fsppilot entries" << std::endl;
-
-	// fspgroup entries for sampled trace entries
-	ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id) "
-	   << "SELECT p.variant_id, p.instr2, p.data_address, p.fspmethod_id, p.id "
-	   << "FROM fsppilot p "
-	   << "WHERE known_outcome = 0 AND p.fspmethod_id = " << m_method_id << " "
-	   << "AND p.variant_id = " << variant.id;
-
-	if (!db->query(ss.str().c_str())) return false;
-	ss.str("");
-	unsigned num_fspgroup_entries = db->affected_rows();
 
 #if 0 // do it like the basic pruner:
 	// fspgroup entries for known (W) trace entries
@@ -162,9 +226,9 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 	// for each W); this needs to be accounted for at data analysis time,
 	// though.
 	ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id) "
-		"SELECT variant_id, instr2, data_address, fspmethod_id, id "
+		"SELECT variant_id, instr2, data_address, " << m_method_id << ", id "
 		"FROM fsppilot "
-		"WHERE variant_id = " << variant.id << " AND known_outcome = 1 AND fspmethod_id = " << m_method_id;
+		"WHERE variant_id = " << variant.id << " AND known_outcome = 1 AND fspmethod_id = " << known_pilot_method_id;
 #endif
 	if (!db->query(ss.str().c_str())) return false;
 	ss.str("");
diff --git a/tools/prune-trace/FESamplingPruner.hpp b/tools/prune-trace/FESamplingPruner.hpp
index a9538622..7c71c530 100644
--- a/tools/prune-trace/FESamplingPruner.hpp
+++ b/tools/prune-trace/FESamplingPruner.hpp
@@ -15,11 +15,13 @@
 ///
 class FESamplingPruner : public Pruner {
 	fail::CommandLine::option_handle SAMPLESIZE;
+	fail::CommandLine::option_handle USE_KNOWN_RESULTS;
 
 	unsigned m_samplesize;
+	bool m_use_known_results;
 
 public:
-	FESamplingPruner() : m_samplesize(0) { }
+	FESamplingPruner() : m_samplesize(0), m_use_known_results(false) { }
 	virtual std::string method_name() { return "FESampling"; }
 	virtual bool commandline_init();
 	virtual bool prune_all();

From b2b53380f4a0d95bb7f8a4682915574ec419e848 Mon Sep 17 00:00:00 2001
From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Date: Sat, 6 Dec 2014 00:19:17 +0100
Subject: [PATCH 02/10] prune-trace: add switch to disable sample weighting

In the sampling step, the --no-weighting switch disables the
equivalence-class weighting by using a weight of one instead of the
equivalence-class size.  This is usually not a good idea, and should
only be used for demonstration purposes, or if the fault model
requires weight-less sampling.

Change-Id: Id903d1924c6ecbcd217815aa5ce9271560130071
---
 tools/prune-trace/FESamplingPruner.cc  | 11 +++++++++--
 tools/prune-trace/FESamplingPruner.hpp |  5 +++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tools/prune-trace/FESamplingPruner.cc b/tools/prune-trace/FESamplingPruner.cc
index c0740de8..431adb86 100644
--- a/tools/prune-trace/FESamplingPruner.cc
+++ b/tools/prune-trace/FESamplingPruner.cc
@@ -32,6 +32,9 @@ bool FESamplingPruner::commandline_init()
 	USE_KNOWN_RESULTS = cmd.addOption("", "use-known-results", Arg::None,
 		"--use-known-results \tReuse known results from a campaign with the 'basic' pruner "
 		"(abuses the DB layout to a certain degree, use with caution)");
+	NO_WEIGHTING = cmd.addOption("", "no-weighting", Arg::None,
+		"--no-weighting \tDisable weighted sampling (weight = 1 for all ECs) "
+		"(don't do this unless you know what you're doing)");
 	return true;
 }
 
@@ -48,6 +51,10 @@ bool FESamplingPruner::prune_all()
 		m_use_known_results = true;
 	}
 
+	if (cmd[NO_WEIGHTING]) {
+		m_weighting = false;
+	}
+
 	// for each variant:
 	for (std::vector<fail::Database::Variant>::const_iterator it = m_variants.begin();
 		it != m_variants.end(); ++it) {
@@ -105,7 +112,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 			p.instr2 = strtoul(row[0], 0, 10);
 			p.instr2_absolute = strtoul(row[1], 0, 10);
 			p.data_address = strtoul(row[2], 0, 10);
-			p.duration = strtoull(row[3], 0, 10);
+			p.duration = m_weighting ? strtoull(row[3], 0, 10) : 1;
 			pop.add(p);
 			++pilotcount;
 		}
@@ -132,7 +139,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 			p.id = strtoul(row[0], 0, 10);
 			p.instr2 = strtoul(row[1], 0, 10);
 			p.data_address = strtoul(row[2], 0, 10);
-			p.duration = strtoull(row[3], 0, 10);
+			p.duration = m_weighting ? strtoull(row[3], 0, 10) : 1;
 			pop.add(p);
 			++pilotcount;
 		}
diff --git a/tools/prune-trace/FESamplingPruner.hpp b/tools/prune-trace/FESamplingPruner.hpp
index 7c71c530..860504ab 100644
--- a/tools/prune-trace/FESamplingPruner.hpp
+++ b/tools/prune-trace/FESamplingPruner.hpp
@@ -16,12 +16,13 @@
 class FESamplingPruner : public Pruner {
 	fail::CommandLine::option_handle SAMPLESIZE;
 	fail::CommandLine::option_handle USE_KNOWN_RESULTS;
+	fail::CommandLine::option_handle NO_WEIGHTING;
 
 	unsigned m_samplesize;
-	bool m_use_known_results;
+	bool m_use_known_results, m_weighting;
 
 public:
-	FESamplingPruner() : m_samplesize(0), m_use_known_results(false) { }
+	FESamplingPruner() : m_samplesize(0), m_use_known_results(false), m_weighting(true) { }
 	virtual std::string method_name() { return "FESampling"; }
 	virtual bool commandline_init();
 	virtual bool prune_all();

From 57e4541190024102fca650ddf6b4d136faf3f1b3 Mon Sep 17 00:00:00 2001
From: Christoph Borchert <christoph.borchert@tu-dortmund.de>
Date: Thu, 18 Dec 2014 12:51:32 +0100
Subject: [PATCH 03/10] prune-trace: do not sort ECs for sampling

Sorting is too costly for large data sets, and not worth the
sampling-process speedup.

Change-Id: I622ff3ed9b352fc5c7586f9733d830be727b6a11
---
 tools/prune-trace/FESamplingPruner.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tools/prune-trace/FESamplingPruner.cc b/tools/prune-trace/FESamplingPruner.cc
index 431adb86..3ce7d609 100644
--- a/tools/prune-trace/FESamplingPruner.cc
+++ b/tools/prune-trace/FESamplingPruner.cc
@@ -102,8 +102,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 		ss << "SELECT instr2, instr2_absolute, data_address, time2-time1+1 AS duration"
 			<< " FROM trace"
 			<< " WHERE variant_id = " << variant.id
-			<< " AND accesstype = 'R'"
-			<< " ORDER BY duration DESC"; // speeds up sampling, but query may be slow
+			<< " AND accesstype = 'R'";
 		res = db->query_stream(ss.str().c_str());
 		ss.str("");
 		if (!res) return false;
@@ -129,8 +128,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 			<< " ON t.variant_id = p.variant_id AND t.data_address = p.data_address AND t.instr2 = p.instr2"
 			<< " WHERE p.fspmethod_id = " << db->get_fspmethod_id("basic")
 			<< " AND p.variant_id = " << variant.id
-			<< " AND p.known_outcome = 0"
-			<< " ORDER BY duration DESC"; // speeds up sampling, but query may be slow
+			<< " AND p.known_outcome = 0";
 		res = db->query_stream(ss.str().c_str());
 		ss.str("");
 		if (!res) return false;

From ed18399ff6b75de01d71103cba2b2b8065025ef5 Mon Sep 17 00:00:00 2001
From: Christoph Borchert <christoph.borchert@tu-dortmund.de>
Date: Tue, 16 Dec 2014 16:35:41 +0100
Subject: [PATCH 04/10] prune-trace: remove invalid assertion

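The assertion in the FESamplingPruner that exactly samplerows + 1
fsppilot entries were created is invalid if the import took place
without "write" ECs: the INSERT ... SELECT for the known-outcome pilot
then finds no 'W' trace entry and adds no row.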

Change-Id: I7d1bbcf1572573e2ac97e9be1191fbf9fe61f755
---
 tools/prune-trace/FESamplingPruner.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/prune-trace/FESamplingPruner.cc b/tools/prune-trace/FESamplingPruner.cc
index 3ce7d609..c4e7d3e1 100644
--- a/tools/prune-trace/FESamplingPruner.cc
+++ b/tools/prune-trace/FESamplingPruner.cc
@@ -183,7 +183,6 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 		if (!db->query(ss.str().c_str())) return false;
 		ss.str("");
 		num_fsppilot_entries += db->affected_rows();
-		assert(num_fsppilot_entries == (samplerows + 1));
 
 		LOG << "created " << num_fsppilot_entries << " fsppilot entries" << std::endl;
 

From f23860c1394def7724f69ac5b9c575d3a27ccefe Mon Sep 17 00:00:00 2001
From: Christoph Borchert <christoph.borchert@tu-dortmund.de>
Date: Tue, 16 Dec 2014 11:31:10 +0100
Subject: [PATCH 05/10] prune-trace: use uint64_t for pilot counts

This enables using very large data sets in the FESamplingPruner.

Change-Id: Ibf097ed8cec24c85a74e83a78d79aa07893cfa8c
---
 tools/prune-trace/FESamplingPruner.cc  | 10 +++++-----
 tools/prune-trace/FESamplingPruner.hpp |  3 ++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tools/prune-trace/FESamplingPruner.cc b/tools/prune-trace/FESamplingPruner.cc
index c4e7d3e1..cebd7292 100644
--- a/tools/prune-trace/FESamplingPruner.cc
+++ b/tools/prune-trace/FESamplingPruner.cc
@@ -93,7 +93,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 	MYSQL_RES *res;
 	MYSQL_ROW row;
 
-	unsigned pilotcount = 0, samplerows;
+	uint64_t pilotcount = 0, samplerows;
 
 	if (!m_use_known_results) {
 		LOG << "loading trace entries for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
@@ -149,7 +149,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 	LOG << "loaded " << pilotcount << " entries, sampling "
 		<< samplerows << " entries with fault expansion ..." << endl;
 
-	unsigned num_fspgroup_entries = 0;
+	uint64_t num_fspgroup_entries = 0;
 	uint32_t known_pilot_method_id = m_method_id;
 
 	if (!m_use_known_results) {
@@ -159,7 +159,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 		std::string insert_sql(ss.str());
 		ss.str("");
 
-		for (unsigned i = 0; i < samplerows; ++i) {
+		for (uint64_t i = 0; i < samplerows; ++i) {
 			uint64_t pos = my_rand(pop.get_size() - 1);
 			Pilot p = pop.get(pos);
 			ss << "(0," << variant.id << "," << p.instr2 << "," << p.instr2
@@ -170,7 +170,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 		}
 		db->insert_multiple();
 
-		unsigned num_fsppilot_entries = samplerows;
+		uint64_t num_fsppilot_entries = samplerows;
 
 		// single entry for known outcome (write access)
 		ss << "INSERT INTO fsppilot (known_outcome, variant_id, instr2, injection_instr, injection_instr_absolute, data_address, data_width, fspmethod_id) "
@@ -201,7 +201,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 		std::string insert_sql(ss.str());
 		ss.str("");
 
-		for (unsigned i = 0; i < samplerows; ++i) {
+		for (uint64_t i = 0; i < samplerows; ++i) {
 			uint64_t pos = my_rand(pop.get_size() - 1);
 			Pilot p = pop.get(pos);
 			ss << "(" << variant.id << "," << p.instr2
diff --git a/tools/prune-trace/FESamplingPruner.hpp b/tools/prune-trace/FESamplingPruner.hpp
index 860504ab..d33c1e94 100644
--- a/tools/prune-trace/FESamplingPruner.hpp
+++ b/tools/prune-trace/FESamplingPruner.hpp
@@ -1,6 +1,7 @@
 #ifndef __FESAMPLING_PRUNER_H__
 #define __FESAMPLING_PRUNER_H__
 
+#include <stdint.h>
 #include "Pruner.hpp"
 #include "util/CommandLine.hpp"
 
@@ -18,7 +19,7 @@ class FESamplingPruner : public Pruner {
 	fail::CommandLine::option_handle USE_KNOWN_RESULTS;
 	fail::CommandLine::option_handle NO_WEIGHTING;
 
-	unsigned m_samplesize;
+	uint64_t m_samplesize;
 	bool m_use_known_results, m_weighting;
 
 public:

From b0c58bab78178e088f619fa58e4aad4fde670e58 Mon Sep 17 00:00:00 2001
From: Christoph Borchert <christoph.borchert@tu-dortmund.de>
Date: Tue, 16 Dec 2014 12:02:57 +0100
Subject: [PATCH 06/10] util: SumTree::add() documentation

This function copies the reference argument 'element' internally.

Change-Id: I33d94f224bc2b1b89057b90258d500eaa364ab85
---
 src/core/util/SumTree.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/util/SumTree.hpp b/src/core/util/SumTree.hpp
index 998e3e05..337e7db8 100644
--- a/src/core/util/SumTree.hpp
+++ b/src/core/util/SumTree.hpp
@@ -51,7 +51,7 @@ class SumTree {
 public:
 	SumTree() : m_root(new Bucket), m_depth(0) {}
 	~SumTree() { delete m_root; }
-	//! Adds a new element to the tree.
+	//! Adds a copy of a new element to the tree.  The copy is created internally.
 	void add(const T& element);
 	//! Retrieves (and removes) element at random number position.
 	T get(typename T::size_type pos) { return get(pos, m_root, 0); }

From 2f70e05db6223fbe45e9a00a31cb1d01f887dab5 Mon Sep 17 00:00:00 2001
From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Date: Wed, 14 Jan 2015 23:37:18 +0100
Subject: [PATCH 07/10] util: rename SumTree::get -> remove, add r/o get

SumTree::get now non-intrusively picks an element and returns a
reference to it; SumTree::remove removes the element and returns a
copy.  The former is needed for sampling with replacement.

Change-Id: Iefef2fdf0b7df6ea7a9949f2588528ec9e86bb7a
---
 src/core/util/SumTree.hpp             | 60 +++++++++++++++++++++++----
 src/core/util/testing/SumTreeTest.cc  |  2 +-
 tools/prune-trace/FESamplingPruner.cc |  4 +-
 3 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/src/core/util/SumTree.hpp b/src/core/util/SumTree.hpp
index 337e7db8..9d50269b 100644
--- a/src/core/util/SumTree.hpp
+++ b/src/core/util/SumTree.hpp
@@ -7,7 +7,7 @@
 
 // The SumTree implements an efficient tree data structure for
 // "roulette-wheel" sampling, or "sampling with fault expansion", i.e.,
-// sampling of trace entries / pilots without replacement and with a
+// sampling of trace entries / pilots with/without replacement and with a
 // picking probability proportional to the entries' sizes.
 //
 // For every sample, the naive approach picks a random number between 0
@@ -24,9 +24,9 @@
 //
 // Note that the current implementation is built for a pure growth phase
 // (when the tree gets filled with pilots from the database), followed by
-// a sampling phase when the tree gets emptied.  It does not handle a
-// mixed add/remove case very smartly, although it should remain
-// functional.
+// a sampling phase when the tree gets sampled from (with replacement) or
+// emptied (without replacement).  It does not handle a mixed add/remove case
+// very smartly, although it should remain functional.
 
 namespace fail {
 
@@ -53,15 +53,19 @@ public:
 	~SumTree() { delete m_root; }
 	//! Adds a copy of a new element to the tree.  The copy is created internally.
 	void add(const T& element);
-	//! Retrieves (and removes) element at random number position.
-	T get(typename T::size_type pos) { return get(pos, m_root, 0); }
+	//! Retrieves and removes element at random number position.
+	T remove(typename T::size_type pos) { return remove(pos, m_root, 0); }
+	//! Retrieves reference to element at random number position.
+	T& get(typename T::size_type pos) { return get(pos, m_root, 0); }
 	//! Yields the sum over all elements in the tree.
 	typename T::size_type get_size() const { return m_root->size; }
 private:
 	//! Internal, recursive version of add().
 	bool add(Bucket **node, const T& element, unsigned depth_remaining);
+	//! Internal, recursive version of remove().
+	T remove(typename T::size_type pos, Bucket *node, typename T::size_type sum);
 	//! Internal, recursive version of get().
-	T get(typename T::size_type pos, Bucket *node, typename T::size_type sum);
+	T& get(typename T::size_type pos, Bucket *node, typename T::size_type sum);
 };
 
 // template implementation
@@ -137,7 +141,7 @@ bool SumTree<T, BUCKETSIZE>::add(Bucket **node, const T& element, unsigned depth
 }
 
 template <typename T, unsigned BUCKETSIZE>
-T SumTree<T, BUCKETSIZE>::get(typename T::size_type pos, Bucket *node, typename T::size_type sum)
+T SumTree<T, BUCKETSIZE>::remove(typename T::size_type pos, Bucket *node, typename T::size_type sum)
 {
 	// sanity check
 	assert(pos >= sum && pos < sum + node->size);
@@ -153,7 +157,7 @@ T SumTree<T, BUCKETSIZE>::get(typename T::size_type pos, Bucket *node, typename
 
 		// found containing bucket, recurse
 		sum -= (*it)->size;
-		T e = get(pos, *it, sum);
+		T e = remove(pos, *it, sum);
 		node->size -= e.size();
 		// remove empty (or, at least, zero-sized) child?
 		if ((*it)->size == 0) {
@@ -184,6 +188,44 @@ T SumTree<T, BUCKETSIZE>::get(typename T::size_type pos, Bucket *node, typename
 	return T();
 }
 
+template <typename T, unsigned BUCKETSIZE>
+T& SumTree<T, BUCKETSIZE>::get(typename T::size_type pos, Bucket *node, typename T::size_type sum)
+{
+	// sanity check
+	assert(pos >= sum && pos < sum + node->size);
+
+	// will only be entered for inner nodes
+	for (typename std::vector<Bucket *>::iterator it = node->children.begin();
+		it != node->children.end(); ) {
+		sum += (*it)->size;
+		if (sum <= pos) {
+			++it;
+			continue;
+		}
+
+		// found containing bucket, recurse
+		sum -= (*it)->size;
+		return get(pos, *it, sum);
+	}
+
+	// will only be entered for leaf nodes
+	for (typename std::vector<T>::iterator it = node->elements.begin();
+		it != node->elements.end(); ) {
+		sum += it->size();
+		if (sum <= pos) {
+			++it;
+			continue;
+		}
+
+		// found pilot
+		return *it;
+	}
+
+	// this should never happen
+	assert(0);
+	return *(new T);
+}
+
 } // namespace
 
 #endif
diff --git a/src/core/util/testing/SumTreeTest.cc b/src/core/util/testing/SumTreeTest.cc
index 1757cd17..9824a034 100644
--- a/src/core/util/testing/SumTreeTest.cc
+++ b/src/core/util/testing/SumTreeTest.cc
@@ -28,7 +28,7 @@ int main()
 		uint64_t pos = tree.get_size() / 2;
 		LOG << "MAIN tree.get_size() = " << tree.get_size()
 			<< ", trying to retrieve pos = " << pos << endl;
-		Pilot p = tree.get(pos);
+		Pilot p = tree.remove(pos);
 		LOG << "MAIN retrieved pilot with duration " << p.duration << endl;
 	}
 }
diff --git a/tools/prune-trace/FESamplingPruner.cc b/tools/prune-trace/FESamplingPruner.cc
index cebd7292..e8f0710f 100644
--- a/tools/prune-trace/FESamplingPruner.cc
+++ b/tools/prune-trace/FESamplingPruner.cc
@@ -161,7 +161,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 
 		for (uint64_t i = 0; i < samplerows; ++i) {
 			uint64_t pos = my_rand(pop.get_size() - 1);
-			Pilot p = pop.get(pos);
+			Pilot p = pop.remove(pos);
 			ss << "(0," << variant.id << "," << p.instr2 << "," << p.instr2
 				<< "," << p.instr2_absolute << "," << p.data_address
 				<< ",1," << m_method_id << ")";
@@ -203,7 +203,7 @@ bool FESamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 
 		for (uint64_t i = 0; i < samplerows; ++i) {
 			uint64_t pos = my_rand(pop.get_size() - 1);
-			Pilot p = pop.get(pos);
+			Pilot p = pop.remove(pos);
 			ss << "(" << variant.id << "," << p.instr2
 				<< "," << p.data_address << "," << m_method_id
 				<< "," << p.id << ")";

From f8e0f1bb3f1fa0838a6278ecb410f4209ff5c42d Mon Sep 17 00:00:00 2001
From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Date: Wed, 14 Jan 2015 23:39:33 +0100
Subject: [PATCH 08/10] util: add SumTree::iterator

Change-Id: I8304b64634fa3ab92a126fe5d942674b26334b3d
---
 src/core/util/SumTree.hpp            | 59 ++++++++++++++++++++++++++++
 src/core/util/testing/SumTreeTest.cc |  8 +++-
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/src/core/util/SumTree.hpp b/src/core/util/SumTree.hpp
index 9d50269b..d5df54da 100644
--- a/src/core/util/SumTree.hpp
+++ b/src/core/util/SumTree.hpp
@@ -4,6 +4,7 @@
 #include <assert.h>
 #include <stdint.h>
 #include <vector>
+#include <stack>
 
 // The SumTree implements an efficient tree data structure for
 // "roulette-wheel" sampling, or "sampling with fault expansion", i.e.,
@@ -44,6 +45,64 @@ class SumTree {
 		std::vector<T> elements;
 	};
 
+public:
+	//! Iterator
+	class TreeIterator : public std::iterator<std::input_iterator_tag, T> {
+		//! Buckets and corresponding element indexes down the tree
+		std::stack<std::pair<Bucket *, unsigned> > hierarchy;
+	public:
+		TreeIterator() {}
+//MyIterator(int* x) :p(x) {}
+		TreeIterator(const TreeIterator& i) : hierarchy(i.hierarchy) { }
+		TreeIterator(const SumTree<T, BUCKETSIZE>& tree)
+		{
+			// go down until we see leaves
+			hierarchy.push(std::pair<Bucket *, unsigned>(tree.m_root, 0));
+			while (!hierarchy.top().first->elements.size() && hierarchy.top().first->children.size() > 0) {
+				hierarchy.push(std::pair<Bucket *, unsigned>(hierarchy.top().first->children[hierarchy.top().second], 0));
+			}
+		}
+		TreeIterator& operator++()
+		{
+			// advance index in the current level
+			hierarchy.top().second++;
+			if (hierarchy.top().second < hierarchy.top().first->elements.size()) {
+				return *this;
+			}
+			// current level is exhausted, go back up to a not yet finished level
+			do {
+				hierarchy.pop();
+			} while (!hierarchy.empty()
+				&& ++hierarchy.top().second >= hierarchy.top().first->children.size());
+			// at the end?
+			if (hierarchy.empty()) {
+				return *this;
+			}
+			// go down until we see leaves again
+			do {
+				hierarchy.push(std::pair<Bucket *, unsigned>(hierarchy.top().first->children[hierarchy.top().second], 0));
+			} while (!hierarchy.top().first->elements.size() && hierarchy.top().first->children.size() > 0);
+			return *this;
+		}
+		TreeIterator operator++(int) { TreeIterator tmp(*this); operator++(); return tmp; }
+	    bool operator==(const TreeIterator& rhs) { return hierarchy == rhs.hierarchy; }
+		bool operator!=(const TreeIterator& rhs) { return hierarchy != rhs.hierarchy; }
+		T& operator*() { return hierarchy.top().first->elements[hierarchy.top().second]; }
+		T *operator->() { return &(operator*()); }
+	};
+	typedef TreeIterator iterator;
+
+	iterator begin()
+	{
+		return iterator(*this);
+	}
+
+	iterator end()
+	{
+		return iterator();
+	}
+
+private:
 	//! Root node
 	Bucket *m_root;
 	//! Tree depth: nodes at level m_depth are leaf nodes, others are inner nodes
diff --git a/src/core/util/testing/SumTreeTest.cc b/src/core/util/testing/SumTreeTest.cc
index 9824a034..ad9d717b 100644
--- a/src/core/util/testing/SumTreeTest.cc
+++ b/src/core/util/testing/SumTreeTest.cc
@@ -17,13 +17,19 @@ struct Pilot {
 
 int main()
 {
-	fail::SumTree<Pilot, 2> tree;
+	typedef fail::SumTree<Pilot, 2> sumtree_type;
+	sumtree_type tree;
 	for (int i = 0; i <= 20; ++i) {
 		Pilot p;
 		p.duration = i;
 		tree.add(p);
 	}
 
+	LOG << "tree contents:" << endl;
+	for (sumtree_type::iterator it = tree.begin(); it != tree.end(); ++it) {
+		LOG << it->size() << endl;
+	}
+
 	while (tree.get_size() > 0) {
 		uint64_t pos = tree.get_size() / 2;
 		LOG << "MAIN tree.get_size() = " << tree.get_size()

From 79211fd31de458143ffb362701d5154fb0cf28ca Mon Sep 17 00:00:00 2001
From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Date: Wed, 14 Jan 2015 23:40:03 +0100
Subject: [PATCH 09/10] prune-trace: add SamplingPruner

The SamplingPruner implements "normal" sampling with equivalence-class
reuse.  Unlike the FESamplingPruner, the SamplingPruner implements
uniform fault-space sampling that counts multiple hits of an
equivalence class.

This change modifies the database schema: it adds the "weight" column
to the fspgroup table.  Update existing databases with this query:

  ALTER TABLE fspgroup ADD COLUMN weight INT UNSIGNED;

Change-Id: I668fc9b25fc4d79a60aa1ef8d69cdf5fa076cc6d
---
 tools/prune-trace/CMakeLists.txt       |   1 +
 tools/prune-trace/FESamplingPruner.hpp |   1 -
 tools/prune-trace/Pruner.cc            |   1 +
 tools/prune-trace/SamplingPruner.cc    | 236 +++++++++++++++++++++++++
 tools/prune-trace/SamplingPruner.hpp   |  37 ++++
 tools/prune-trace/main.cc              |   3 +
 6 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 tools/prune-trace/SamplingPruner.cc
 create mode 100644 tools/prune-trace/SamplingPruner.hpp

diff --git a/tools/prune-trace/CMakeLists.txt b/tools/prune-trace/CMakeLists.txt
index 4043ffd4..aadf6545 100644
--- a/tools/prune-trace/CMakeLists.txt
+++ b/tools/prune-trace/CMakeLists.txt
@@ -2,6 +2,7 @@ set(SRCS
   Pruner.cc
   BasicPruner.cc
   FESamplingPruner.cc
+  SamplingPruner.cc
 )
 
 find_package(MySQL REQUIRED)
diff --git a/tools/prune-trace/FESamplingPruner.hpp b/tools/prune-trace/FESamplingPruner.hpp
index d33c1e94..c938be20 100644
--- a/tools/prune-trace/FESamplingPruner.hpp
+++ b/tools/prune-trace/FESamplingPruner.hpp
@@ -30,7 +30,6 @@ public:
 
 	void getAliases(std::deque<std::string> *aliases) {
 		aliases->push_back("FESamplingPruner");
-		aliases->push_back("sampling");
 	}
 
 private:
diff --git a/tools/prune-trace/Pruner.cc b/tools/prune-trace/Pruner.cc
index 3e8c7e10..72c4854a 100644
--- a/tools/prune-trace/Pruner.cc
+++ b/tools/prune-trace/Pruner.cc
@@ -100,6 +100,7 @@ bool Pruner::create_database() {
 	    "  data_address    int(10) unsigned NOT NULL,"
 	    "  fspmethod_id    int(11) NOT NULL,"
 	    "  pilot_id        int(11) NOT NULL,"
+	    "  weight int(11) UNSIGNED,"
 	    "  PRIMARY KEY (variant_id, data_address, instr2, fspmethod_id),"
 	    "  KEY joinresults (pilot_id,fspmethod_id)) engine=MyISAM";
 
diff --git a/tools/prune-trace/SamplingPruner.cc b/tools/prune-trace/SamplingPruner.cc
new file mode 100644
index 00000000..3a382a9d
--- /dev/null
+++ b/tools/prune-trace/SamplingPruner.cc
@@ -0,0 +1,236 @@
+#include <sstream>
+#include <stdlib.h>
+#include <fstream>
+#include <algorithm>
+#include "SamplingPruner.hpp"
+#include "util/Logger.hpp"
+#include "util/CommandLine.hpp"
+#include "util/SumTree.hpp"
+
+static fail::Logger LOG("SamplingPruner");
+using std::endl;
+
+struct WeightedPilot {
+	uint64_t duration;
+
+	uint32_t instr2;
+	union {
+	uint32_t instr2_absolute;
+	uint32_t id;
+	};
+	uint32_t data_address;
+	uint32_t weight;
+
+	typedef uint64_t size_type;
+	size_type size() const { return duration; }
+};
+
+bool SamplingPruner::commandline_init()
+{
+	fail::CommandLine &cmd = fail::CommandLine::Inst();
+	SAMPLESIZE = cmd.addOption("", "samplesize", Arg::Required,
+		"--samplesize N \tNumber of samples to take (per variant)");
+	USE_KNOWN_RESULTS = cmd.addOption("", "use-known-results", Arg::None,
+		"--use-known-results \tReuse known results from a campaign with the 'basic' pruner ");
+	NO_WEIGHTING = cmd.addOption("", "no-weighting", Arg::None,
+		"--no-weighting \tDisable weighted sampling (weight = 1 for all ECs) "
+		"(don't do this unless you know what you're doing)");
+	return true;
+}
+
+bool SamplingPruner::prune_all()
+{
+	fail::CommandLine &cmd = fail::CommandLine::Inst();
+	if (!cmd[SAMPLESIZE]) {
+		LOG << "parameter --samplesize required, aborting" << endl;
+		return false;
+	}
+	m_samplesize = strtoul(cmd[SAMPLESIZE].first()->arg, 0, 10);
+
+	if (cmd[USE_KNOWN_RESULTS]) {
+		m_use_known_results = true;
+	}
+
+	// for each variant:
+	for (std::vector<fail::Database::Variant>::const_iterator it = m_variants.begin();
+		it != m_variants.end(); ++it) {
+		if (!sampling_prune(*it)) {
+			return false;
+		}
+	}
+
+	return true;
+}
+
+// TODO: replace with a less syscall-intensive RNG
+// TODO: deduplicate (copied from FESamplingPruner), put in a central place
+static std::ifstream dev_urandom("/dev/urandom", std::ifstream::binary);
+static uint64_t my_rand(uint64_t limit)
+{
+	// find smallest bitpos that satisfies (1 << bitpos) > limit
+	int bitpos = 0;
+	while (limit >> bitpos) {
+		bitpos++;
+	}
+
+	uint64_t retval;
+
+	do {
+		dev_urandom.read((char *) &retval, sizeof(retval));
+		retval &= (1ULL << bitpos) - 1;
+	} while (retval > limit);
+
+	return retval;
+}
+
+bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
+{
+	typedef fail::SumTree<WeightedPilot> sumtree_type;
+	sumtree_type pop; // sample population
+	std::stringstream ss;
+	MYSQL_RES *res;
+	MYSQL_ROW row;
+
+	uint64_t pilotcount = 0;
+
+	if (!m_use_known_results) {
+		LOG << "loading trace entries for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
+
+		// load trace entries
+		ss << "SELECT instr2, instr2_absolute, data_address, time2-time1+1 AS duration"
+			<< " FROM trace"
+			<< " WHERE variant_id = " << variant.id
+			<< " AND accesstype = 'R'";
+		res = db->query_stream(ss.str().c_str());
+		ss.str("");
+		if (!res) return false;
+		while ((row = mysql_fetch_row(res))) {
+			WeightedPilot p;
+			p.instr2 = strtoul(row[0], 0, 10);
+			p.instr2_absolute = strtoul(row[1], 0, 10);
+			p.data_address = strtoul(row[2], 0, 10);
+			p.duration = m_weighting ? strtoull(row[3], 0, 10) : 1;
+			p.weight = 0;
+			pop.add(p);
+			++pilotcount;
+		}
+		mysql_free_result(res);
+	} else {
+		LOG << "loading pilots for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
+
+		// load fsppilot entries
+		ss << "SELECT p.id, p.instr2, p.data_address, t.time2 - t.time1 + 1 AS duration"
+			<< " FROM fsppilot p"
+			<< " JOIN trace t"
+			<< " ON t.variant_id = p.variant_id AND t.data_address = p.data_address AND t.instr2 = p.instr2"
+			<< " WHERE p.fspmethod_id = " << db->get_fspmethod_id("basic")
+			<< " AND p.variant_id = " << variant.id
+			<< " AND p.known_outcome = 0";
+		res = db->query_stream(ss.str().c_str());
+		ss.str("");
+		if (!res) return false;
+		while ((row = mysql_fetch_row(res))) {
+			WeightedPilot p;
+			p.id = strtoul(row[0], 0, 10);
+			p.instr2 = strtoul(row[1], 0, 10);
+			p.data_address = strtoul(row[2], 0, 10);
+			p.duration = m_weighting ? strtoull(row[3], 0, 10) : 1;
+			p.weight = 0;
+			pop.add(p);
+			++pilotcount;
+		}
+		mysql_free_result(res);
+	}
+
+	LOG << "loaded " << pilotcount << " entries, sampling "
+		<< m_samplesize << " fault-space coordinates ..." << endl;
+
+	ss << "INSERT INTO fsppilot (known_outcome, variant_id, instr2, injection_instr, "
+		<< "injection_instr_absolute, data_address, data_width, fspmethod_id) VALUES ";
+	std::string insert_sql(ss.str());
+	ss.str("");
+
+	uint64_t popsize = pop.get_size(); // stays constant
+	uint64_t num_fsppilot_entries = 0;
+	for (uint64_t i = 0; i < m_samplesize; ++i) {
+		uint64_t pos = my_rand(popsize - 1);
+		WeightedPilot& p = pop.get(pos);
+		p.weight++;
+		// first time we sample this pilot?
+		if (!m_use_known_results && p.weight == 1) {
+			ss << "(0," << variant.id << "," << p.instr2 << "," << p.instr2
+				<< "," << p.instr2_absolute << "," << p.data_address
+				<< ",1," << m_method_id << ")";
+			db->insert_multiple(insert_sql.c_str(), ss.str().c_str());
+			ss.str("");
+			++num_fsppilot_entries;
+		}
+	}
+
+	if (!m_use_known_results) {
+		db->insert_multiple();
+		LOG << "created " << num_fsppilot_entries << " fsppilot entries" << std::endl;
+	}
+
+	// fspgroup entries for sampled trace entries
+	if (!m_use_known_results) {
+		ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id, weight) "
+		   << "SELECT p.variant_id, p.instr2, p.data_address, " << m_method_id << ", p.id, 1 "
+		   << "FROM fsppilot p "
+		   << "WHERE known_outcome = 0 AND p.fspmethod_id = " << m_method_id << " "
+		   << "AND p.variant_id = " << variant.id;
+
+		if (!db->query(ss.str().c_str())) return false;
+		ss.str("");
+		uint64_t num_fspgroup_entries = db->affected_rows();
+		LOG << "created " << num_fspgroup_entries << " fspgroup entries" << std::endl;
+
+		// FIXME is this faster than manually INSERTing all fspgroup entries?
+		num_fspgroup_entries = 0;
+		LOG << "updating fspgroup entries with weight > 1 ..." << std::endl;
+		for (sumtree_type::iterator it = pop.begin(); it != pop.end(); ++it) {
+			if (it->weight <= 1) {
+				continue;
+			}
+			++num_fspgroup_entries;
+			ss << "UPDATE fspgroup SET weight = " << it->weight <<
+				" WHERE variant_id = " << variant.id <<
+				" AND instr2 = " << it->instr2 <<
+				" AND data_address = " << it->data_address <<
+				" AND fspmethod_id = " << m_method_id;
+			// pilot_id is known but should be identical
+			if (!db->query(ss.str().c_str())) return false;
+			if (db->affected_rows() != 1) {
+				LOG << "something is wrong, query affected unexpected ("
+					<< db->affected_rows()
+					<< " != 1) number of rows: "
+					<< ss.str() << std::endl;
+			}
+			ss.str("");
+		}
+		LOG << "updated " << num_fspgroup_entries << " fspgroup entries" << std::endl;
+	} else {
+		uint64_t num_fspgroup_entries = 0;
+
+		LOG << "creating fspgroup entries ..." << std::endl;
+
+		ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id, weight) VALUES ";
+		insert_sql = ss.str();
+		ss.str("");
+
+		for (sumtree_type::iterator it = pop.begin(); it != pop.end(); ++it) {
+			if (it->weight == 0) {
+				continue;
+			}
+			++num_fspgroup_entries;
+			ss << "(" << variant.id << "," << it->instr2 << "," << it->data_address
+				<< "," << m_method_id << "," << it->id << "," << it->weight << ")";
+			db->insert_multiple(insert_sql.c_str(), ss.str().c_str());
+			ss.str("");
+		}
+		db->insert_multiple();
+		LOG << "created " << num_fspgroup_entries << " fspgroup entries" << std::endl;
+	}
+
+	return true;
+}
diff --git a/tools/prune-trace/SamplingPruner.hpp b/tools/prune-trace/SamplingPruner.hpp
new file mode 100644
index 00000000..db129f29
--- /dev/null
+++ b/tools/prune-trace/SamplingPruner.hpp
@@ -0,0 +1,37 @@
+#ifndef __SAMPLING_PRUNER_H__
+#define __SAMPLING_PRUNER_H__
+
+#include <stdint.h>
+#include "Pruner.hpp"
+#include "util/CommandLine.hpp"
+
+///
+/// SamplingPruner: implements sampling with equivalence-class reuse
+///
+/// Unlike the FESamplingPruner, the SamplingPruner implements uniform
+/// fault-space sampling that counts multiple hits of an equivalence class.
+///
+class SamplingPruner : public Pruner {
+	fail::CommandLine::option_handle SAMPLESIZE;
+	fail::CommandLine::option_handle USE_KNOWN_RESULTS;
+	fail::CommandLine::option_handle NO_WEIGHTING;
+
+	uint64_t m_samplesize;
+	bool m_use_known_results, m_weighting;
+
+public:
+	SamplingPruner() : m_samplesize(0), m_use_known_results(false), m_weighting(true) { }
+	virtual std::string method_name() { return "sampling"; }
+	virtual bool commandline_init();
+	virtual bool prune_all();
+
+	void getAliases(std::deque<std::string> *aliases) {
+		aliases->push_back("SamplingPruner");
+		aliases->push_back("sampling");
+	}
+
+private:
+	bool sampling_prune(const fail::Database::Variant& variant);
+};
+
+#endif
diff --git a/tools/prune-trace/main.cc b/tools/prune-trace/main.cc
index 295d5c9f..c0867495 100644
--- a/tools/prune-trace/main.cc
+++ b/tools/prune-trace/main.cc
@@ -14,6 +14,7 @@ using std::endl;
 #include "Pruner.hpp"
 #include "BasicPruner.hpp"
 #include "FESamplingPruner.hpp"
+#include "SamplingPruner.hpp"
 
 int main(int argc, char *argv[]) {
 	std::string username, hostname, database;
@@ -26,6 +27,8 @@ int main(int argc, char *argv[]) {
 	registry.add(&basicprunerleft);
 	FESamplingPruner fesamplingpruner;
 	registry.add(&fesamplingpruner);
+	SamplingPruner samplingpruner;
+	registry.add(&samplingpruner);
 
 	std::string pruners = registry.getPrimeAliasesCSV();
 

From 4cbcf30b7c7b4d72a7ef395b6ec67ffb7b0fdde7 Mon Sep 17 00:00:00 2001
From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Date: Thu, 15 Jan 2015 19:11:33 +0100
Subject: [PATCH 10/10] prune-trace: incremental mode for SamplingPruner

The --incremental switch allows adding more samples if the resulting
confidence intervals are not yet satisfactory.

Change-Id: I65dc99522f45f8a4eaf4ce68e832f7636585381d
---
 tools/prune-trace/Pruner.cc          |   6 +-
 tools/prune-trace/Pruner.hpp         |  10 ++-
 tools/prune-trace/SamplingPruner.cc  | 102 ++++++++++++++++++++-------
 tools/prune-trace/SamplingPruner.hpp |   6 +-
 tools/prune-trace/main.cc            |  18 ++++-
 5 files changed, 110 insertions(+), 32 deletions(-)

diff --git a/tools/prune-trace/Pruner.cc b/tools/prune-trace/Pruner.cc
index 72c4854a..a3491b75 100644
--- a/tools/prune-trace/Pruner.cc
+++ b/tools/prune-trace/Pruner.cc
@@ -13,7 +13,7 @@ bool Pruner::init(
 		const std::vector<std::string>& variants_exclude,
 		const std::vector<std::string>& benchmarks,
 		const std::vector<std::string>& benchmarks_exclude,
-		bool overwrite)
+		bool overwrite, bool incremental)
 {
 	m_variants = db->get_variants(
 		variants, variants_exclude,
@@ -26,8 +26,8 @@ bool Pruner::init(
 	    << std::endl;
 
 	// make sure we only prune variants that haven't been pruned previously
-	// (unless we run with --overwrite)
-	if (!overwrite) {
+	// (unless we run with --overwrite or --incremental)
+	if (!overwrite && !incremental) {
 		for (std::vector<fail::Database::Variant>::iterator it = m_variants.begin();
 			it != m_variants.end(); ) {
 			std::stringstream ss;
diff --git a/tools/prune-trace/Pruner.hpp b/tools/prune-trace/Pruner.hpp
index a0ec044d..30fd4e63 100644
--- a/tools/prune-trace/Pruner.hpp
+++ b/tools/prune-trace/Pruner.hpp
@@ -21,7 +21,7 @@ public:
 		const std::vector<std::string>& variants_exclude,
 		const std::vector<std::string>& benchmarks,
 		const std::vector<std::string>& benchmarks_exclude,
-		bool overwrite);
+		bool overwrite, bool incremental);
 
 	/**
 	 * Callback function that can be used to add command line options
@@ -35,6 +35,14 @@ public:
 	virtual bool clear_database();
 
 	virtual bool prune_all() = 0;
+
+	/**
+	 * Tell the pruner to work incrementally.  For example, a sampling pruner
+	 * could add more pilots to already existing ones (which already may be
+	 * associated with fault-injection results).  Returns false if the pruner
+	 * is incapable of working in the desired mode.
+	 */
+	virtual bool set_incremental(bool incremental) { return !incremental; }
 };
 
 #endif
diff --git a/tools/prune-trace/SamplingPruner.cc b/tools/prune-trace/SamplingPruner.cc
index 3a382a9d..ce362fcc 100644
--- a/tools/prune-trace/SamplingPruner.cc
+++ b/tools/prune-trace/SamplingPruner.cc
@@ -13,11 +13,9 @@ using std::endl;
 struct WeightedPilot {
 	uint64_t duration;
 
-	uint32_t instr2;
-	union {
-	uint32_t instr2_absolute;
 	uint32_t id;
-	};
+	uint32_t instr2;
+	uint32_t instr2_absolute;
 	uint32_t data_address;
 	uint32_t weight;
 
@@ -94,13 +92,27 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 	uint64_t pilotcount = 0;
 
 	if (!m_use_known_results) {
-		LOG << "loading trace entries for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
+		LOG << "loading trace entries "
+			<< (m_incremental ? "and existing pilots " : "")
+			<< "for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
 
-		// load trace entries
-		ss << "SELECT instr2, instr2_absolute, data_address, time2-time1+1 AS duration"
-			<< " FROM trace"
-			<< " WHERE variant_id = " << variant.id
-			<< " AND accesstype = 'R'";
+		if (!m_incremental) {
+			// load trace entries
+			ss << "SELECT instr2, instr2_absolute, data_address, time2-time1+1 AS duration"
+				" FROM trace"
+				" WHERE variant_id = " << variant.id <<
+				" AND accesstype = 'R'";
+		} else {
+			// load trace entries and existing pilots
+			ss << "SELECT t.instr2, t.instr2_absolute, t.data_address, t.time2-t.time1+1 AS duration,"
+				" IFNULL(g.pilot_id, 0), IFNULL(g.weight, 0)"
+				" FROM trace t"
+				" LEFT JOIN fspgroup g"
+				" ON t.variant_id = g.variant_id AND t.data_address = g.data_address AND t.instr2 = g.instr2"
+				" AND g.fspmethod_id = " << m_method_id <<
+				" WHERE t.variant_id = " << variant.id <<
+				" AND t.accesstype = 'R'";
+		}
 		res = db->query_stream(ss.str().c_str());
 		ss.str("");
 		if (!res) return false;
@@ -110,7 +122,8 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 			p.instr2_absolute = strtoul(row[1], 0, 10);
 			p.data_address = strtoul(row[2], 0, 10);
 			p.duration = m_weighting ? strtoull(row[3], 0, 10) : 1;
-			p.weight = 0;
+			p.id = m_incremental ? strtoul(row[4], 0, 10) : 0;
+			p.weight = m_incremental ? strtoul(row[5], 0, 10) : 0;
 			pop.add(p);
 			++pilotcount;
 		}
@@ -118,14 +131,28 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 	} else {
 		LOG << "loading pilots for " << variant.variant << "/" << variant.benchmark << " ..." << endl;
 
-		// load fsppilot entries
-		ss << "SELECT p.id, p.instr2, p.data_address, t.time2 - t.time1 + 1 AS duration"
-			<< " FROM fsppilot p"
-			<< " JOIN trace t"
-			<< " ON t.variant_id = p.variant_id AND t.data_address = p.data_address AND t.instr2 = p.instr2"
-			<< " WHERE p.fspmethod_id = " << db->get_fspmethod_id("basic")
-			<< " AND p.variant_id = " << variant.id
-			<< " AND p.known_outcome = 0";
+		if (!m_incremental) {
+			// load fsppilot entries
+			ss << "SELECT p.id, p.instr2, p.data_address, t.time2 - t.time1 + 1 AS duration"
+				" FROM fsppilot p"
+				" JOIN trace t"
+				" ON t.variant_id = p.variant_id AND t.data_address = p.data_address AND t.instr2 = p.instr2"
+				" WHERE p.fspmethod_id = " << db->get_fspmethod_id("basic") <<
+				" AND p.variant_id = " << variant.id <<
+				" AND p.known_outcome = 0";
+		} else {
+			// load fsppilot entries and existing sampling pilots
+			ss << "SELECT p.id, p.instr2, p.data_address, t.time2 - t.time1 + 1 AS duration, IFNULL(g.weight, 0)"
+				" FROM fsppilot p"
+				" JOIN trace t"
+				" ON t.variant_id = p.variant_id AND t.data_address = p.data_address AND t.instr2 = p.instr2"
+				" LEFT JOIN fspgroup g"
+				" ON t.variant_id = g.variant_id AND t.data_address = g.data_address AND t.instr2 = g.instr2"
+				" AND g.fspmethod_id = " << m_method_id <<
+				" WHERE p.fspmethod_id = " << db->get_fspmethod_id("basic") <<
+				" AND p.variant_id = " << variant.id <<
+				" AND p.known_outcome = 0";
+		}
 		res = db->query_stream(ss.str().c_str());
 		ss.str("");
 		if (!res) return false;
@@ -135,7 +162,7 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 			p.instr2 = strtoul(row[1], 0, 10);
 			p.data_address = strtoul(row[2], 0, 10);
 			p.duration = m_weighting ? strtoull(row[3], 0, 10) : 1;
-			p.weight = 0;
+			p.weight = m_incremental ? strtoull(row[4], 0, 10) : 0;
 			pop.add(p);
 			++pilotcount;
 		}
@@ -158,6 +185,8 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 		p.weight++;
 		// first time we sample this pilot?
 		if (!m_use_known_results && p.weight == 1) {
+			// no need to special-case existing pilots (incremental mode), as
+			// their initial weight is supposed to be at least 1
 			ss << "(0," << variant.id << "," << p.instr2 << "," << p.instr2
 				<< "," << p.instr2_absolute << "," << p.data_address
 				<< ",1," << m_method_id << ")";
@@ -174,7 +203,13 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 
 	// fspgroup entries for sampled trace entries
 	if (!m_use_known_results) {
-		ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id, weight) "
+		if (!m_incremental) {
+			ss << "INSERT";
+		} else {
+			// this spares us from having to delete existing pilots beforehand
+			ss << "REPLACE";
+		}
+		ss << " INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id, weight) "
 		   << "SELECT p.variant_id, p.instr2, p.data_address, " << m_method_id << ", p.id, 1 "
 		   << "FROM fsppilot p "
 		   << "WHERE known_outcome = 0 AND p.fspmethod_id = " << m_method_id << " "
@@ -182,7 +217,14 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 
 		if (!db->query(ss.str().c_str())) return false;
 		ss.str("");
-		uint64_t num_fspgroup_entries = db->affected_rows();
+		uint64_t num_fspgroup_entries;
+		if (!m_incremental) {
+			num_fspgroup_entries = db->affected_rows();
+		} else {
+			// with REPLACE INTO, affected_rows does not yield the number of
+			// new rows; take num_fsppilot_entries instead
+			num_fspgroup_entries = num_fsppilot_entries;
+		}
 		LOG << "created " << num_fspgroup_entries << " fspgroup entries" << std::endl;
 
 		// FIXME is this faster than manually INSERTing all fspgroup entries?
@@ -208,13 +250,25 @@ bool SamplingPruner::sampling_prune(const fail::Database::Variant& variant)
 			}
 			ss.str("");
 		}
-		LOG << "updated " << num_fspgroup_entries << " fspgroup entries" << std::endl;
+
+		if (!m_incremental) {
+			LOG << "updated " << num_fspgroup_entries << " fspgroup entries" << std::endl;
+		} else {
+			// we don't know how many rows we really updated
+			LOG << "updated fspgroup entries" << std::endl;
+		}
 	} else {
 		uint64_t num_fspgroup_entries = 0;
 
 		LOG << "creating fspgroup entries ..." << std::endl;
 
-		ss << "INSERT INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id, weight) VALUES ";
+		if (!m_incremental) {
+			ss << "INSERT";
+		} else {
+			// this spares us from having to delete existing pilots beforehand
+			ss << "REPLACE";
+		}
+		ss << " INTO fspgroup (variant_id, instr2, data_address, fspmethod_id, pilot_id, weight) VALUES ";
 		insert_sql = ss.str();
 		ss.str("");
 
diff --git a/tools/prune-trace/SamplingPruner.hpp b/tools/prune-trace/SamplingPruner.hpp
index db129f29..9fe6a2ee 100644
--- a/tools/prune-trace/SamplingPruner.hpp
+++ b/tools/prune-trace/SamplingPruner.hpp
@@ -17,10 +17,10 @@ class SamplingPruner : public Pruner {
 	fail::CommandLine::option_handle NO_WEIGHTING;
 
 	uint64_t m_samplesize;
-	bool m_use_known_results, m_weighting;
+	bool m_use_known_results, m_weighting, m_incremental;
 
 public:
-	SamplingPruner() : m_samplesize(0), m_use_known_results(false), m_weighting(true) { }
+	SamplingPruner() : m_samplesize(0), m_use_known_results(false), m_weighting(true), m_incremental(false) { }
 	virtual std::string method_name() { return "sampling"; }
 	virtual bool commandline_init();
 	virtual bool prune_all();
@@ -30,6 +30,8 @@ public:
 		aliases->push_back("sampling");
 	}
 
+	virtual bool set_incremental(bool incremental) { m_incremental = incremental; return true; }
+
 private:
 	bool sampling_prune(const fail::Database::Variant& variant);
 };
diff --git a/tools/prune-trace/main.cc b/tools/prune-trace/main.cc
index c0867495..0a391889 100644
--- a/tools/prune-trace/main.cc
+++ b/tools/prune-trace/main.cc
@@ -65,12 +65,20 @@ int main(int argc, char *argv[]) {
 	CommandLine::option_handle OVERWRITE =
 		cmd.addOption("", "overwrite", Arg::None,
 			"--overwrite \tOverwrite already existing pruning data (the default is to skip variants with existing entries)");
+	CommandLine::option_handle INCREMENTAL =
+		cmd.addOption("", "incremental", Arg::None,
+			"--incremental \tTell the pruner to work incrementally (if supported)");
 
 	if (!cmd.parse()) {
 		std::cerr << "Error parsing arguments." << std::endl;
 		exit(-1);
 	}
 
+	if (cmd[OVERWRITE] && cmd[INCREMENTAL]) {
+		std::cerr << "--overwrite and --incremental cannot be used together." << std::endl;
+		exit(-1);
+	}
+
 	Pruner *pruner;
 	std::string pruner_name = "BasicPruner";
 	if (cmd[PRUNER]) {
@@ -110,6 +118,11 @@ int main(int argc, char *argv[]) {
 	Database *db = Database::cmdline_connect();
 	pruner->set_db(db);
 
+	if (cmd[INCREMENTAL] && !pruner->set_incremental(true)) {
+		std::cerr << "Pruner is incapable of running incrementally" << std::endl;
+		exit(-1);
+	}
+
 	std::vector<std::string> variants, benchmarks, variants_exclude, benchmarks_exclude;
 	if (cmd[VARIANT]) {
 		for (option::Option *o = cmd[VARIANT]; o; o = o->next()) {
@@ -150,7 +163,8 @@ int main(int argc, char *argv[]) {
 		exit(-1);
 	}
 
-	if (!pruner->init(variants, variants_exclude, benchmarks, benchmarks_exclude, cmd[OVERWRITE])) {
+	if (!pruner->init(variants, variants_exclude, benchmarks, benchmarks_exclude,
+		cmd[OVERWRITE], cmd[INCREMENTAL])) {
 		LOG << "pruner->init() failed" << endl;
 		exit(-1);
 	}
@@ -158,7 +172,7 @@ int main(int argc, char *argv[]) {
 	////////////////////////////////////////////////////////////////
 	// Do the actual pruning
 	////////////////////////////////////////////////////////////////
-	if (!cmd[NO_DELETE] && cmd[OVERWRITE] && !pruner->clear_database()) {
+	if (!cmd[NO_DELETE] && cmd[OVERWRITE] && !cmd[INCREMENTAL] && !pruner->clear_database()) {
 		LOG << "clear_database() failed" << endl;
 		exit(-1);
 	}