From 6ebd9b003af5a7dd5eac6214351b606a80475e46 Mon Sep 17 00:00:00 2001
From: Christian Dietrich
Date: Thu, 5 Jun 2014 12:08:42 +0200
Subject: [PATCH] DatabaseExperiment: base class for distributed fail experiments

DatabaseExperiment is a base class that concrete experiments inherit
from. It handles the communication with the campaign server, fast
forwards to the fault location, injects the fault and hands the
experiment outcome to the child class through callbacks.

Change-Id: I1fb676da6c704cd570a638f0dfaadd4f1a9845e4
---
 .../comm/DatabaseCampaignMessage.proto.in     |  10 +-
 src/core/efw/CMakeLists.txt                   |   2 +
 src/core/efw/DatabaseExperiment.cc            | 214 ++++++++++++++++++
 src/core/efw/DatabaseExperiment.hpp           | 165 ++++++++++++++
 4 files changed, 390 insertions(+), 1 deletion(-)
 create mode 100644 src/core/efw/DatabaseExperiment.cc
 create mode 100644 src/core/efw/DatabaseExperiment.hpp

diff --git a/src/core/comm/DatabaseCampaignMessage.proto.in b/src/core/comm/DatabaseCampaignMessage.proto.in
index 0e9f3bd6..2044ea1c 100644
--- a/src/core/comm/DatabaseCampaignMessage.proto.in
+++ b/src/core/comm/DatabaseCampaignMessage.proto.in
@@ -21,4 +21,12 @@ message DatabaseCampaignMessage {
 	required string benchmark = 9 [(sql_ignore) = true];
 
 	required InjectionPointMessage injection_point = 10 [(sql_ignore) = true];
-}
\ No newline at end of file
+}
+
+// Per-bit result record; filled in by DatabaseExperiment::run().
+message DatabaseExperimentMessage {
+	// Bit index within the injected memory area.
+	required uint32 bitoffset = 1 [(sql_primary_key) = true];
+	// Value of the affected byte before the bit was flipped.
+	required uint32 original_value = 2;
+}
diff --git a/src/core/efw/CMakeLists.txt b/src/core/efw/CMakeLists.txt
index 73f440ab..b13748e4 100644
--- a/src/core/efw/CMakeLists.txt
+++ b/src/core/efw/CMakeLists.txt
@@ -4,6 +4,8 @@ set(SRCS
 	ExperimentFlow.hpp
 	JobClient.hpp
 	JobClient.cc
+	DatabaseExperiment.hpp
+	DatabaseExperiment.cc
 )
 
 add_library(fail-efw ${SRCS})
diff --git a/src/core/efw/DatabaseExperiment.cc b/src/core/efw/DatabaseExperiment.cc
new file mode 100644
index 00000000..24c9234e
--- /dev/null
+++ b/src/core/efw/DatabaseExperiment.cc
@@ -0,0 +1,214 @@
+#include <iostream>
+#include <iomanip>
+
+#include <cassert>
+#include <string>
+#include "sal/SALConfig.hpp"
+#include "sal/Memory.hpp"
+#include "sal/Listener.hpp"
+#include "efw/DatabaseExperiment.hpp"
+#include "comm/DatabaseCampaignMessage.pb.h"
+#include "sal/bochs/BochsListener.hpp"
+#include <google/protobuf/descriptor.h>
+#include <google/protobuf/message.h>
+
+using namespace std;
+using namespace fail;
+using namespace google::protobuf;
+
+// Check if configuration dependencies are satisfied:
+#if !defined(CONFIG_EVENT_BREAKPOINTS) || !defined(CONFIG_SR_RESTORE)
+	#error This experiment needs: breakpoints, restore. Enable these in the configuration.
+#endif
+
+DatabaseExperiment::~DatabaseExperiment() {
+	delete this->m_jc;
+}
+
+/**
+ * Flips one bit in simulated memory and logs the old and new byte value.
+ *
+ * @param data_address base address of the injected memory area
+ * @param bitpos bit index relative to \c data_address; indices >= 8 select
+ *        one of the following bytes (byte bitpos / 8, bit bitpos % 8)
+ * @return value of the affected byte before the flip
+ */
+unsigned DatabaseExperiment::injectBitFlip(address_t data_address, unsigned bitpos){
+	// run() iterates bitpos over data_width * 8 bits. Using the raw index
+	// as shift count would place the mask outside of the byte that is read
+	// and written back (and is undefined for bitpos >= 32), so split it
+	// into a byte offset and a bit within that byte.
+	const address_t byte_address = data_address + bitpos / 8;
+	const unsigned bit_in_byte = bitpos % 8;
+
+	const unsigned value = m_mm.getByte(byte_address);
+	const unsigned injectedval = value ^ (1u << bit_in_byte);
+	m_mm.setByte(byte_address, injectedval);
+
+	m_log << "INJECTION at: 0x" << hex << setw(2) << setfill('0') << byte_address
+		<< " value: 0x" << setw(2) << setfill('0') << value << " -> 0x"
+		<< setw(2) << setfill('0') << (unsigned) m_mm.getByte(byte_address) << endl;
+
+	return value;
+}
+
+/**
+ * Searches the fields of \c msg for the first one whose message type is the
+ * generated protobuf type called \c name.
+ *
+ * @tparam T C++ class generated for the message type \c name
+ * @param msg message whose fields are inspected
+ * @param name protobuf type name to look for
+ * @return pointer to the mutable submessage, or 0 if no field matches
+ */
+template <typename T>
+T * protobufFindSubmessageByTypename(Message *msg, const std::string &name) {
+	T * submessage = 0;
+	const Descriptor *msg_type = msg->GetDescriptor();
+	const Message::Reflection *ref = msg->GetReflection();
+	const Descriptor *database_desc =
+		DescriptorPool::generated_pool()->FindMessageTypeByName(name);
+	assert(database_desc != 0);
+
+	const int count = msg_type->field_count();
+
+	for (int i = 0; i < count; i++) {
+		const FieldDescriptor *field = msg_type->field(i);
+		assert(field != 0);
+		if (field->message_type() == database_desc) {
+			submessage = dynamic_cast<T*>(ref->MutableMessage(msg, field));
+			assert(submessage != 0);
+			break;
+		}
+	}
+	return submessage;
+}
+
+/**
+ * Main experiment loop: fetches jobs from the job server and, for every bit
+ * of the injected memory area, restores the saved state, fast-forwards to
+ * the injection instruction, flips the bit and resumes until the subclass
+ * callbacks report the end of the experiment.
+ *
+ * @return always \c false; the simulator is terminated before returning
+ */
+bool DatabaseExperiment::run()
+{
+	m_log << "STARTING EXPERIMENT" << endl;
+
+	if (!this->cb_start_experiment()) {
+		m_log << "Initialization failed. Exiting." << endl;
+		simulator.terminate(1);
+	}
+
+	unsigned executed_jobs = 0;
+
+	while (executed_jobs < 25 || m_jc->getNumberOfUndoneJobs() > 0) {
+		m_log << "asking jobserver for parameters" << endl;
+		ExperimentData * param = this->cb_allocate_experiment_data();
+		if (!m_jc->getParam(*param)){
+			m_log << "Dying." << endl; // We were told to die.
+			simulator.terminate(1);
+		}
+		m_current_param = param;
+
+		DatabaseCampaignMessage * fsppilot =
+			protobufFindSubmessageByTypename<DatabaseCampaignMessage>(&param->getMessage(), "DatabaseCampaignMessage");
+		assert (fsppilot != 0);
+
+		unsigned injection_instr = fsppilot->injection_instr();
+		address_t data_address = fsppilot->data_address();
+		unsigned width = fsppilot->data_width();
+
+		for (unsigned bit_offset = 0; bit_offset < width * 8; ++bit_offset) {
+			// 8 results in one job
+			Message *outer_result = cb_new_result(param);
+			m_current_result = outer_result;
+			DatabaseExperimentMessage *result =
+				protobufFindSubmessageByTypename<DatabaseExperimentMessage>(outer_result, "DatabaseExperimentMessage");
+			// The lookup returns 0 if the subclass' result message has
+			// no DatabaseExperimentMessage field.
+			assert (result != 0);
+			result->set_bitoffset(bit_offset);
+			m_log << "restoring state" << endl;
+			// Restore to the image, which starts at address(main)
+			simulator.restore(cb_state_directory());
+			executed_jobs ++;
+
+			m_log << "Trying to inject @ instr #" << dec << injection_instr << endl;
+
+			simulator.clearListeners();
+
+			// Generate an experiment listener, that matches on any IP
+			// event. It is used to forward to the injection
+			// point. The +1 is needed, since even for the zeroth
+			// dynamic instruction we need at least one breakpoint
+			// event.
+			BPSingleListener bp;
+			bp.setWatchInstructionPointer(ANY_ADDR);
+			bp.setCounter(injection_instr + 1);
+			simulator.addListener(&bp);
+
+			if (!this->cb_before_fast_forward()) {
+				continue;
+			}
+			fail::BaseListener * listener;
+			while (true) {
+				listener = simulator.resume();
+				if (listener == &bp) {
+					break;
+				} else {
+					bool should_continue = this->cb_during_fast_forward(listener);
+					if (!should_continue)
+						break; // Stop fast forwarding
+				}
+			}
+			if (!this->cb_after_fast_forward(listener)) {
+				continue; // Continue to next injection experiment
+			}
+
+			address_t injection_instr_absolute = fsppilot->injection_instr_absolute();
+			// Must start out false: it is only ever set to true below.
+			bool found_eip = false;
+			for (int i = 0; i < BX_SMP_PROCESSORS; i++) {
+				address_t eip = simulator.getCPU(i).getInstructionPointer();
+				if (eip == injection_instr_absolute) {
+					found_eip = true;
+				}
+			}
+			if (!found_eip) {
+				// The stream was switched to decimal above; print the
+				// address in hex to match the "0x" prefix.
+				m_log << "Invalid Injection address != 0x" << hex << injection_instr_absolute << std::endl;
+				simulator.terminate(1);
+			}
+
+			simulator.clearListeners();
+
+			/// INJECT BITFLIP:
+			result->set_original_value(injectBitFlip(data_address, bit_offset));
+
+			if (!this->cb_before_resume()) {
+				continue; // Continue to next experiment
+			}
+
+			m_log << "Resuming till the crash" << std::endl;
+			// resume and wait for results
+			while (true) {
+				listener = simulator.resume();
+				bool should_continue = this->cb_during_resume(listener);
+				if (!should_continue)
+					break;
+			}
+			m_log << "Resume done" << std::endl;
+			this->cb_after_resume(listener);
+
+			simulator.clearListeners();
+		}
+		m_jc->sendResult(*param);
+		this->cb_free_experiment_data(param);
+	}
+	// Explicitly terminate, or the simulator will continue to run.
+	simulator.terminate();
+	return false;
+}
diff --git a/src/core/efw/DatabaseExperiment.hpp b/src/core/efw/DatabaseExperiment.hpp
new file mode 100644
index 00000000..f60ddf9a
--- /dev/null
+++ b/src/core/efw/DatabaseExperiment.hpp
@@ -0,0 +1,165 @@
+#ifndef __DATABASE_EXPERIMENT_HPP__
+#define __DATABASE_EXPERIMENT_HPP__
+
+#include <google/protobuf/message.h>
+#include "efw/ExperimentFlow.hpp"
+#include "efw/JobClient.hpp"
+#include "util/Logger.hpp"
+#include <string>
+#include <stdlib.h>
+
+namespace fail {
+class ExperimentData;
+
+/**
+ * Base class for experiments that fetch their jobs from the campaign
+ * server. run() performs the fast forward to the injection point, the
+ * bit flip and the resume phase, and hands control to the subclass
+ * through the cb_* callbacks.
+ */
+class DatabaseExperiment : public fail::ExperimentFlow {
+	fail::JobClient *m_jc;
+
+	unsigned injectBitFlip(fail::address_t data_address, unsigned bitpos);
+
+	/**
+	   The current experiment data as returned by the job client. This
+	   is allocated by cb_allocate_experiment_data()
+	*/
+	ExperimentData *m_current_param;
+	/** The result message most recently returned by cb_new_result() */
+	google::protobuf::Message *m_current_result;
+
+public:
+	DatabaseExperiment(const std::string &name)
+		: m_jc(0), m_current_param(0), m_current_result(0),
+		  m_log(name, false), m_mm(fail::simulator.getMemoryManager()) {
+
+		/* The fail server can be set with an environment variable,
+		   otherwise the JOBSERVER configured by cmake is used */
+		char *server_host = getenv("FAIL_SERVER_HOST");
+		if (server_host != NULL){
+			this->m_jc = new fail::JobClient(std::string(server_host));
+		} else {
+			this->m_jc = new fail::JobClient();
+		}
+	}
+
+	virtual ~DatabaseExperiment();
+
+	bool run();
+
+
+protected:
+	fail::Logger m_log;
+	fail::MemoryManager& m_mm;
+
+	/** Returns the currently running experiment message as returned
+	 * by the job client
+	 */
+	ExperimentData * get_current_experiment_data() { return m_current_param; }
+
+	/** Returns the current result message, that was allocated by
+	 * cb_new_result().
+	 */
+	google::protobuf::Message * get_current_result() { return m_current_result; }
+
+
+	//////////////////////////////////////////////////////////////////
+	// Can be overwritten by experiment
+	//////////////////////////////////////////////////////////////////
+
+	/**
+	 * Get path to the state directory
+	 */
+	virtual std::string cb_state_directory() { return "state"; }
+
+	/**
+	 * Callback that is called, before the actual experiment
+	 * starts. Simulation is terminated on false.
+	 * @return \c true on success, \c false otherwise
+	 */
+	virtual bool cb_start_experiment() { return true; }
+
+	/**
+	 * Allocate enough space to hold the incoming ExperimentData
+	 * message. It can be accessed during the experiment through
+	 * get_current_experiment_data()
+	 */
+	virtual ExperimentData* cb_allocate_experiment_data() = 0;
+
+	/**
+	 * Release the experiment data obtained from
+	 * cb_allocate_experiment_data(). Called by run() after the results
+	 * of a job were sent. The default implementation does nothing.
+	 */
+	virtual void cb_free_experiment_data(ExperimentData *) {}
+
+
+	/**
+	 * Allocate a new result slot in the given experiment data. The
+	 * returned pointer can be obtained by calling
+	 * get_current_result(). run() expects the message to contain a
+	 * DatabaseExperimentMessage field, which it fills in.
+	 */
+	virtual google::protobuf::Message* cb_new_result(ExperimentData*) = 0;
+
+	/**
+	 * Callback that is called before the fast forward is done. This
+	 * can be used to add additional event listeners during the fast
+	 * forward phase. If returning false, the experiment is canceled.
+	 * @return \c true on success, \c false otherwise
+	 */
+	virtual bool cb_before_fast_forward() { return true; }
+
+	/**
+	 * Callback that is called during the fast forward, when an event
+	 * has triggered, but it was not the fast forward listener. This
+	 * can be used to collect additional information during the fast
+	 * forward. If returning false, the fast forwarding is stopped.
+	 *
+	 * @return \c true to continue the fast forward, \c false to stop it
+	 */
+	virtual bool cb_during_fast_forward(fail::BaseListener *) { return false; }
+
+	/**
+	 * Callback that is called after the fast forward, with the last
+	 * triggered event. If returning false, the experiment is
+	 * canceled.
+	 *
+	 * @return \c true on success, \c false otherwise
+	 */
+	virtual bool cb_after_fast_forward(fail::BaseListener *) { return true; }
+
+	/**
+	 * Callback that is called before the resuming till crash has
+	 * started. This is called after the fault was injected. Here the
+	 * end listeners should be installed. Returns true on
+	 * success. Otherwise the experiment is canceled.
+	 *
+	 * @return \c true on success, \c false otherwise
+	 */
+	virtual bool cb_before_resume() = 0;
+
+	/**
+	 * Callback that is called during the resume-till-crash phase,
+	 * when an event has triggered. This can be used to collect
+	 * additional information during the resuming phase. If returning
+	 * false, the resuming has finished and the experiment has stopped.
+	 *
+	 * @return \c true to continue resuming, \c false to stop
+	 */
+	virtual bool cb_during_resume(fail::BaseListener *) { return false; }
+
+	/**
+	 * Callback that is called after the resume-till-crash phase with
+	 * the last triggered listener. This callback should collect all
+	 * data of the experiment run, e.g. in the message returned by
+	 * get_current_result().
+	 */
+	virtual void cb_after_resume(fail::BaseListener *) = 0;
+};
+
+}
+
+#endif // __DATABASE_EXPERIMENT_HPP__