diff --git a/core/config/CMakeLists.txt b/core/config/CMakeLists.txt index 1d59508f..0ef6400d 100644 --- a/core/config/CMakeLists.txt +++ b/core/config/CMakeLists.txt @@ -16,7 +16,13 @@ OPTION(CONFIG_STFU "Misc: Reduced verbosity" OFF) OPTION(CONFIG_SUPPRESS_INTERRUPTS "Target backend: Suppress interrupts" OFF) OPTION(CONFIG_FIRE_INTERRUPTS "Target backend: Fire interrupts" OFF) OPTION(CONFIG_DISABLE_KEYB_INTERRUPTS "Target backend: Suppress keyboard interrupts" OFF) -OPTION(CONFIG_FI_MEM_ACCESS_BITFLIP "deprecated something" OFF) +OPTION(CONFIG_FI_MEM_ACCESS_BITFLIP "Deprecated something" OFF) +OPTION(SERVER_PERFORMANCE_MEASURE "Performance measurement in job-server" OFF) +SET(SERVER_PERF_LOG_PATH "perf.log" CACHE STRING "A file name for storing the server's performance log (tab-separated values)") +SET(SERVER_PERF_STEPPING_SEC "1" CACHE STRING "Stepping of performance measurements in seconds") +SET(CLIENT_RAND_BACKOFF_TSTART "3" CACHE STRING "Lower limit of client's backoff phase in seconds") +SET(CLIENT_RAND_BACKOFF_TEND "8" CACHE STRING "Upper limit of client's backoff phase in seconds") +SET(CLIENT_RETRY_COUNT "3" CACHE STRING "Client's number of reconnect-retries") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/FailConfig.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/FailConfig.hpp) diff --git a/core/config/FailConfig.hpp.in b/core/config/FailConfig.hpp.in index 553bb925..c436c652 100644 --- a/core/config/FailConfig.hpp.in +++ b/core/config/FailConfig.hpp.in @@ -19,11 +19,17 @@ #cmakedefine CONFIG_SR_SAVE #cmakedefine CONFIG_SR_REBOOT -// Miscellaneous +// Fail configuration #cmakedefine CONFIG_STFU #cmakedefine CONFIG_SUPPRESS_INTERRUPTS #cmakedefine CONFIG_FIRE_INTERRUPTS #cmakedefine CONFIG_DISABLE_KEYB_INTERRUPTS +#cmakedefine SERVER_PERFORMANCE_MEASURE +#cmakedefine SERVER_PERF_LOG_PATH "@SERVER_PERF_LOG_PATH@" +#cmakedefine SERVER_PERF_STEPPING_SEC @SERVER_PERF_STEPPING_SEC@ +#define CLIENT_RAND_BACKOFF_TSTART @CLIENT_RAND_BACKOFF_TSTART@ +#define CLIENT_RAND_BACKOFF_TEND 
@CLIENT_RAND_BACKOFF_TEND@ +#define CLIENT_RETRY_COUNT @CLIENT_RETRY_COUNT@ // Fault injection #cmakedefine CONFIG_FI_MEM_ACCESS_BITFLIP diff --git a/core/jobserver/JobClient.cc b/core/jobserver/JobClient.cc index df4f763f..3801086e 100644 --- a/core/jobserver/JobClient.cc +++ b/core/jobserver/JobClient.cc @@ -34,20 +34,20 @@ bool JobClient::connectToServer() memcpy(&serv_addr.sin_addr.s_addr, m_server_ent->h_addr, m_server_ent->h_length); serv_addr.sin_port = htons(m_server_port); - int retries = RETRY_COUNT; + int retries = CLIENT_RETRY_COUNT; while(true) { if(connect(m_sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { perror("[Client@connect()]"); if(retries > 0) { - // Wait RAND_BACKOFF_TSTART to RAND_BACKOFF_TEND seconds: - int delay = rand() % (RAND_BACKOFF_TEND-RAND_BACKOFF_TSTART) + RAND_BACKOFF_TSTART; + // Wait CLIENT_RAND_BACKOFF_TSTART to CLIENT_RAND_BACKOFF_TEND seconds: + int delay = CLIENT_RAND_BACKOFF_TSTART + ((CLIENT_RAND_BACKOFF_TEND > CLIENT_RAND_BACKOFF_TSTART) ? rand() % (CLIENT_RAND_BACKOFF_TEND-CLIENT_RAND_BACKOFF_TSTART) : 0); cout << "[Client] Retrying to connect to server in ~" << delay << "s..." << endl; sleep(delay); usleep(rand() % 1000000); --retries; continue; } - cout << "[Client] Unable to reconnect (tried " << RETRY_COUNT << " times); " + cout << "[Client] Unable to reconnect (tried " << CLIENT_RETRY_COUNT << " times); " << "I'll give it up!" << endl; return false; // finally: unable to connect, give it up :-( } diff --git a/core/jobserver/JobClient.hpp b/core/jobserver/JobClient.hpp index e17f0df7..67dfa8b1 100644 --- a/core/jobserver/JobClient.hpp +++ b/core/jobserver/JobClient.hpp @@ -1,26 +1,16 @@ -/** - * \brief The Minion's JobClient requests ExperimentData and returns results. 
- * - * \author Martin Hoffmann - */ - - #ifndef __JOB_CLIENT_H__ -#define __JOB_CLIENT_H__ + #define __JOB_CLIENT_H__ #include #include #include #include #include + #include "SocketComm.hpp" #include "controller/ExperimentData.hpp" #include "jobserver/messagedefs/FailControlMessage.pb.h" - -// FIXME This should be part of a "client config" (?). -#define RAND_BACKOFF_TSTART 3 -#define RAND_BACKOFF_TEND 8 -#define RETRY_COUNT 3 +#include "config/FailConfig.hpp" namespace fi { @@ -28,6 +18,7 @@ namespace fi { * \class JobClient * * \brief Manages communication with JobServer + * The Minion's JobClient requests ExperimentData and returns results. * */ class JobClient { @@ -63,10 +54,6 @@ namespace fi { bool sendResult(ExperimentData& result); }; - - - } - -#endif +#endif // __JOB_CLIENT_H__ diff --git a/core/jobserver/JobServer.cc b/core/jobserver/JobServer.cc index 8bd34580..20838e41 100644 --- a/core/jobserver/JobServer.cc +++ b/core/jobserver/JobServer.cc @@ -1,6 +1,3 @@ -// Author: Martin Hoffmann, Richard Hellwig, Adrian Böckenkamp -// Date: 07.10.11 - // needs to be included before *.pb.h, otherwise ac++/Puma chokes on the latter #include @@ -17,6 +14,7 @@ #include "jobserver/messagedefs/FailControlMessage.pb.h" #include "SocketComm.hpp" #include "controller/Minion.hpp" + #ifndef __puma #include #include @@ -32,7 +30,9 @@ void JobServer::addParam(ExperimentData* exp){ #endif } +#ifdef SERVER_PERFORMANCE_MEASURE volatile unsigned JobServer::m_DoneCount = 0; +#endif ExperimentData *JobServer::getDone() { @@ -58,12 +58,12 @@ ExperimentData *JobServer::getDone() #ifdef SERVER_PERFORMANCE_MEASURE void JobServer::measure() { - cout << "\n[Server] Logging throughput in \"" << PERFORMANCE_LOG_PATH << "\"..." << endl; - ofstream m_file(PERFORMANCE_LOG_PATH, std::ios::trunc); // overwrite existing perf-logs + cout << "\n[Server] Logging throughput in \"" << SERVER_PERF_LOG_PATH << "\"..." 
<< endl; + ofstream m_file(SERVER_PERF_LOG_PATH, std::ios::trunc); // overwrite existing perf-logs if(!m_file.is_open()) { cerr << "[Server] Perf-logging has been enabled" << "but I was not able to write the log-file \"" - << PERFORMANCE_LOG_PATH << "\"." << endl; + << SERVER_PERF_LOG_PATH << "\"." << endl; exit(1); } unsigned counter = 0; @@ -73,9 +73,9 @@ void JobServer::measure() while(!m_finish) { // Format: 1st column (seconds)[TAB]2nd column (throughput) m_file << counter << "\t" << (m_DoneCount - diff) << endl; - counter += PERFORMANCE_STEPPING_SEC; + counter += SERVER_PERF_STEPPING_SEC; diff = m_DoneCount; - sleep(PERFORMANCE_STEPPING_SEC); + sleep(SERVER_PERF_STEPPING_SEC); } // NOTE: Summing up the values written in the 2nd column does not // necessarily yield the number of completed experiments/jobs diff --git a/core/jobserver/JobServer.hpp b/core/jobserver/JobServer.hpp index f09daa2e..95bbfee2 100644 --- a/core/jobserver/JobServer.hpp +++ b/core/jobserver/JobServer.hpp @@ -1,39 +1,28 @@ -/** - * \brief The JobServer supplies the Minions with ExperimentData's - * and receives the result data. - * - * \author Martin Hoffmann, Richard Hellwig, Adrian Böckenkamp - */ - - #ifndef __JOB_SERVER_H__ -#define __JOB_SERVER_H__ + #define __JOB_SERVER_H__ #include "controller/Minion.hpp" #include "util/SynchronizedQueue.hpp" #include "util/SynchronizedCounter.hpp" #include "util/SynchronizedMap.hpp" +#include "config/FailConfig.hpp" #include #ifndef __puma #include #endif -// TODO: This should be part of a "server-config". -#define SERVER_PERFORMANCE_MEASURE -#define PERFORMANCE_LOG_PATH "perf.dat" -#define PERFORMANCE_STEPPING_SEC 1 - namespace fi { class CommThread; /** * \class JobServer - * Manages the campaigns parameter distributions. - * The Campaign Controller can add experiment parameter sets, - * which the Jobserver will distribute to requesting clients. - * The campaign controller can wait for all results, or a timeout. 
+ * The server supplies the Minions with ExperimentData and receives the result data. + * + * Manages the campaign's parameter distribution. The campaign controller can add + * experiment parameter sets, which the JobServer will distribute to requesting + * clients. The campaign controller can wait for all results or for a timeout. */ class JobServer {