From 882d4f381b8af72d8859b7d849c241d9578e24ff Mon Sep 17 00:00:00 2001 From: Horst Schirmeier Date: Thu, 16 Jan 2014 17:54:50 +0100 Subject: [PATCH] jobclient: bugfix: faster shutdown at campaign end The JobClient currently waits a LONG time until it really shuts down after not having reached the server in sendResultsToServer() (which is unfortunately by far the most probable point in the code to determine this): - A different bug (fixed in the previous commit) provoked the situation that (way) too many jobs were fetched beforehand. - sendResult() (called after each experiment iteration) realized that CLIENT_JOB_REQUEST_SEC seconds had elapsed, and tried to prematurely call home to send the first results (without planning to get new jobs yet). - If the server was gone (done, or aborted), the connect in sendResultsToServer() failed after several retries and timeouts. - All subsequent calls to sendResult() retried connecting to the server (again, with retries and timeouts), once for each remaining job. - When all jobs were done, getParam() tried to connect a last time, finally telling the experiment that nobody's home. This resulted in client shutdown times of up to four hours (for the default CLIENT_JOB_LIMIT of 1000) after the campaign server terminated. This change solves the issue by not handing out new (cached) jobs after the connect failed once, making the experiment terminate quickly. 
Change-Id: I0d8cb2e084d783aca74c51a503fa72eb2b2eb0b7 --- src/core/efw/JobClient.cc | 24 ++++++++++++++++++++++-- src/core/efw/JobClient.hpp | 2 ++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/core/efw/JobClient.cc b/src/core/efw/JobClient.cc index 2bb89f80..34732338 100644 --- a/src/core/efw/JobClient.cc +++ b/src/core/efw/JobClient.cc @@ -22,6 +22,7 @@ JobClient::JobClient(const std::string& server, int port) m_job_total = 0; m_job_runtime_total = 0; m_job_throughput = 1; // client gets only one job at the first request + m_connect_failed = false; } JobClient::~JobClient() @@ -32,6 +33,11 @@ JobClient::~JobClient() bool JobClient::connectToServer() { + // don't retry server connects to speedup shutdown at campaign end + if (m_connect_failed) { + return false; + } + int retries = CLIENT_RETRY_COUNT; while (true) { // Connect to server @@ -69,6 +75,7 @@ bool JobClient::connectToServer() cout << "[Client] Unable to reconnect (tried " << CLIENT_RETRY_COUNT << " times); " << "I'll give it up!" << endl; // TODO: Log-level? + m_connect_failed = true; return false; // finally: unable to connect, give it up :-( } break; // connected! 
:-) @@ -81,6 +88,11 @@ bool JobClient::connectToServer() bool JobClient::getParam(ExperimentData& exp) { + // die immediately if a previous connect already failed + if (m_connect_failed) { + return false; + } + while (1) { // Here we try to acquire a parameter set switch (tryToGetExperimentData(exp)) { // Jobserver will sent workload, params are set in \c exp @@ -190,10 +202,10 @@ bool JobClient::sendResult(ExperimentData& result) m_job_runtime.reset(); m_job_runtime.startTimer(); m_job_total += m_results.size(); - sendResultsToServer(); + // tell caller whether we failed phoning home + return sendResultsToServer(); } - //If there are more jobs for the experiment store result return true; } else { //Stop time measurement and calculate new throughput @@ -221,6 +233,14 @@ bool JobClient::sendResultsToServer() { if (m_results.size() != 0) { if (!connectToServer()) { + // clear results, although we didn't get them to safety; otherwise, + // subsequent calls to sendResult() may and the destructor will + // retry sending them, resulting in a large shutdown time + while (m_results.size()) { + delete &m_results.front()->getMessage(); + delete m_results.front(); + m_results.pop_front(); + } return false; } diff --git a/src/core/efw/JobClient.hpp b/src/core/efw/JobClient.hpp index de4153c5..f0d03157 100644 --- a/src/core/efw/JobClient.hpp +++ b/src/core/efw/JobClient.hpp @@ -37,6 +37,8 @@ private: std::deque m_parameters; std::deque m_results; + bool m_connect_failed; + bool connectToServer(); bool sendResultsToServer(); FailControlMessage_Command tryToGetExperimentData(ExperimentData& exp);