core/efw: send back results earlier
The client now sends results back early (i.e., before all jobs are done) once the client response time (CLIENT_JOB_REQUEST_SEC) is exceeded. This ensures that extraordinarily long-running experiments are reported back before, e.g., the LIDO job timeout kills the Fail* instance.
Change-Id: I3ada0360ec54b63f80a7008570ca514449720220
This commit is contained in:
@ -182,12 +182,23 @@ bool JobClient::sendResult(ExperimentData& result)
|
|||||||
m_results.push_back( temp_exp );
|
m_results.push_back( temp_exp );
|
||||||
|
|
||||||
if (m_parameters.size() != 0) {
|
if (m_parameters.size() != 0) {
|
||||||
|
//If job request time is over send back all existing results
|
||||||
|
if (CLIENT_JOB_REQUEST_SEC < (double)m_job_runtime) {
|
||||||
|
m_job_runtime_total += (double) m_job_runtime;
|
||||||
|
m_job_runtime.reset();
|
||||||
|
m_job_runtime.startTimer();
|
||||||
|
m_job_total += m_results.size();
|
||||||
|
sendResultsToServer();
|
||||||
|
}
|
||||||
|
|
||||||
//If there are more jobs for the experiment store result
|
//If there are more jobs for the experiment store result
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
//Stop time measurement and calculate new throughput
|
//Stop time measurement and calculate new throughput
|
||||||
m_job_runtime.stopTimer();
|
m_job_runtime.stopTimer();
|
||||||
m_job_throughput = 0.5 * m_job_throughput + 0.5*(CLIENT_JOB_REQUEST_SEC/((double)m_job_runtime/m_results.size()));
|
m_job_runtime_total += (double) m_job_runtime;
|
||||||
|
m_job_total += m_results.size();
|
||||||
|
m_job_throughput = 0.5 * m_job_throughput + 0.5*(CLIENT_JOB_REQUEST_SEC/(m_job_runtime_total/m_job_total));
|
||||||
|
|
||||||
if (m_job_throughput > CLIENT_JOB_LIMIT) {
|
if (m_job_throughput > CLIENT_JOB_LIMIT) {
|
||||||
m_job_throughput = CLIENT_JOB_LIMIT;
|
m_job_throughput = CLIENT_JOB_LIMIT;
|
||||||
@ -195,8 +206,10 @@ bool JobClient::sendResult(ExperimentData& result)
|
|||||||
m_job_throughput = 1;
|
m_job_throughput = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
//Reset timer for new time measurement
|
//Timer/Counter cleanup
|
||||||
m_job_runtime.reset();
|
m_job_runtime.reset();
|
||||||
|
m_job_runtime_total = 0;
|
||||||
|
m_job_total = 0;
|
||||||
|
|
||||||
return sendResultsToServer();
|
return sendResultsToServer();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -31,7 +31,9 @@ private:
|
|||||||
uint64_t m_server_runid;
|
uint64_t m_server_runid;
|
||||||
|
|
||||||
WallclockTimer m_job_runtime;
|
WallclockTimer m_job_runtime;
|
||||||
|
double m_job_runtime_total;
|
||||||
int m_job_throughput;
|
int m_job_throughput;
|
||||||
|
int m_job_total;
|
||||||
std::deque<ExperimentData*> m_parameters;
|
std::deque<ExperimentData*> m_parameters;
|
||||||
std::deque<ExperimentData*> m_results;
|
std::deque<ExperimentData*> m_results;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user