jobserver: bugfix: potential race

Delay insertion of to-be-sent jobs into m_runningJobs until they are
really sent, as getMessage() won't work anymore (as in: segfault) if
this job is concurrently re-sent (due to campaign end), its result is
received, and deleted in the campaign.  This becomes non-hypothetical
with larger values for CLIENT_JOB_LIMIT and CLIENT_JOB_REQUEST_SEC.

Additionally, reinsert the remaining jobs into the input queue if
communication fails, instead of inefficiently delaying redistribution
until the campaign end.

Change-Id: If85e3c8261deda86beb8d4d93343429223753f22
This commit is contained in:
Horst Schirmeier
2014-01-20 22:21:40 +01:00
parent 128b54b045
commit 1f6e275e5e

View File

@ -266,10 +266,6 @@ void CommThread::sendPendingExperimentData(Minion& minion)
} else {
break;
}
if (!m_js.m_runningJobs.insert(workloadID, temp_exp)) {
cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl;
}
}
if (exp.size() != 0) {
ctrlmsg.set_job_size(exp.size());
@ -280,8 +276,22 @@ void CommThread::sendPendingExperimentData(Minion& minion)
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) {
for (i = 0; i < ctrlmsg.job_size(); i++) {
if (SocketComm::sendMsg(minion.getSocketDescriptor(), exp.front()->getMessage())) {
// delay insertion into m_runningJobs until here, as
// getMessage() won't work anymore if this job is re-sent,
// received, and deleted in the meantime
if (!m_js.m_runningJobs.insert(exp.front()->getWorkloadID(), exp.front())) {
cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl;
}
exp.pop_front();
} else {
// add remaining jobs back to the queue
cout << "!![Server] failed to send scheduled " << exp.size() << " jobs" << endl;
while (exp.size()) {
m_js.m_undoneJobs.Enqueue(exp.front());
exp.pop_front();
}
break;
}