Job-resend mechanism for JobServer added.
git-svn-id: https://www4.informatik.uni-erlangen.de/i4svn/danceos/trunk/devel/fail@1060 8c4709b5-6ec9-48aa-a5cd-a96041d1645a
This commit is contained in:
@ -1,4 +1,4 @@
|
|||||||
// Author: Martin Hoffmann, Richard Hellwig
|
// Author: Martin Hoffmann, Richard Hellwig, Adrian Böckenkamp
|
||||||
// Date: 07.10.11
|
// Date: 07.10.11
|
||||||
|
|
||||||
// <iostream> needs to be included before *.pb.h, otherwise ac++/Puma chokes on the latter
|
// <iostream> needs to be included before *.pb.h, otherwise ac++/Puma chokes on the latter
|
||||||
@ -26,7 +26,6 @@ using namespace std;
|
|||||||
|
|
||||||
namespace fi {
|
namespace fi {
|
||||||
|
|
||||||
|
|
||||||
void JobServer::addParam(ExperimentData* exp){
|
void JobServer::addParam(ExperimentData* exp){
|
||||||
#ifndef __puma
|
#ifndef __puma
|
||||||
m_undoneJobs.Enqueue(exp);
|
m_undoneJobs.Enqueue(exp);
|
||||||
@ -151,9 +150,10 @@ void JobServer::run(){
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Communication thread implementation
|
|
||||||
void CommThread::operator()()
|
void CommThread::operator()()
|
||||||
{
|
{
|
||||||
|
// The communication thread implementation:
|
||||||
|
|
||||||
Minion minion;
|
Minion minion;
|
||||||
FailControlMessage ctrlmsg;
|
FailControlMessage ctrlmsg;
|
||||||
minion.setSocketDescriptor(m_sock);
|
minion.setSocketDescriptor(m_sock);
|
||||||
@ -186,52 +186,88 @@ void CommThread::operator()()
|
|||||||
close(m_sock);
|
close(m_sock);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CommThread::sendPendingExperimentData(Minion& minion)
|
#ifndef __puma
|
||||||
|
boost::mutex CommThread::m_CommMutex;
|
||||||
|
#endif // __puma
|
||||||
|
|
||||||
|
void CommThread::sendPendingExperimentData(Minion& minion)
|
||||||
{
|
{
|
||||||
FailControlMessage ctrlmsg;
|
FailControlMessage ctrlmsg;
|
||||||
ctrlmsg.set_build_id(42);
|
ctrlmsg.set_build_id(42);
|
||||||
ExperimentData * exp = 0;
|
ExperimentData * exp = 0;
|
||||||
if(m_js.m_undoneJobs.Dequeue_nb(exp) == true){
|
if(m_js.m_undoneJobs.Dequeue_nb(exp) == true) {
|
||||||
// Got an element from queue, assign ID to workload and send to minion
|
// Got an element from queue, assign ID to workload and send to minion
|
||||||
uint32_t workloadID = m_js.m_counter.increment(); // increment workload counter
|
uint32_t workloadID = m_js.m_counter.increment(); // increment workload counter
|
||||||
exp->setWorkloadID(workloadID); // store ID for identification when receiving result
|
exp->setWorkloadID(workloadID); // store ID for identification when receiving result
|
||||||
if(!m_js.m_runningJobs.insert(workloadID, exp)){
|
if(!m_js.m_runningJobs.insert(workloadID, exp)) {
|
||||||
cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl;
|
cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl;
|
||||||
}
|
}
|
||||||
ctrlmsg.set_command(FailControlMessage_Command_WORK_FOLLOWS);
|
ctrlmsg.set_command(FailControlMessage_Command_WORK_FOLLOWS);
|
||||||
ctrlmsg.set_workloadid(workloadID); // set workload id
|
ctrlmsg.set_workloadid(workloadID); // set workload id
|
||||||
//cout << ">>[Server] Sending workload [" << workloadID << "]" << endl;
|
//cout << ">>[Server] Sending workload [" << workloadID << "]" << endl;
|
||||||
cout << ">>[" << workloadID << "] " << flush;
|
cout << ">>[" << workloadID << "] " << flush;
|
||||||
SocketComm::send_msg(minion.getSocketDescriptor(), ctrlmsg);
|
SocketComm::send_msg(minion.getSocketDescriptor(), ctrlmsg);
|
||||||
SocketComm::send_msg(minion.getSocketDescriptor(), exp->getMessage());
|
SocketComm::send_msg(minion.getSocketDescriptor(), exp->getMessage());
|
||||||
}else if( m_js.noMoreExperiments() == false ){
|
return;
|
||||||
// Currently we have no workload, but the campaign is not over yet. Minion can try again later
|
}
|
||||||
ctrlmsg.set_command(FailControlMessage_Command_COME_AGAIN);
|
|
||||||
SocketComm::send_msg(minion.getSocketDescriptor(), ctrlmsg);
|
#ifndef __puma
|
||||||
cout << "--[Server] No workload, come again..." << endl;
|
boost::unique_lock<boost::mutex> lock(m_CommMutex);
|
||||||
}else{
|
#endif
|
||||||
// No more elements, and campaign is over. Minion can die.
|
if((exp = m_js.m_runningJobs.first()) != NULL) { // 2nd priority
|
||||||
ctrlmsg.set_command(FailControlMessage_Command_DIE);
|
// (This simply gets the first running-job.)
|
||||||
cout << "--[Server] No workload, and no campaign, please die." << endl;
|
// TODO: Improve selection of parameter-set to be resent (the first is not
|
||||||
SocketComm::send_msg(minion.getSocketDescriptor(), ctrlmsg);
|
// necessarily the best...especially when the specific parameter-set
|
||||||
}
|
// causes the experiment-client to terminate abnormally -> endless loop!)
|
||||||
return true;
|
// Further ideas: sequential, random, ...? (+ "retry-counter" for each job)
|
||||||
|
|
||||||
|
// Implement resend of running-parameter sets to improve campaign speed
|
||||||
|
// and to prevent result loss due to (unexpected) termination of experiment
|
||||||
|
// clients.
|
||||||
|
// (Note: Therefore we need to be aware of receiving multiple results for a
|
||||||
|
// single parameter-set, @see receiveExperimentResults.)
|
||||||
|
uint32_t workloadID = exp->getWorkloadID(); // (this ID has been set previously)
|
||||||
|
// Resend the parameter-set.
|
||||||
|
ctrlmsg.set_command(FailControlMessage_Command_WORK_FOLLOWS);
|
||||||
|
ctrlmsg.set_workloadid(workloadID); // set workload id
|
||||||
|
//cout << ">>[Server] Re-sending workload [" << workloadID << "]" << endl;
|
||||||
|
cout << ">>R[" << workloadID << "] " << flush;
|
||||||
|
SocketComm::send_msg(minion.getSocketDescriptor(), ctrlmsg);
|
||||||
|
SocketComm::send_msg(minion.getSocketDescriptor(), exp->getMessage());
|
||||||
|
} else if(m_js.noMoreExperiments() == false) {
|
||||||
|
// Currently we have no workload (even the running-job-queue is empty!), but
|
||||||
|
// the campaign is not over yet. Minion can try again later.
|
||||||
|
ctrlmsg.set_command(FailControlMessage_Command_COME_AGAIN);
|
||||||
|
SocketComm::send_msg(minion.getSocketDescriptor(), ctrlmsg);
|
||||||
|
cout << "--[Server] No workload, come again..." << endl;
|
||||||
|
} else {
|
||||||
|
// No more elements, and campaign is over. Minion can die.
|
||||||
|
ctrlmsg.set_command(FailControlMessage_Command_DIE);
|
||||||
|
cout << "--[Server] No workload, and no campaign, please die." << endl;
|
||||||
|
SocketComm::send_msg(minion.getSocketDescriptor(), ctrlmsg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void CommThread::receiveExperimentResults(Minion& minion, uint32_t workloadID)
|
||||||
bool CommThread::receiveExperimentResults(Minion& minion, uint32_t workloadID)
|
|
||||||
{
|
{
|
||||||
ExperimentData * exp; // Get exp* from running jobs
|
#ifndef __puma
|
||||||
//cout << "<<[Server] Received result for workload id [" << workloadID << "]" << endl;
|
boost::unique_lock<boost::mutex> lock(m_CommMutex);
|
||||||
cout << "<<[" << workloadID << "] " << flush;
|
#endif
|
||||||
if( m_js.m_runningJobs.remove(workloadID, exp) ){ /// ExperimentData* found
|
|
||||||
SocketComm::rcv_msg(minion.getSocketDescriptor(), exp->getMessage() ); /// deserialize results.
|
ExperimentData * exp; // Get exp* from running jobs
|
||||||
m_js.m_doneJobs.Enqueue(exp); /// Put results in done queue..
|
//cout << "<<[Server] Received result for workload id [" << workloadID << "]" << endl;
|
||||||
return true;
|
cout << "<<[" << workloadID << "] " << flush;
|
||||||
}else{
|
if(m_js.m_runningJobs.remove(workloadID, exp)) { // ExperimentData* found
|
||||||
cout << "!![Server] workload id not found in running jobs map :( [" << workloadID << "]" << endl;
|
SocketComm::rcv_msg(minion.getSocketDescriptor(), exp->getMessage() ); // deserialize results.
|
||||||
return false;
|
m_js.m_doneJobs.Enqueue(exp); // Put results in done queue..
|
||||||
}
|
} else {
|
||||||
|
// We can receive several results for the same workload id because
|
||||||
|
// we (may) distribute the (running) jobs to a *few* experiment-clients.
|
||||||
|
cout << "[Server] Received another result for workload id ["
|
||||||
|
<< workloadID << "] -- ignored." << endl;
|
||||||
|
|
||||||
|
// TODO: Any need for error-handling here?
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
/**
|
/**
|
||||||
* \brief The JobServer supplies the Minions with ExperimentData's and receives the result data.
|
* \brief The JobServer supplies the Minions with ExperimentData's
|
||||||
|
* and receives the result data.
|
||||||
*
|
*
|
||||||
* \author Martin Hoffmann, Richard Hellwig
|
* \author Martin Hoffmann, Richard Hellwig, Adrian Böckenkamp
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
@ -65,7 +66,13 @@ public:
|
|||||||
m_serverThread = new boost::thread(&JobServer::run, this); // run operator()() in a thread.
|
m_serverThread = new boost::thread(&JobServer::run, this); // run operator()() in a thread.
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
~JobServer() {}
|
~JobServer()
|
||||||
|
{
|
||||||
|
#ifndef __puma
|
||||||
|
// Cleanup of m_serverThread, etc.
|
||||||
|
delete m_serverThread;
|
||||||
|
#endif // __puma
|
||||||
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -126,32 +133,32 @@ public:
|
|||||||
class CommThread {
|
class CommThread {
|
||||||
int m_sock; //! Socket descriptor of the connection
|
int m_sock; //! Socket descriptor of the connection
|
||||||
JobServer& m_js; //! Calling jobserver
|
JobServer& m_js; //! Calling jobserver
|
||||||
|
#ifndef __puma
|
||||||
|
static boost::mutex m_CommMutex; //! to synchronise the communication
|
||||||
|
#endif // __puma
|
||||||
public:
|
public:
|
||||||
CommThread(int sockfd, JobServer& p) : m_sock(sockfd), m_js(p) {};
|
CommThread(int sockfd, JobServer& p) : m_sock(sockfd), m_js(p) {};
|
||||||
/**
|
/**
|
||||||
* The thread's entry point
|
* The thread's entry point.
|
||||||
*/
|
*/
|
||||||
void operator() ();
|
void operator() ();
|
||||||
private:
|
private:
|
||||||
/// FIXME concerns are not really separated yet ;)
|
/// FIXME concerns are not really separated yet ;)
|
||||||
/**
|
/**
|
||||||
* Called after minion calls for work.
|
* Called after minion calls for work.
|
||||||
* Tries to dequeue a parameter set without blocking, and
|
* Tries to dequeue a parameter set without blocking, and
|
||||||
* sends it back to the requesting minion.
|
* sends it back to the requesting minion.
|
||||||
* @param minion The minion asking for input
|
* @param minion The minion asking for input
|
||||||
* @return FIXME return value not evaluated yet.
|
|
||||||
*/
|
*/
|
||||||
bool sendPendingExperimentData(Minion& minion);
|
void sendPendingExperimentData(Minion& minion);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Called after minion offers a result message.
|
* Called after minion offers a result message.
|
||||||
* Evaluates the Workload ID and puts the corresponding
|
* Evaluates the Workload ID and puts the corresponding
|
||||||
* job result into the result queue.
|
* job result into the result queue.
|
||||||
* @param minion The minion offering results
|
* @param minion The minion offering results
|
||||||
* @param workloadID The workload id of the result message
|
* @param workloadID The workload id of the result message
|
||||||
* @return \c true if Workload ID could be mapped, \c false if not
|
|
||||||
*/
|
*/
|
||||||
bool receiveExperimentResults(Minion& minion, uint32_t workloadID);
|
void receiveExperimentResults(Minion& minion, uint32_t workloadID);
|
||||||
};
|
};
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|||||||
@ -27,6 +27,20 @@ private:
|
|||||||
#endif
|
#endif
|
||||||
return m_map.size();
|
return m_map.size();
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* Retrieves the first element in the map.
|
||||||
|
* @return the value stored in the first map entry, or \c NULL if the map is empty
|
||||||
|
*/
|
||||||
|
Tvalue first()
|
||||||
|
{
|
||||||
|
#ifndef __puma
|
||||||
|
boost::unique_lock<boost::mutex> lock(m_mutex);
|
||||||
|
#endif
|
||||||
|
if(m_map.size() > 0)
|
||||||
|
return m_map.begin()->second;
|
||||||
|
else
|
||||||
|
return NULL;
|
||||||
|
} // Lock is automatically released here
|
||||||
/**
|
/**
|
||||||
* Add data to the map, return false if already present
|
* Add data to the map, return false if already present
|
||||||
* @param key Map key
|
* @param key Map key
|
||||||
|
|||||||
Reference in New Issue
Block a user