The JobClient can get several jobs with one request

git-svn-id: https://www4.informatik.uni-erlangen.de/i4svn/danceos/trunk/devel/fail@1963 8c4709b5-6ec9-48aa-a5cd-a96041d1645a
This commit is contained in:
hellwig
2012-11-30 16:50:02 +00:00
parent da3b2b8253
commit d7842c2ad7
7 changed files with 214 additions and 94 deletions

View File

@ -5,15 +5,16 @@ message FailControlMessage {
RESULT_FOLLOWS = 1; // followed by experiment-specific ExperimentData message (holding both original parameters and experiment result) RESULT_FOLLOWS = 1; // followed by experiment-specific ExperimentData message (holding both original parameters and experiment result)
// JobServer may send these: // JobServer may send these:
WORK_FOLLOWS = 5; // followed by experiment-specific ExperimentData message WORK_FOLLOWS = 6; // followed by experiment-specific ExperimentData message
COME_AGAIN = 6; // no experiment-specific ExperimentData at the moment, but Campaign is not over yet COME_AGAIN = 7; // no experiment-specific ExperimentData at the moment, but Campaign is not over yet
DIE = 7; // tells the client to terminate DIE = 8; // tells the client to terminate
} }
required Command command = 1; required Command command = 1;
optional uint32 workloadID = 2; repeated uint32 workloadID = 2;
// identifying the client/server build (e.g., build time in unixtime format) // identifying the client/server build (e.g., build time in unixtime format)
required uint64 build_id = 3; required uint64 build_id = 3;
// campaign server run ID: prevents old clients talking to new servers // campaign server run ID: prevents old clients talking to new servers
optional uint64 run_id = 4; optional uint64 run_id = 4;
optional uint32 job_size = 5;
} }

View File

@ -29,6 +29,8 @@ SET(SERVER_PERF_STEPPING_SEC "1" CACHE STRING "Stepping of performan
SET(CLIENT_RAND_BACKOFF_TSTART "3" CACHE STRING "Lower limit of client's backoff phase in seconds") SET(CLIENT_RAND_BACKOFF_TSTART "3" CACHE STRING "Lower limit of client's backoff phase in seconds")
SET(CLIENT_RAND_BACKOFF_TEND "8" CACHE STRING "Upper limit of client's backoff phase in seconds") SET(CLIENT_RAND_BACKOFF_TEND "8" CACHE STRING "Upper limit of client's backoff phase in seconds")
SET(CLIENT_RETRY_COUNT "3" CACHE STRING "Client's number of reconnect retries") SET(CLIENT_RETRY_COUNT "3" CACHE STRING "Client's number of reconnect retries")
SET(CLIENT_JOB_REQUEST_SEC "60" CACHE STRING "Determines how often the client asks for new jobs")
SET(CLIENT_JOB_LIMIT_SEC "1000" CACHE STRING "How many jobs can a client ask for")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/FailConfig.hpp.in configure_file(${CMAKE_CURRENT_SOURCE_DIR}/FailConfig.hpp.in
${CMAKE_CURRENT_BINARY_DIR}/FailConfig.hpp) ${CMAKE_CURRENT_BINARY_DIR}/FailConfig.hpp)

View File

@ -39,6 +39,8 @@
#define CLIENT_RAND_BACKOFF_TSTART @CLIENT_RAND_BACKOFF_TSTART@ #define CLIENT_RAND_BACKOFF_TSTART @CLIENT_RAND_BACKOFF_TSTART@
#define CLIENT_RAND_BACKOFF_TEND @CLIENT_RAND_BACKOFF_TEND@ #define CLIENT_RAND_BACKOFF_TEND @CLIENT_RAND_BACKOFF_TEND@
#define CLIENT_RETRY_COUNT @CLIENT_RETRY_COUNT@ #define CLIENT_RETRY_COUNT @CLIENT_RETRY_COUNT@
#define CLIENT_JOB_REQUEST_SEC @CLIENT_JOB_REQUEST_SEC@
#define CLIENT_JOB_LIMIT_SEC @CLIENT_JOB_LIMIT_SEC@
#define PROJECT_VERSION "@PROJECT_VERSION@" #define PROJECT_VERSION "@PROJECT_VERSION@"
#define FAIL_VERSION PROJECT_VERSION #define FAIL_VERSION PROJECT_VERSION

View File

@ -9,7 +9,6 @@
#include <string.h> #include <string.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include "comm/FailControlMessage.pb.h"
#include "comm/SocketComm.hpp" #include "comm/SocketComm.hpp"
#include "JobServer.hpp" #include "JobServer.hpp"
#include "Minion.hpp" #include "Minion.hpp"
@ -212,6 +211,7 @@ void CommThread::operator()()
break; break;
} }
// give minion something to do.. // give minion something to do..
m_job_size = ctrlmsg.job_size();
sendPendingExperimentData(minion); sendPendingExperimentData(minion);
break; break;
case FailControlMessage::RESULT_FOLLOWS: case FailControlMessage::RESULT_FOLLOWS:
@ -221,7 +221,7 @@ void CommThread::operator()()
break; break;
} }
// get results and put to done queue. // get results and put to done queue.
receiveExperimentResults(minion, ctrlmsg.workloadid()); receiveExperimentResults(minion, ctrlmsg);
break; break;
default: default:
// hm.. don't know what to do. please die. // hm.. don't know what to do. please die.
@ -238,23 +238,46 @@ void CommThread::operator()()
void CommThread::sendPendingExperimentData(Minion& minion) void CommThread::sendPendingExperimentData(Minion& minion)
{ {
uint32_t i;
uint32_t workloadID;
std::vector<ExperimentData*> exp;
ExperimentData* temp_exp = 0;
FailControlMessage ctrlmsg; FailControlMessage ctrlmsg;
ctrlmsg.set_build_id(42); ctrlmsg.set_build_id(42);
ctrlmsg.set_run_id(m_js.m_runid); ctrlmsg.set_run_id(m_js.m_runid);
ExperimentData * exp = 0; ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS);
if (m_js.m_undoneJobs.Dequeue_nb(exp) == true) {
// Got an element from queue, assign ID to workload and send to minion for(i = 0; i < m_job_size ; i++) {
uint32_t workloadID = m_js.m_counter.increment(); // increment workload counter if (m_js.m_undoneJobs.Dequeue_nb(temp_exp) == true) {
exp->setWorkloadID(workloadID); // store ID for identification when receiving result // Got an element from queue, assign ID to workload and send to minion
if (!m_js.m_runningJobs.insert(workloadID, exp)) { workloadID = m_js.m_counter.increment(); // increment workload counter
temp_exp->setWorkloadID(workloadID); // store ID for identification when receiving result
ctrlmsg.add_workloadid(workloadID);
exp.push_back(temp_exp);
}
if (!m_js.m_runningJobs.insert(workloadID, temp_exp)) {
cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl; cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl;
} }
ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS); }
ctrlmsg.set_workloadid(workloadID); // set workload id ctrlmsg.set_job_size(exp.size());
//cout << ">>[Server] Sending workload [" << workloadID << "]" << endl;
cout << ">>[" << workloadID << "] " << flush; cout << " >>[";
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) { for ( i = 0; i < exp.size() ; i++) {
SocketComm::sendMsg(minion.getSocketDescriptor(), exp->getMessage()); cout << " "<< ctrlmsg.workloadid(i) <<" ";
}
cout << "] " << flush;
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) {
for (i = 0; i < ctrlmsg.job_size() ; i++) {
if(SocketComm::sendMsg(minion.getSocketDescriptor(), exp.front()->getMessage())) {
exp.erase(exp.begin());
} else {
break;
}
} }
return; return;
} }
@ -265,7 +288,7 @@ void CommThread::sendPendingExperimentData(Minion& minion)
// (See details in receiveExperimentResults) // (See details in receiveExperimentResults)
boost::unique_lock<boost::mutex> lock(m_CommMutex); boost::unique_lock<boost::mutex> lock(m_CommMutex);
#endif #endif
if ((exp = m_js.m_runningJobs.pickone()) != NULL) { // 2nd priority if ((temp_exp = m_js.m_runningJobs.pickone()) != NULL) { // 2nd priority
// (This picks one running job.) // (This picks one running job.)
// TODO: Improve selection of parameter set to be resent: // TODO: Improve selection of parameter set to be resent:
// - currently: Linear complexity! // - currently: Linear complexity!
@ -277,14 +300,15 @@ void CommThread::sendPendingExperimentData(Minion& minion)
// clients. // clients.
// (Note: Therefore we need to be aware of receiving multiple results for a // (Note: Therefore we need to be aware of receiving multiple results for a
// single parameter-set, @see receiveExperimentResults.) // single parameter-set, @see receiveExperimentResults.)
uint32_t workloadID = exp->getWorkloadID(); // (this ID has been set previously) uint32_t workloadID = temp_exp->getWorkloadID(); // (this ID has been set previously)
// Resend the parameter-set. // Resend the parameter-set.
ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS); ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS);
ctrlmsg.set_workloadid(workloadID); // set workload id ctrlmsg.add_workloadid(workloadID); // set workload id
ctrlmsg.set_job_size(1); // In 2nd priority the jobserver send only one job
//cout << ">>[Server] Re-sending workload [" << workloadID << "]" << endl; //cout << ">>[Server] Re-sending workload [" << workloadID << "]" << endl;
cout << ">>R[" << workloadID << "] " << flush; cout << ">>R[" << workloadID << "] " << flush;
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) { if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) {
SocketComm::sendMsg(minion.getSocketDescriptor(), exp->getMessage()); SocketComm::sendMsg(minion.getSocketDescriptor(), temp_exp->getMessage());
} }
} else if (m_js.noMoreExperiments() == false) { } else if (m_js.noMoreExperiments() == false) {
// Currently we have no workload (even the running-job-queue is empty!), but // Currently we have no workload (even the running-job-queue is empty!), but
@ -300,11 +324,15 @@ void CommThread::sendPendingExperimentData(Minion& minion)
} }
} }
void CommThread::receiveExperimentResults(Minion& minion, uint32_t workloadID) void CommThread::receiveExperimentResults(Minion& minion, FailControlMessage ctrlmsg)
{ {
int i;
ExperimentData* exp = NULL; // Get exp* from running jobs ExperimentData* exp = NULL; // Get exp* from running jobs
//cout << "<<[Server] Received result for workload id [" << workloadID << "]" << endl; cout << " <<[ ";
cout << "<<[" << workloadID << "] " << flush; for (i = 0; i < ctrlmsg.workloadid_size(); i++) {
cout << ctrlmsg.workloadid(i) << " ";
}
cout << "] " << flush;
#ifndef __puma #ifndef __puma
// Prevent re-sending jobs in sendPendingExperimentData: // Prevent re-sending jobs in sendPendingExperimentData:
// a) sendPendingExperimentData needs an intact job to serialize and send it. // a) sendPendingExperimentData needs an intact job to serialize and send it.
@ -314,24 +342,27 @@ void CommThread::receiveExperimentResults(Minion& minion, uint32_t workloadID)
// already may cause breakage in sendPendingExperimentData (a). // already may cause breakage in sendPendingExperimentData (a).
boost::unique_lock<boost::mutex> lock(m_CommMutex); boost::unique_lock<boost::mutex> lock(m_CommMutex);
#endif #endif
if (m_js.m_runningJobs.remove(workloadID, exp)) { // ExperimentData* found for (i = 0 ; i < ctrlmsg.workloadid_size() ; i++) {
// deserialize results, expect failures if (m_js.m_runningJobs.remove( ctrlmsg.workloadid(i), exp)) { // ExperimentData* found
if (!SocketComm::rcvMsg(minion.getSocketDescriptor(), exp->getMessage())) { // deserialize results, expect failures
m_js.m_runningJobs.insert(workloadID, exp); if (!SocketComm::rcvMsg(minion.getSocketDescriptor(), exp->getMessage())) {
m_js.m_runningJobs.insert(ctrlmsg.workloadid(i), exp);
} else {
m_js.m_doneJobs.Enqueue(exp); // Put results in done queue
}
#ifdef SERVER_PERFORMANCE_MEASURE
++JobServer::m_DoneCount;
#endif
} else { } else {
m_js.m_doneJobs.Enqueue(exp); // Put results in done queue // We can receive several results for the same workload id because
// we (may) distribute the (running) jobs to a *few* experiment-clients.
cout << "[Server] Received another result for workload id ["
<< ctrlmsg.workloadid(i) << "] -- ignored." << endl;
// TODO: Any need for error-handling here?
} }
#ifdef SERVER_PERFORMANCE_MEASURE
++JobServer::m_DoneCount;
#endif
} else {
// We can receive several results for the same workload id because
// we (may) distribute the (running) jobs to a *few* experiment-clients.
cout << "[Server] Received another result for workload id ["
<< workloadID << "] -- ignored." << endl;
// TODO: Any need for error-handling here?
} }
} }
} // end-of-namespace: fail } // end-of-namespace: fail

View File

@ -6,6 +6,7 @@
#include "util/SynchronizedCounter.hpp" #include "util/SynchronizedCounter.hpp"
#include "util/SynchronizedMap.hpp" #include "util/SynchronizedMap.hpp"
#include "config/FailConfig.hpp" #include "config/FailConfig.hpp"
#include "comm/FailControlMessage.pb.h"
#include <list> #include <list>
#include <ctime> #include <ctime>
@ -139,6 +140,7 @@ public:
class CommThread { class CommThread {
private: private:
int m_sock; //! Socket descriptor of the connection int m_sock; //! Socket descriptor of the connection
uint32_t m_job_size;
JobServer& m_js; //! Calling jobserver JobServer& m_js; //! Calling jobserver
// FIXME: Concerns are not really separated yet ;) // FIXME: Concerns are not really separated yet ;)
@ -156,13 +158,13 @@ private:
* @param minion The minion offering results * @param minion The minion offering results
* @param workloadID The workload id of the result message * @param workloadID The workload id of the result message
*/ */
void receiveExperimentResults(Minion& minion, uint32_t workloadID); void receiveExperimentResults(Minion& minion, FailControlMessage ctrlmsg);
public: public:
#ifndef __puma #ifndef __puma
static boost::mutex m_CommMutex; //! to synchronise the communication static boost::mutex m_CommMutex; //! to synchronise the communication
#endif // __puma #endif // __puma
CommThread(int sockfd, JobServer& p) CommThread(int sockfd, JobServer& p)
: m_sock(sockfd), m_js(p) { } : m_sock(sockfd), m_job_size(1), m_js(p) { }
/** /**
* The thread's entry point. * The thread's entry point.
*/ */

View File

@ -16,6 +16,7 @@ JobClient::JobClient(const std::string& server, int port)
} }
srand(time(NULL)); // needed for random backoff (see connectToServer) srand(time(NULL)); // needed for random backoff (see connectToServer)
m_server_runid = 0; // server accepts this for virgin clients m_server_runid = 0; // server accepts this for virgin clients
m_job_throughput = 1; // client gets only one job at the first request
} }
bool JobClient::connectToServer() bool JobClient::connectToServer()
@ -71,7 +72,8 @@ bool JobClient::getParam(ExperimentData& exp)
while (1) { // Here we try to acquire a parameter set while (1) { // Here we try to acquire a parameter set
switch (tryToGetExperimentData(exp)) { switch (tryToGetExperimentData(exp)) {
// Jobserver will sent workload, params are set in \c exp // Jobserver will sent workload, params are set in \c exp
case FailControlMessage::WORK_FOLLOWS: return true; case FailControlMessage::WORK_FOLLOWS:
return true;
// Nothing to do right now, but maybe later // Nothing to do right now, but maybe later
case FailControlMessage::COME_AGAIN: case FailControlMessage::COME_AGAIN:
sleep(1); sleep(1);
@ -84,70 +86,144 @@ bool JobClient::getParam(ExperimentData& exp)
FailControlMessage_Command JobClient::tryToGetExperimentData(ExperimentData& exp) FailControlMessage_Command JobClient::tryToGetExperimentData(ExperimentData& exp)
{ {
FailControlMessage ctrlmsg; //Are there other jobs for the experiment
if (m_parameters.size() != 0) {
exp.getMessage().CopyFrom(m_parameters.front()->getMessage());
exp.setWorkloadID(m_parameters.front()->getWorkloadID());
delete &m_parameters.front()->getMessage();
delete m_parameters.front();
m_parameters.erase(m_parameters.begin());
return FailControlMessage::WORK_FOLLOWS;
} else {
FailControlMessage ctrlmsg;
// Connection failed, minion can die // Connection failed, minion can die
if (!connectToServer()) { if (!connectToServer()) {
return FailControlMessage::DIE; return FailControlMessage::DIE;
} }
// Retrieve ExperimentData // Retrieve ExperimentData
ctrlmsg.set_command(FailControlMessage::NEED_WORK); ctrlmsg.set_command(FailControlMessage::NEED_WORK);
ctrlmsg.set_build_id(42); ctrlmsg.set_build_id(42);
ctrlmsg.set_run_id(m_server_runid); ctrlmsg.set_run_id(m_server_runid);
ctrlmsg.set_job_size(m_job_throughput); //Request for a number of jobs
if (!SocketComm::sendMsg(m_sockfd, ctrlmsg)) { if (!SocketComm::sendMsg(m_sockfd, ctrlmsg)) {
// Failed to send message? Retry. // Failed to send message? Retry.
close(m_sockfd); close(m_sockfd);
return FailControlMessage::COME_AGAIN; return FailControlMessage::COME_AGAIN;
} }
ctrlmsg.Clear(); ctrlmsg.Clear();
if (!SocketComm::rcvMsg(m_sockfd, ctrlmsg)) { if (!SocketComm::rcvMsg(m_sockfd, ctrlmsg)) {
// Failed to receive message? Retry.
close(m_sockfd);
return FailControlMessage::COME_AGAIN;
}
// now we know the current run ID
m_server_runid = ctrlmsg.run_id();
switch (ctrlmsg.command()) {
case FailControlMessage::WORK_FOLLOWS:
if (!SocketComm::rcvMsg(m_sockfd, exp.getMessage())) {
// Failed to receive message? Retry. // Failed to receive message? Retry.
close(m_sockfd); close(m_sockfd);
return FailControlMessage::COME_AGAIN; return FailControlMessage::COME_AGAIN;
} }
exp.setWorkloadID(ctrlmsg.workloadid()); // Store workload id of experiment data
break; // now we know the current run ID
case FailControlMessage::COME_AGAIN: m_server_runid = ctrlmsg.run_id();
break;
default: switch (ctrlmsg.command()) {
break; case FailControlMessage::WORK_FOLLOWS:
uint32_t i;
for (i = 0 ; i < ctrlmsg.job_size() ; i++) {
ExperimentData* temp_exp = new ExperimentData(exp.getMessage().New());
if (!SocketComm::rcvMsg(m_sockfd, temp_exp->getMessage())) {
// Failed to receive message? Retry.
close(m_sockfd);
return FailControlMessage::COME_AGAIN;
}
temp_exp->setWorkloadID(ctrlmsg.workloadid(i)); //Store workload id of experiment data
m_parameters.push_back(temp_exp);
}
break;
case FailControlMessage::COME_AGAIN:
break;
default:
break;
}
close(m_sockfd);
//Take front from m_parameters and copy to exp.
exp.getMessage().CopyFrom(m_parameters.front()->getMessage());
exp.setWorkloadID(m_parameters.front()->getWorkloadID());
//Delete front element of m_parameters
delete &m_parameters.front()->getMessage();
delete m_parameters.front();
m_parameters.erase(m_parameters.begin());
//start time measurement for throughput calculation
m_job_runtime.startTimer();
return ctrlmsg.command();
} }
close(m_sockfd);
return ctrlmsg.command();
} }
bool JobClient::sendResult(ExperimentData& result) bool JobClient::sendResult(ExperimentData& result)
{ {
if (!connectToServer()) //Create new ExperimentData for result
return false; ExperimentData* temp_exp = new ExperimentData(result.getMessage().New());
temp_exp->getMessage().CopyFrom(result.getMessage());
temp_exp->setWorkloadID(result.getWorkloadID());
if (m_parameters.size() != 0) {
//If there are more jobs for the experiment store result
m_results.push_back( temp_exp );
return true;
} else {
m_results.push_back( temp_exp );
//Stop time measurement and calculate new throughput
m_job_runtime.stopTimer();
m_job_throughput = CLIENT_JOB_REQUEST_SEC/((double)m_job_runtime/m_results.size());
if (m_job_throughput > CLIENT_JOB_LIMIT_SEC)
m_job_throughput = CLIENT_JOB_LIMIT_SEC;
if (m_job_throughput < 1)
m_job_throughput = 1;
//Reset timer for new time measurement
m_job_runtime.reset();
if (!connectToServer())
return false;
// Send back results //Send back results
FailControlMessage ctrlmsg; FailControlMessage ctrlmsg;
ctrlmsg.set_command(FailControlMessage::RESULT_FOLLOWS); ctrlmsg.set_command(FailControlMessage::RESULT_FOLLOWS);
ctrlmsg.set_build_id(42); ctrlmsg.set_build_id(42);
ctrlmsg.set_run_id(m_server_runid); ctrlmsg.set_run_id(m_server_runid);
ctrlmsg.set_workloadid(result.getWorkloadID()); ctrlmsg.set_job_size(m_results.size()); //Store how many results will be sent
cout << "[Client] Sending back result [" << std::dec << result.getWorkloadID() << "]..." << endl;
// TODO: Log-level? cout << "[Client] Sending back result [";
SocketComm::sendMsg(m_sockfd, ctrlmsg);
SocketComm::sendMsg(m_sockfd, result.getMessage()); uint32_t i;
for (i = 0; i < m_results.size() ; i++) {
ctrlmsg.add_workloadid(m_results[i]->getWorkloadID());
cout << std::dec << m_results[i]->getWorkloadID();
cout << " ";
}
cout << "]";
// TODO: Log-level?
SocketComm::sendMsg(m_sockfd, ctrlmsg);
for (i = 0; i < ctrlmsg.job_size() ; i++) {
SocketComm::sendMsg(m_sockfd, m_results.front()->getMessage());
delete &m_results.front()->getMessage();
delete m_results.front();
m_results.erase(m_results.begin());
}
// Close connection. // Close connection.
close(m_sockfd); close(m_sockfd);
return true; return true;
}
} }
} // end-of-namespace: fail } // end-of-namespace: fail

View File

@ -11,6 +11,7 @@
#include "comm/ExperimentData.hpp" #include "comm/ExperimentData.hpp"
#include "comm/FailControlMessage.pb.h" #include "comm/FailControlMessage.pb.h"
#include "config/FailConfig.hpp" #include "config/FailConfig.hpp"
#include "util/WallclockTimer.hpp"
namespace fail { namespace fail {
@ -27,7 +28,12 @@ private:
struct hostent* m_server_ent; struct hostent* m_server_ent;
int m_sockfd; int m_sockfd;
uint64_t m_server_runid; uint64_t m_server_runid;
WallclockTimer m_job_runtime;
int m_job_throughput;
std::vector<ExperimentData*> m_parameters;
std::vector<ExperimentData*> m_results;
bool connectToServer(); bool connectToServer();
FailControlMessage_Command tryToGetExperimentData(ExperimentData& exp); FailControlMessage_Command tryToGetExperimentData(ExperimentData& exp);