The Jobclient can get several jobs with one request
git-svn-id: https://www4.informatik.uni-erlangen.de/i4svn/danceos/trunk/devel/fail@1963 8c4709b5-6ec9-48aa-a5cd-a96041d1645a
This commit is contained in:
@ -5,15 +5,16 @@ message FailControlMessage {
|
|||||||
RESULT_FOLLOWS = 1; // followed by experiment-specific ExperimentData message (holding both original parameters and experiment result)
|
RESULT_FOLLOWS = 1; // followed by experiment-specific ExperimentData message (holding both original parameters and experiment result)
|
||||||
|
|
||||||
// JobServer may send these:
|
// JobServer may send these:
|
||||||
WORK_FOLLOWS = 5; // followed by experiment-specific ExperimentData message
|
WORK_FOLLOWS = 6; // followed by experiment-specific ExperimentData message
|
||||||
COME_AGAIN = 6; // no experiment-specific ExperimentData at the moment, but Campaign is not over yet
|
COME_AGAIN = 7; // no experiment-specific ExperimentData at the moment, but Campaign is not over yet
|
||||||
DIE = 7; // tells the client to terminate
|
DIE = 8; // tells the client to terminate
|
||||||
}
|
}
|
||||||
|
|
||||||
required Command command = 1;
|
required Command command = 1;
|
||||||
optional uint32 workloadID = 2;
|
repeated uint32 workloadID = 2;
|
||||||
// identifying the client/server build (e.g., build time in unixtime format)
|
// identifying the client/server build (e.g., build time in unixtime format)
|
||||||
required uint64 build_id = 3;
|
required uint64 build_id = 3;
|
||||||
// campaign server run ID: prevents old clients talking to new servers
|
// campaign server run ID: prevents old clients talking to new servers
|
||||||
optional uint64 run_id = 4;
|
optional uint64 run_id = 4;
|
||||||
|
optional uint32 job_size = 5;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,6 +29,8 @@ SET(SERVER_PERF_STEPPING_SEC "1" CACHE STRING "Stepping of performan
|
|||||||
SET(CLIENT_RAND_BACKOFF_TSTART "3" CACHE STRING "Lower limit of client's backoff phase in seconds")
|
SET(CLIENT_RAND_BACKOFF_TSTART "3" CACHE STRING "Lower limit of client's backoff phase in seconds")
|
||||||
SET(CLIENT_RAND_BACKOFF_TEND "8" CACHE STRING "Upper limit of client's backoff phase in seconds")
|
SET(CLIENT_RAND_BACKOFF_TEND "8" CACHE STRING "Upper limit of client's backoff phase in seconds")
|
||||||
SET(CLIENT_RETRY_COUNT "3" CACHE STRING "Client's number of reconnect retries")
|
SET(CLIENT_RETRY_COUNT "3" CACHE STRING "Client's number of reconnect retries")
|
||||||
|
SET(CLIENT_JOB_REQUEST_SEC "60" CACHE STRING "Determines how often the client asks for new jobs")
|
||||||
|
SET(CLIENT_JOB_LIMIT_SEC "1000" CACHE STRING "How many jobs can a client ask for")
|
||||||
|
|
||||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/FailConfig.hpp.in
|
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/FailConfig.hpp.in
|
||||||
${CMAKE_CURRENT_BINARY_DIR}/FailConfig.hpp)
|
${CMAKE_CURRENT_BINARY_DIR}/FailConfig.hpp)
|
||||||
|
|||||||
@ -39,6 +39,8 @@
|
|||||||
#define CLIENT_RAND_BACKOFF_TSTART @CLIENT_RAND_BACKOFF_TSTART@
|
#define CLIENT_RAND_BACKOFF_TSTART @CLIENT_RAND_BACKOFF_TSTART@
|
||||||
#define CLIENT_RAND_BACKOFF_TEND @CLIENT_RAND_BACKOFF_TEND@
|
#define CLIENT_RAND_BACKOFF_TEND @CLIENT_RAND_BACKOFF_TEND@
|
||||||
#define CLIENT_RETRY_COUNT @CLIENT_RETRY_COUNT@
|
#define CLIENT_RETRY_COUNT @CLIENT_RETRY_COUNT@
|
||||||
|
#define CLIENT_JOB_REQUEST_SEC @CLIENT_JOB_REQUEST_SEC@
|
||||||
|
#define CLIENT_JOB_LIMIT_SEC @CLIENT_JOB_LIMIT_SEC@
|
||||||
#define PROJECT_VERSION "@PROJECT_VERSION@"
|
#define PROJECT_VERSION "@PROJECT_VERSION@"
|
||||||
#define FAIL_VERSION PROJECT_VERSION
|
#define FAIL_VERSION PROJECT_VERSION
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,6 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
|
|
||||||
#include "comm/FailControlMessage.pb.h"
|
|
||||||
#include "comm/SocketComm.hpp"
|
#include "comm/SocketComm.hpp"
|
||||||
#include "JobServer.hpp"
|
#include "JobServer.hpp"
|
||||||
#include "Minion.hpp"
|
#include "Minion.hpp"
|
||||||
@ -212,6 +211,7 @@ void CommThread::operator()()
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// give minion something to do..
|
// give minion something to do..
|
||||||
|
m_job_size = ctrlmsg.job_size();
|
||||||
sendPendingExperimentData(minion);
|
sendPendingExperimentData(minion);
|
||||||
break;
|
break;
|
||||||
case FailControlMessage::RESULT_FOLLOWS:
|
case FailControlMessage::RESULT_FOLLOWS:
|
||||||
@ -221,7 +221,7 @@ void CommThread::operator()()
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// get results and put to done queue.
|
// get results and put to done queue.
|
||||||
receiveExperimentResults(minion, ctrlmsg.workloadid());
|
receiveExperimentResults(minion, ctrlmsg);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
// hm.. don't know what to do. please die.
|
// hm.. don't know what to do. please die.
|
||||||
@ -238,23 +238,46 @@ void CommThread::operator()()
|
|||||||
|
|
||||||
void CommThread::sendPendingExperimentData(Minion& minion)
|
void CommThread::sendPendingExperimentData(Minion& minion)
|
||||||
{
|
{
|
||||||
|
uint32_t i;
|
||||||
|
uint32_t workloadID;
|
||||||
|
std::vector<ExperimentData*> exp;
|
||||||
|
ExperimentData* temp_exp = 0;
|
||||||
FailControlMessage ctrlmsg;
|
FailControlMessage ctrlmsg;
|
||||||
|
|
||||||
ctrlmsg.set_build_id(42);
|
ctrlmsg.set_build_id(42);
|
||||||
ctrlmsg.set_run_id(m_js.m_runid);
|
ctrlmsg.set_run_id(m_js.m_runid);
|
||||||
ExperimentData * exp = 0;
|
ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS);
|
||||||
if (m_js.m_undoneJobs.Dequeue_nb(exp) == true) {
|
|
||||||
// Got an element from queue, assign ID to workload and send to minion
|
for(i = 0; i < m_job_size ; i++) {
|
||||||
uint32_t workloadID = m_js.m_counter.increment(); // increment workload counter
|
if (m_js.m_undoneJobs.Dequeue_nb(temp_exp) == true) {
|
||||||
exp->setWorkloadID(workloadID); // store ID for identification when receiving result
|
// Got an element from queue, assign ID to workload and send to minion
|
||||||
if (!m_js.m_runningJobs.insert(workloadID, exp)) {
|
workloadID = m_js.m_counter.increment(); // increment workload counter
|
||||||
|
temp_exp->setWorkloadID(workloadID); // store ID for identification when receiving result
|
||||||
|
ctrlmsg.add_workloadid(workloadID);
|
||||||
|
exp.push_back(temp_exp);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!m_js.m_runningJobs.insert(workloadID, temp_exp)) {
|
||||||
cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl;
|
cout << "!![Server]could not insert workload id: [" << workloadID << "] double entry?" << endl;
|
||||||
}
|
}
|
||||||
ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS);
|
}
|
||||||
ctrlmsg.set_workloadid(workloadID); // set workload id
|
ctrlmsg.set_job_size(exp.size());
|
||||||
//cout << ">>[Server] Sending workload [" << workloadID << "]" << endl;
|
|
||||||
cout << ">>[" << workloadID << "] " << flush;
|
cout << " >>[";
|
||||||
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) {
|
for ( i = 0; i < exp.size() ; i++) {
|
||||||
SocketComm::sendMsg(minion.getSocketDescriptor(), exp->getMessage());
|
cout << " "<< ctrlmsg.workloadid(i) <<" ";
|
||||||
|
}
|
||||||
|
cout << "] " << flush;
|
||||||
|
|
||||||
|
|
||||||
|
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) {
|
||||||
|
for (i = 0; i < ctrlmsg.job_size() ; i++) {
|
||||||
|
if(SocketComm::sendMsg(minion.getSocketDescriptor(), exp.front()->getMessage())) {
|
||||||
|
exp.erase(exp.begin());
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -265,7 +288,7 @@ void CommThread::sendPendingExperimentData(Minion& minion)
|
|||||||
// (See details in receiveExperimentResults)
|
// (See details in receiveExperimentResults)
|
||||||
boost::unique_lock<boost::mutex> lock(m_CommMutex);
|
boost::unique_lock<boost::mutex> lock(m_CommMutex);
|
||||||
#endif
|
#endif
|
||||||
if ((exp = m_js.m_runningJobs.pickone()) != NULL) { // 2nd priority
|
if ((temp_exp = m_js.m_runningJobs.pickone()) != NULL) { // 2nd priority
|
||||||
// (This picks one running job.)
|
// (This picks one running job.)
|
||||||
// TODO: Improve selection of parameter set to be resent:
|
// TODO: Improve selection of parameter set to be resent:
|
||||||
// - currently: Linear complexity!
|
// - currently: Linear complexity!
|
||||||
@ -277,14 +300,15 @@ void CommThread::sendPendingExperimentData(Minion& minion)
|
|||||||
// clients.
|
// clients.
|
||||||
// (Note: Therefore we need to be aware of receiving multiple results for a
|
// (Note: Therefore we need to be aware of receiving multiple results for a
|
||||||
// single parameter-set, @see receiveExperimentResults.)
|
// single parameter-set, @see receiveExperimentResults.)
|
||||||
uint32_t workloadID = exp->getWorkloadID(); // (this ID has been set previously)
|
uint32_t workloadID = temp_exp->getWorkloadID(); // (this ID has been set previously)
|
||||||
// Resend the parameter-set.
|
// Resend the parameter-set.
|
||||||
ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS);
|
ctrlmsg.set_command(FailControlMessage::WORK_FOLLOWS);
|
||||||
ctrlmsg.set_workloadid(workloadID); // set workload id
|
ctrlmsg.add_workloadid(workloadID); // set workload id
|
||||||
|
ctrlmsg.set_job_size(1); // In 2nd priority the jobserver send only one job
|
||||||
//cout << ">>[Server] Re-sending workload [" << workloadID << "]" << endl;
|
//cout << ">>[Server] Re-sending workload [" << workloadID << "]" << endl;
|
||||||
cout << ">>R[" << workloadID << "] " << flush;
|
cout << ">>R[" << workloadID << "] " << flush;
|
||||||
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) {
|
if (SocketComm::sendMsg(minion.getSocketDescriptor(), ctrlmsg)) {
|
||||||
SocketComm::sendMsg(minion.getSocketDescriptor(), exp->getMessage());
|
SocketComm::sendMsg(minion.getSocketDescriptor(), temp_exp->getMessage());
|
||||||
}
|
}
|
||||||
} else if (m_js.noMoreExperiments() == false) {
|
} else if (m_js.noMoreExperiments() == false) {
|
||||||
// Currently we have no workload (even the running-job-queue is empty!), but
|
// Currently we have no workload (even the running-job-queue is empty!), but
|
||||||
@ -300,11 +324,15 @@ void CommThread::sendPendingExperimentData(Minion& minion)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommThread::receiveExperimentResults(Minion& minion, uint32_t workloadID)
|
void CommThread::receiveExperimentResults(Minion& minion, FailControlMessage ctrlmsg)
|
||||||
{
|
{
|
||||||
|
int i;
|
||||||
ExperimentData* exp = NULL; // Get exp* from running jobs
|
ExperimentData* exp = NULL; // Get exp* from running jobs
|
||||||
//cout << "<<[Server] Received result for workload id [" << workloadID << "]" << endl;
|
cout << " <<[ ";
|
||||||
cout << "<<[" << workloadID << "] " << flush;
|
for (i = 0; i < ctrlmsg.workloadid_size(); i++) {
|
||||||
|
cout << ctrlmsg.workloadid(i) << " ";
|
||||||
|
}
|
||||||
|
cout << "] " << flush;
|
||||||
#ifndef __puma
|
#ifndef __puma
|
||||||
// Prevent re-sending jobs in sendPendingExperimentData:
|
// Prevent re-sending jobs in sendPendingExperimentData:
|
||||||
// a) sendPendingExperimentData needs an intact job to serialize and send it.
|
// a) sendPendingExperimentData needs an intact job to serialize and send it.
|
||||||
@ -314,24 +342,27 @@ void CommThread::receiveExperimentResults(Minion& minion, uint32_t workloadID)
|
|||||||
// already may cause breakage in sendPendingExperimentData (a).
|
// already may cause breakage in sendPendingExperimentData (a).
|
||||||
boost::unique_lock<boost::mutex> lock(m_CommMutex);
|
boost::unique_lock<boost::mutex> lock(m_CommMutex);
|
||||||
#endif
|
#endif
|
||||||
if (m_js.m_runningJobs.remove(workloadID, exp)) { // ExperimentData* found
|
for (i = 0 ; i < ctrlmsg.workloadid_size() ; i++) {
|
||||||
// deserialize results, expect failures
|
if (m_js.m_runningJobs.remove( ctrlmsg.workloadid(i), exp)) { // ExperimentData* found
|
||||||
if (!SocketComm::rcvMsg(minion.getSocketDescriptor(), exp->getMessage())) {
|
// deserialize results, expect failures
|
||||||
m_js.m_runningJobs.insert(workloadID, exp);
|
if (!SocketComm::rcvMsg(minion.getSocketDescriptor(), exp->getMessage())) {
|
||||||
|
m_js.m_runningJobs.insert(ctrlmsg.workloadid(i), exp);
|
||||||
|
} else {
|
||||||
|
m_js.m_doneJobs.Enqueue(exp); // Put results in done queue
|
||||||
|
}
|
||||||
|
#ifdef SERVER_PERFORMANCE_MEASURE
|
||||||
|
++JobServer::m_DoneCount;
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
m_js.m_doneJobs.Enqueue(exp); // Put results in done queue
|
// We can receive several results for the same workload id because
|
||||||
}
|
// we (may) distribute the (running) jobs to a *few* experiment-clients.
|
||||||
#ifdef SERVER_PERFORMANCE_MEASURE
|
cout << "[Server] Received another result for workload id ["
|
||||||
++JobServer::m_DoneCount;
|
<< ctrlmsg.workloadid(i) << "] -- ignored." << endl;
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
// We can receive several results for the same workload id because
|
|
||||||
// we (may) distribute the (running) jobs to a *few* experiment-clients.
|
|
||||||
cout << "[Server] Received another result for workload id ["
|
|
||||||
<< workloadID << "] -- ignored." << endl;
|
|
||||||
|
|
||||||
// TODO: Any need for error-handling here?
|
// TODO: Any need for error-handling here?
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // end-of-namespace: fail
|
} // end-of-namespace: fail
|
||||||
|
|||||||
@ -6,6 +6,7 @@
|
|||||||
#include "util/SynchronizedCounter.hpp"
|
#include "util/SynchronizedCounter.hpp"
|
||||||
#include "util/SynchronizedMap.hpp"
|
#include "util/SynchronizedMap.hpp"
|
||||||
#include "config/FailConfig.hpp"
|
#include "config/FailConfig.hpp"
|
||||||
|
#include "comm/FailControlMessage.pb.h"
|
||||||
|
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
@ -139,6 +140,7 @@ public:
|
|||||||
class CommThread {
|
class CommThread {
|
||||||
private:
|
private:
|
||||||
int m_sock; //! Socket descriptor of the connection
|
int m_sock; //! Socket descriptor of the connection
|
||||||
|
uint32_t m_job_size;
|
||||||
JobServer& m_js; //! Calling jobserver
|
JobServer& m_js; //! Calling jobserver
|
||||||
|
|
||||||
// FIXME: Concerns are not really separated yet ;)
|
// FIXME: Concerns are not really separated yet ;)
|
||||||
@ -156,13 +158,13 @@ private:
|
|||||||
* @param minion The minion offering results
|
* @param minion The minion offering results
|
||||||
* @param workloadID The workload id of the result message
|
* @param workloadID The workload id of the result message
|
||||||
*/
|
*/
|
||||||
void receiveExperimentResults(Minion& minion, uint32_t workloadID);
|
void receiveExperimentResults(Minion& minion, FailControlMessage ctrlmsg);
|
||||||
public:
|
public:
|
||||||
#ifndef __puma
|
#ifndef __puma
|
||||||
static boost::mutex m_CommMutex; //! to synchronise the communication
|
static boost::mutex m_CommMutex; //! to synchronise the communication
|
||||||
#endif // __puma
|
#endif // __puma
|
||||||
CommThread(int sockfd, JobServer& p)
|
CommThread(int sockfd, JobServer& p)
|
||||||
: m_sock(sockfd), m_js(p) { }
|
: m_sock(sockfd), m_job_size(1), m_js(p) { }
|
||||||
/**
|
/**
|
||||||
* The thread's entry point.
|
* The thread's entry point.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -16,6 +16,7 @@ JobClient::JobClient(const std::string& server, int port)
|
|||||||
}
|
}
|
||||||
srand(time(NULL)); // needed for random backoff (see connectToServer)
|
srand(time(NULL)); // needed for random backoff (see connectToServer)
|
||||||
m_server_runid = 0; // server accepts this for virgin clients
|
m_server_runid = 0; // server accepts this for virgin clients
|
||||||
|
m_job_throughput = 1; // client gets only one job at the first request
|
||||||
}
|
}
|
||||||
|
|
||||||
bool JobClient::connectToServer()
|
bool JobClient::connectToServer()
|
||||||
@ -71,7 +72,8 @@ bool JobClient::getParam(ExperimentData& exp)
|
|||||||
while (1) { // Here we try to acquire a parameter set
|
while (1) { // Here we try to acquire a parameter set
|
||||||
switch (tryToGetExperimentData(exp)) {
|
switch (tryToGetExperimentData(exp)) {
|
||||||
// Jobserver will sent workload, params are set in \c exp
|
// Jobserver will sent workload, params are set in \c exp
|
||||||
case FailControlMessage::WORK_FOLLOWS: return true;
|
case FailControlMessage::WORK_FOLLOWS:
|
||||||
|
return true;
|
||||||
// Nothing to do right now, but maybe later
|
// Nothing to do right now, but maybe later
|
||||||
case FailControlMessage::COME_AGAIN:
|
case FailControlMessage::COME_AGAIN:
|
||||||
sleep(1);
|
sleep(1);
|
||||||
@ -84,70 +86,144 @@ bool JobClient::getParam(ExperimentData& exp)
|
|||||||
|
|
||||||
FailControlMessage_Command JobClient::tryToGetExperimentData(ExperimentData& exp)
|
FailControlMessage_Command JobClient::tryToGetExperimentData(ExperimentData& exp)
|
||||||
{
|
{
|
||||||
FailControlMessage ctrlmsg;
|
//Are there other jobs for the experiment
|
||||||
|
if (m_parameters.size() != 0) {
|
||||||
|
exp.getMessage().CopyFrom(m_parameters.front()->getMessage());
|
||||||
|
exp.setWorkloadID(m_parameters.front()->getWorkloadID());
|
||||||
|
|
||||||
// Connection failed, minion can die
|
delete &m_parameters.front()->getMessage();
|
||||||
if (!connectToServer()) {
|
delete m_parameters.front();
|
||||||
return FailControlMessage::DIE;
|
m_parameters.erase(m_parameters.begin());
|
||||||
}
|
|
||||||
|
|
||||||
// Retrieve ExperimentData
|
return FailControlMessage::WORK_FOLLOWS;
|
||||||
ctrlmsg.set_command(FailControlMessage::NEED_WORK);
|
} else {
|
||||||
ctrlmsg.set_build_id(42);
|
FailControlMessage ctrlmsg;
|
||||||
ctrlmsg.set_run_id(m_server_runid);
|
|
||||||
|
|
||||||
if (!SocketComm::sendMsg(m_sockfd, ctrlmsg)) {
|
// Connection failed, minion can die
|
||||||
// Failed to send message? Retry.
|
if (!connectToServer()) {
|
||||||
close(m_sockfd);
|
return FailControlMessage::DIE;
|
||||||
return FailControlMessage::COME_AGAIN;
|
}
|
||||||
}
|
|
||||||
ctrlmsg.Clear();
|
|
||||||
if (!SocketComm::rcvMsg(m_sockfd, ctrlmsg)) {
|
|
||||||
// Failed to receive message? Retry.
|
|
||||||
close(m_sockfd);
|
|
||||||
return FailControlMessage::COME_AGAIN;
|
|
||||||
}
|
|
||||||
|
|
||||||
// now we know the current run ID
|
// Retrieve ExperimentData
|
||||||
m_server_runid = ctrlmsg.run_id();
|
ctrlmsg.set_command(FailControlMessage::NEED_WORK);
|
||||||
|
ctrlmsg.set_build_id(42);
|
||||||
|
ctrlmsg.set_run_id(m_server_runid);
|
||||||
|
ctrlmsg.set_job_size(m_job_throughput); //Request for a number of jobs
|
||||||
|
|
||||||
switch (ctrlmsg.command()) {
|
if (!SocketComm::sendMsg(m_sockfd, ctrlmsg)) {
|
||||||
case FailControlMessage::WORK_FOLLOWS:
|
// Failed to send message? Retry.
|
||||||
if (!SocketComm::rcvMsg(m_sockfd, exp.getMessage())) {
|
close(m_sockfd);
|
||||||
|
return FailControlMessage::COME_AGAIN;
|
||||||
|
}
|
||||||
|
ctrlmsg.Clear();
|
||||||
|
if (!SocketComm::rcvMsg(m_sockfd, ctrlmsg)) {
|
||||||
// Failed to receive message? Retry.
|
// Failed to receive message? Retry.
|
||||||
close(m_sockfd);
|
close(m_sockfd);
|
||||||
return FailControlMessage::COME_AGAIN;
|
return FailControlMessage::COME_AGAIN;
|
||||||
}
|
}
|
||||||
exp.setWorkloadID(ctrlmsg.workloadid()); // Store workload id of experiment data
|
|
||||||
break;
|
// now we know the current run ID
|
||||||
case FailControlMessage::COME_AGAIN:
|
m_server_runid = ctrlmsg.run_id();
|
||||||
break;
|
|
||||||
default:
|
switch (ctrlmsg.command()) {
|
||||||
break;
|
case FailControlMessage::WORK_FOLLOWS:
|
||||||
|
uint32_t i;
|
||||||
|
for (i = 0 ; i < ctrlmsg.job_size() ; i++) {
|
||||||
|
ExperimentData* temp_exp = new ExperimentData(exp.getMessage().New());
|
||||||
|
|
||||||
|
if (!SocketComm::rcvMsg(m_sockfd, temp_exp->getMessage())) {
|
||||||
|
// Failed to receive message? Retry.
|
||||||
|
close(m_sockfd);
|
||||||
|
return FailControlMessage::COME_AGAIN;
|
||||||
|
}
|
||||||
|
|
||||||
|
temp_exp->setWorkloadID(ctrlmsg.workloadid(i)); //Store workload id of experiment data
|
||||||
|
m_parameters.push_back(temp_exp);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case FailControlMessage::COME_AGAIN:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
close(m_sockfd);
|
||||||
|
//Take front from m_parameters and copy to exp.
|
||||||
|
exp.getMessage().CopyFrom(m_parameters.front()->getMessage());
|
||||||
|
exp.setWorkloadID(m_parameters.front()->getWorkloadID());
|
||||||
|
//Delete front element of m_parameters
|
||||||
|
delete &m_parameters.front()->getMessage();
|
||||||
|
delete m_parameters.front();
|
||||||
|
m_parameters.erase(m_parameters.begin());
|
||||||
|
//start time measurement for throughput calculation
|
||||||
|
m_job_runtime.startTimer();
|
||||||
|
|
||||||
|
return ctrlmsg.command();
|
||||||
}
|
}
|
||||||
close(m_sockfd);
|
|
||||||
return ctrlmsg.command();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool JobClient::sendResult(ExperimentData& result)
|
bool JobClient::sendResult(ExperimentData& result)
|
||||||
{
|
{
|
||||||
if (!connectToServer())
|
//Create new ExperimentData for result
|
||||||
return false;
|
ExperimentData* temp_exp = new ExperimentData(result.getMessage().New());
|
||||||
|
temp_exp->getMessage().CopyFrom(result.getMessage());
|
||||||
|
temp_exp->setWorkloadID(result.getWorkloadID());
|
||||||
|
|
||||||
// Send back results
|
if (m_parameters.size() != 0) {
|
||||||
FailControlMessage ctrlmsg;
|
//If there are more jobs for the experiment store result
|
||||||
ctrlmsg.set_command(FailControlMessage::RESULT_FOLLOWS);
|
m_results.push_back( temp_exp );
|
||||||
ctrlmsg.set_build_id(42);
|
|
||||||
ctrlmsg.set_run_id(m_server_runid);
|
|
||||||
ctrlmsg.set_workloadid(result.getWorkloadID());
|
|
||||||
cout << "[Client] Sending back result [" << std::dec << result.getWorkloadID() << "]..." << endl;
|
|
||||||
// TODO: Log-level?
|
|
||||||
SocketComm::sendMsg(m_sockfd, ctrlmsg);
|
|
||||||
SocketComm::sendMsg(m_sockfd, result.getMessage());
|
|
||||||
|
|
||||||
// Close connection.
|
return true;
|
||||||
close(m_sockfd);
|
} else {
|
||||||
return true;
|
m_results.push_back( temp_exp );
|
||||||
|
|
||||||
|
//Stop time measurement and calculate new throughput
|
||||||
|
m_job_runtime.stopTimer();
|
||||||
|
m_job_throughput = CLIENT_JOB_REQUEST_SEC/((double)m_job_runtime/m_results.size());
|
||||||
|
|
||||||
|
if (m_job_throughput > CLIENT_JOB_LIMIT_SEC)
|
||||||
|
m_job_throughput = CLIENT_JOB_LIMIT_SEC;
|
||||||
|
|
||||||
|
if (m_job_throughput < 1)
|
||||||
|
m_job_throughput = 1;
|
||||||
|
|
||||||
|
//Reset timer for new time measurement
|
||||||
|
m_job_runtime.reset();
|
||||||
|
|
||||||
|
if (!connectToServer())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
//Send back results
|
||||||
|
FailControlMessage ctrlmsg;
|
||||||
|
ctrlmsg.set_command(FailControlMessage::RESULT_FOLLOWS);
|
||||||
|
ctrlmsg.set_build_id(42);
|
||||||
|
ctrlmsg.set_run_id(m_server_runid);
|
||||||
|
ctrlmsg.set_job_size(m_results.size()); //Store how many results will be sent
|
||||||
|
|
||||||
|
cout << "[Client] Sending back result [";
|
||||||
|
|
||||||
|
uint32_t i;
|
||||||
|
for (i = 0; i < m_results.size() ; i++) {
|
||||||
|
ctrlmsg.add_workloadid(m_results[i]->getWorkloadID());
|
||||||
|
cout << std::dec << m_results[i]->getWorkloadID();
|
||||||
|
cout << " ";
|
||||||
|
}
|
||||||
|
cout << "]";
|
||||||
|
|
||||||
|
// TODO: Log-level?
|
||||||
|
SocketComm::sendMsg(m_sockfd, ctrlmsg);
|
||||||
|
|
||||||
|
for (i = 0; i < ctrlmsg.job_size() ; i++) {
|
||||||
|
SocketComm::sendMsg(m_sockfd, m_results.front()->getMessage());
|
||||||
|
delete &m_results.front()->getMessage();
|
||||||
|
delete m_results.front();
|
||||||
|
m_results.erase(m_results.begin());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close connection.
|
||||||
|
close(m_sockfd);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // end-of-namespace: fail
|
} // end-of-namespace: fail
|
||||||
|
|||||||
@ -11,6 +11,7 @@
|
|||||||
#include "comm/ExperimentData.hpp"
|
#include "comm/ExperimentData.hpp"
|
||||||
#include "comm/FailControlMessage.pb.h"
|
#include "comm/FailControlMessage.pb.h"
|
||||||
#include "config/FailConfig.hpp"
|
#include "config/FailConfig.hpp"
|
||||||
|
#include "util/WallclockTimer.hpp"
|
||||||
|
|
||||||
namespace fail {
|
namespace fail {
|
||||||
|
|
||||||
@ -28,6 +29,11 @@ private:
|
|||||||
int m_sockfd;
|
int m_sockfd;
|
||||||
uint64_t m_server_runid;
|
uint64_t m_server_runid;
|
||||||
|
|
||||||
|
WallclockTimer m_job_runtime;
|
||||||
|
int m_job_throughput;
|
||||||
|
std::vector<ExperimentData*> m_parameters;
|
||||||
|
std::vector<ExperimentData*> m_results;
|
||||||
|
|
||||||
bool connectToServer();
|
bool connectToServer();
|
||||||
|
|
||||||
FailControlMessage_Command tryToGetExperimentData(ExperimentData& exp);
|
FailControlMessage_Command tryToGetExperimentData(ExperimentData& exp);
|
||||||
|
|||||||
Reference in New Issue
Block a user