mirror of
https://github.com/FairRootGroup/FairMQ.git
synced 2025-10-13 16:46:47 +00:00
shm: optimize monitor heartbeats
This commit is contained in:
parent
ab54668aee
commit
28a887a457
|
@ -124,6 +124,15 @@ struct EventCounter
|
||||||
std::atomic<uint64_t> fCount;
|
std::atomic<uint64_t> fCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Heartbeat counter shared between processes.
//
// In this change a single instance is created in the management segment via
// find_or_construct<Heartbeat>(unique_instance)(0), incremented by the
// Manager's heartbeat thread, and polled by the monitor, which treats any
// change of the value as a sign of life.
//
// NOTE(review): the object is accessed from several processes, so this relies
// on std::atomic<uint64_t> being lock-free on the target platform - confirm.
struct Heartbeat
{
    // explicit: a bare integer must not silently convert into a Heartbeat.
    // c is the initial value of the counter.
    explicit Heartbeat(uint64_t c)
        : fCount(c)
    {}

    // Monotonically incremented counter; only changes matter, not the value.
    std::atomic<uint64_t> fCount;
};
|
||||||
|
|
||||||
struct RegionCounter
|
struct RegionCounter
|
||||||
{
|
{
|
||||||
RegionCounter(uint16_t c)
|
RegionCounter(uint16_t c)
|
||||||
|
|
|
@ -80,7 +80,7 @@ class Manager
|
||||||
, fMsgCounterNew(0)
|
, fMsgCounterNew(0)
|
||||||
, fMsgCounterDelete(0)
|
, fMsgCounterDelete(0)
|
||||||
#endif
|
#endif
|
||||||
, fSendHeartbeats(true)
|
, fBeatTheHeart(true)
|
||||||
, fThrowOnBadAlloc(config ? config->GetProperty<bool>("shm-throw-bad-alloc", true) : true)
|
, fThrowOnBadAlloc(config ? config->GetProperty<bool>("shm-throw-bad-alloc", true) : true)
|
||||||
, fNoCleanup(config ? config->GetProperty<bool>("shm-no-cleanup", false) : false)
|
, fNoCleanup(config ? config->GetProperty<bool>("shm-no-cleanup", false) : false)
|
||||||
{
|
{
|
||||||
|
@ -106,7 +106,7 @@ class Manager
|
||||||
StartMonitor(fShmId);
|
StartMonitor(fShmId);
|
||||||
}
|
}
|
||||||
|
|
||||||
fHeartbeatThread = std::thread(&Manager::SendHeartbeats, this);
|
fHeartbeatThread = std::thread(&Manager::Heartbeats, this);
|
||||||
|
|
||||||
{
|
{
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
|
@ -544,23 +544,15 @@ class Manager
|
||||||
void DecrementShmMsgCounter(uint16_t segmentId) { --((*fShmMsgCounters)[segmentId].fCount); }
|
void DecrementShmMsgCounter(uint16_t segmentId) { --((*fShmMsgCounters)[segmentId].fCount); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void SendHeartbeats()
|
void Heartbeats()
|
||||||
{
|
{
|
||||||
std::string controlQueueName("fmq_" + fShmId + "_cq");
|
using namespace boost::interprocess;
|
||||||
|
|
||||||
|
Heartbeat* hb = fManagementSegment.find_or_construct<Heartbeat>(unique_instance)(0);
|
||||||
std::unique_lock<std::mutex> lock(fHeartbeatsMtx);
|
std::unique_lock<std::mutex> lock(fHeartbeatsMtx);
|
||||||
while (fSendHeartbeats) {
|
while (fBeatTheHeart) {
|
||||||
try {
|
(hb->fCount)++;
|
||||||
boost::interprocess::message_queue mq(boost::interprocess::open_only, controlQueueName.c_str());
|
fHeartbeatsCV.wait_for(lock, std::chrono::milliseconds(100), [&]() { return !fBeatTheHeart; });
|
||||||
boost::posix_time::ptime sndTill = boost::posix_time::microsec_clock::universal_time() + boost::posix_time::milliseconds(100);
|
|
||||||
if (mq.timed_send(fDeviceId.c_str(), fDeviceId.size(), 0, sndTill)) {
|
|
||||||
fHeartbeatsCV.wait_for(lock, std::chrono::milliseconds(100), [&]() { return !fSendHeartbeats; });
|
|
||||||
} else {
|
|
||||||
LOG(debug) << "control queue timeout";
|
|
||||||
}
|
|
||||||
} catch (boost::interprocess::interprocess_exception& ie) {
|
|
||||||
fHeartbeatsCV.wait_for(lock, std::chrono::milliseconds(500), [&]() { return !fSendHeartbeats; });
|
|
||||||
// LOG(debug) << "no " << controlQueueName << " found";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -678,7 +670,7 @@ class Manager
|
||||||
|
|
||||||
{
|
{
|
||||||
std::unique_lock<std::mutex> lock(fHeartbeatsMtx);
|
std::unique_lock<std::mutex> lock(fHeartbeatsMtx);
|
||||||
fSendHeartbeats = false;
|
fBeatTheHeart = false;
|
||||||
}
|
}
|
||||||
fHeartbeatsCV.notify_one();
|
fHeartbeatsCV.notify_one();
|
||||||
if (fHeartbeatThread.joinable()) {
|
if (fHeartbeatThread.joinable()) {
|
||||||
|
@ -744,14 +736,12 @@ class Manager
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::thread fHeartbeatThread;
|
std::thread fHeartbeatThread;
|
||||||
bool fSendHeartbeats;
|
bool fBeatTheHeart;
|
||||||
std::mutex fHeartbeatsMtx;
|
std::mutex fHeartbeatsMtx;
|
||||||
std::condition_variable fHeartbeatsCV;
|
std::condition_variable fHeartbeatsCV;
|
||||||
|
|
||||||
bool fThrowOnBadAlloc;
|
bool fThrowOnBadAlloc;
|
||||||
bool fNoCleanup;
|
bool fNoCleanup;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace fair::mq::shmem
|
} // namespace fair::mq::shmem
|
||||||
|
|
|
@ -87,7 +87,6 @@ Monitor::Monitor(string shmId, bool selfDestruct, bool interactive, bool viewOnl
|
||||||
, fTimeoutInMS(timeoutInMS)
|
, fTimeoutInMS(timeoutInMS)
|
||||||
, fIntervalInMS(intervalInMS)
|
, fIntervalInMS(intervalInMS)
|
||||||
, fShmId(std::move(shmId))
|
, fShmId(std::move(shmId))
|
||||||
, fControlQueueName("fmq_" + fShmId + "_cq")
|
|
||||||
, fTerminating(false)
|
, fTerminating(false)
|
||||||
, fHeartbeatTriggered(false)
|
, fHeartbeatTriggered(false)
|
||||||
, fLastHeartbeat(chrono::high_resolution_clock::now())
|
, fLastHeartbeat(chrono::high_resolution_clock::now())
|
||||||
|
@ -132,8 +131,7 @@ void Monitor::Run()
|
||||||
{
|
{
|
||||||
thread heartbeatThread;
|
thread heartbeatThread;
|
||||||
if (!fViewOnly) {
|
if (!fViewOnly) {
|
||||||
RemoveQueue(fControlQueueName);
|
heartbeatThread = thread(&Monitor::CheckHeartbeats, this);
|
||||||
heartbeatThread = thread(&Monitor::ReceiveHeartbeats, this);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fInteractive) {
|
if (fInteractive) {
|
||||||
|
@ -158,7 +156,7 @@ void Monitor::Watch()
|
||||||
fSeenOnce = true;
|
fSeenOnce = true;
|
||||||
|
|
||||||
auto now = chrono::high_resolution_clock::now();
|
auto now = chrono::high_resolution_clock::now();
|
||||||
unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat).count();
|
unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat.load()).count();
|
||||||
|
|
||||||
if (fHeartbeatTriggered && duration > fTimeoutInMS) {
|
if (fHeartbeatTriggered && duration > fTimeoutInMS) {
|
||||||
// memory is present, but no heartbeats since timeout duration
|
// memory is present, but no heartbeats since timeout duration
|
||||||
|
@ -181,7 +179,7 @@ void Monitor::Watch()
|
||||||
} else {
|
} else {
|
||||||
// if self-destruct is requested, and no segment has ever been observed, quit after double timeout duration
|
// if self-destruct is requested, and no segment has ever been observed, quit after double timeout duration
|
||||||
auto now = chrono::high_resolution_clock::now();
|
auto now = chrono::high_resolution_clock::now();
|
||||||
unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat).count();
|
unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat.load()).count();
|
||||||
|
|
||||||
if (duration > fTimeoutInMS * 2) {
|
if (duration > fTimeoutInMS * 2) {
|
||||||
Cleanup(ShmId{fShmId});
|
Cleanup(ShmId{fShmId});
|
||||||
|
@ -305,31 +303,30 @@ void Monitor::ListAll(const std::string& path)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Monitor::ReceiveHeartbeats()
|
void Monitor::CheckHeartbeats()
|
||||||
{
|
{
|
||||||
try {
|
using namespace boost::interprocess;
|
||||||
bipc::message_queue mq(bipc::open_or_create, fControlQueueName.c_str(), 1000, 256);
|
|
||||||
|
|
||||||
unsigned int priority = 0;
|
uint64_t localHb = 0;
|
||||||
bipc::message_queue::size_type recvdSize = 0;
|
|
||||||
char msg[256] = {0};
|
|
||||||
|
|
||||||
while (!fTerminating) {
|
while (!fTerminating) {
|
||||||
bpt::ptime rcvTill = bpt::microsec_clock::universal_time() + bpt::milliseconds(100);
|
std::this_thread::sleep_for(std::chrono::milliseconds(200));
|
||||||
if (mq.timed_receive(&msg, sizeof(msg), recvdSize, priority, rcvTill)) {
|
try {
|
||||||
fHeartbeatTriggered = true;
|
managed_shared_memory managementSegment(open_read_only, std::string("fmq_" + fShmId + "_mng").c_str());
|
||||||
fLastHeartbeat = chrono::high_resolution_clock::now();
|
Heartbeat* hb = managementSegment.find<Heartbeat>(unique_instance).first;
|
||||||
string deviceId(msg, recvdSize);
|
|
||||||
fDeviceHeartbeats[deviceId] = fLastHeartbeat;
|
|
||||||
} else {
|
|
||||||
// LOG(info) << "control queue timeout";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (bie& ie) {
|
|
||||||
LOG(info) << ie.what();
|
|
||||||
}
|
|
||||||
|
|
||||||
RemoveQueue(fControlQueueName);
|
if (hb) {
|
||||||
|
uint64_t globalHb = hb->fCount;
|
||||||
|
if (localHb != globalHb) {
|
||||||
|
fHeartbeatTriggered = true;
|
||||||
|
fLastHeartbeat.store(chrono::high_resolution_clock::now());
|
||||||
|
localHb = globalHb;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (bie&) {
|
||||||
|
// management segment not found, simply retry.
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Monitor::Interactive()
|
void Monitor::Interactive()
|
||||||
|
@ -629,7 +626,6 @@ std::vector<std::pair<std::string, bool>> Monitor::CleanupFull(const ShmId& shmI
|
||||||
{
|
{
|
||||||
auto result = Cleanup(shmId, verbose);
|
auto result = Cleanup(shmId, verbose);
|
||||||
result.emplace_back(RunRemoval(Monitor::RemoveMutex, "fmq_" + shmId.shmId + "_ms", verbose));
|
result.emplace_back(RunRemoval(Monitor::RemoveMutex, "fmq_" + shmId.shmId + "_ms", verbose));
|
||||||
result.emplace_back(RunRemoval(Monitor::RemoveQueue, "fmq_" + shmId.shmId + "_cq", verbose));
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -117,7 +117,7 @@ class Monitor
|
||||||
private:
|
private:
|
||||||
void PrintHelp();
|
void PrintHelp();
|
||||||
void Watch();
|
void Watch();
|
||||||
void ReceiveHeartbeats();
|
void CheckHeartbeats();
|
||||||
void CheckSegment();
|
void CheckSegment();
|
||||||
void Interactive();
|
void Interactive();
|
||||||
void SignalMonitor();
|
void SignalMonitor();
|
||||||
|
@ -131,12 +131,10 @@ class Monitor
|
||||||
unsigned int fTimeoutInMS;
|
unsigned int fTimeoutInMS;
|
||||||
unsigned int fIntervalInMS;
|
unsigned int fIntervalInMS;
|
||||||
std::string fShmId;
|
std::string fShmId;
|
||||||
std::string fControlQueueName;
|
|
||||||
std::atomic<bool> fTerminating;
|
std::atomic<bool> fTerminating;
|
||||||
std::atomic<bool> fHeartbeatTriggered;
|
std::atomic<bool> fHeartbeatTriggered;
|
||||||
std::chrono::high_resolution_clock::time_point fLastHeartbeat;
|
std::atomic<std::chrono::high_resolution_clock::time_point> fLastHeartbeat;
|
||||||
std::thread fSignalThread;
|
std::thread fSignalThread;
|
||||||
std::unordered_map<std::string, std::chrono::high_resolution_clock::time_point> fDeviceHeartbeats;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace fair::mq::shmem
|
} // namespace fair::mq::shmem
|
||||||
|
|
|
@ -19,7 +19,6 @@ FairMQ Shared Memory currently uses the following names to register shared memor
|
||||||
| `fmq_<shmId>_rg_<index>` | unmanaged region(s) | one of the devices | devices with unmanaged regions |
|
| `fmq_<shmId>_rg_<index>` | unmanaged region(s) | one of the devices | devices with unmanaged regions |
|
||||||
| `fmq_<shmId>_rgq_<index>` | unmanaged region queue(s) | one of the devices | devices with unmanaged regions |
|
| `fmq_<shmId>_rgq_<index>` | unmanaged region queue(s) | one of the devices | devices with unmanaged regions |
|
||||||
| `fmq_<shmId>_ms` | shmmonitor status | shmmonitor | devices, shmmonitor |
|
| `fmq_<shmId>_ms` | shmmonitor status | shmmonitor | devices, shmmonitor |
|
||||||
| `fmq_<shmId>_cq` | message queue between transport and shmmonitor | shmmonitor | devices, shmmonitor |
|
|
||||||
|
|
||||||
The shmId is generated out of session id and user id.
|
The shmId is generated out of session id and user id.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user