shm: allow monitor::ResetContent to cleanup after a crash

This commit is contained in:
Alexey Rybalchenko 2022-02-01 18:55:09 +01:00
parent ddc7c834db
commit b264727179
7 changed files with 108 additions and 37 deletions

View File

@ -133,6 +133,7 @@ struct RegionConfig
bool removeOnDestruction = true; /// remove the region on object destruction bool removeOnDestruction = true; /// remove the region on object destruction
int creationFlags = 0; /// flags passed to the underlying transport on region creation int creationFlags = 0; /// flags passed to the underlying transport on region creation
int64_t userFlags = 0; /// custom flags that have no effect on the transport, but can be retrieved from the region by the user int64_t userFlags = 0; /// custom flags that have no effect on the transport, but can be retrieved from the region by the user
uint64_t size = 0; /// region size
std::string path = ""; /// file path, if the region is backed by a file std::string path = ""; /// file path, if the region is backed by a file
std::optional<uint16_t> id = std::nullopt; /// region id std::optional<uint16_t> id = std::nullopt; /// region id
uint32_t linger = 100; /// delay in ms before region destruction to collect outstanding events uint32_t linger = 100; /// delay in ms before region destruction to collect outstanding events

View File

@ -28,6 +28,8 @@
namespace fair::mq::shmem namespace fair::mq::shmem
{ {
static constexpr uint64_t kManagementSegmentSize = 6553600;
struct SharedMemoryError : std::runtime_error { using std::runtime_error::runtime_error; }; struct SharedMemoryError : std::runtime_error { using std::runtime_error::runtime_error; };
using SimpleSeqFitSegment = boost::interprocess::basic_managed_shared_memory<char, using SimpleSeqFitSegment = boost::interprocess::basic_managed_shared_memory<char,

View File

@ -132,7 +132,7 @@ class Manager
: fShmId64(config ? config->GetProperty<uint64_t>("shmid", makeShmIdUint64(sessionName)) : makeShmIdUint64(sessionName)) : fShmId64(config ? config->GetProperty<uint64_t>("shmid", makeShmIdUint64(sessionName)) : makeShmIdUint64(sessionName))
, fShmId(makeShmIdStr(fShmId64)) , fShmId(makeShmIdStr(fShmId64))
, fSegmentId(config ? config->GetProperty<uint16_t>("shm-segment-id", 0) : 0) , fSegmentId(config ? config->GetProperty<uint16_t>("shm-segment-id", 0) : 0)
, fManagementSegment(boost::interprocess::open_or_create, std::string("fmq_" + fShmId + "_mng").c_str(), 6553600) , fManagementSegment(boost::interprocess::open_or_create, std::string("fmq_" + fShmId + "_mng").c_str(), kManagementSegmentSize)
, fShmVoidAlloc(fManagementSegment.get_segment_manager()) , fShmVoidAlloc(fManagementSegment.get_segment_manager())
, fShmMtx(fManagementSegment.find_or_construct<boost::interprocess::interprocess_mutex>(boost::interprocess::unique_instance)()) , fShmMtx(fManagementSegment.find_or_construct<boost::interprocess::interprocess_mutex>(boost::interprocess::unique_instance)())
, fNumObservedEvents(0) , fNumObservedEvents(0)

View File

@ -6,9 +6,10 @@
* copied verbatim in the file "LICENSE" * * copied verbatim in the file "LICENSE" *
********************************************************************************/ ********************************************************************************/
#include "Monitor.h"
#include "Common.h" #include "Common.h"
#include "UnmanagedRegion.h" #include "Monitor.h"
#include "Segment.h"
#include <fairmq/shmem/UnmanagedRegion.h>
#include <fairmq/tools/IO.h> #include <fairmq/tools/IO.h>
#include <fairmq/tools/Strings.h> #include <fairmq/tools/Strings.h>
@ -415,24 +416,28 @@ void Monitor::PrintDebugInfo(const ShmId& shmId __attribute__((unused)))
size_t numMessages = 0; size_t numMessages = 0;
for (const auto& e : *debug) { if (debug) {
numMessages += e.second.size(); for (const auto& e : *debug) {
} numMessages += e.second.size();
LOG(info) << endl << "found " << numMessages << " messages.";
for (const auto& s : *debug) {
for (const auto& e : s.second) {
using time_point = chrono::system_clock::time_point;
time_point tmpt{chrono::duration_cast<time_point::duration>(chrono::nanoseconds(e.second.fCreationTime))};
time_t t = chrono::system_clock::to_time_t(tmpt);
uint64_t ms = e.second.fCreationTime % 1000000;
auto tm = localtime(&t);
LOG(info) << "segment: " << setw(3) << setfill(' ') << s.first
<< ", offset: " << setw(12) << setfill(' ') << e.first
<< ", size: " << setw(10) << setfill(' ') << e.second.fSize
<< ", creator PID: " << e.second.fPid << setfill('0')
<< ", at: " << setw(2) << tm->tm_hour << ":" << setw(2) << tm->tm_min << ":" << setw(2) << tm->tm_sec << "." << setw(6) << ms;
} }
LOG(info) << endl << "found " << numMessages << " messages.";
for (const auto& s : *debug) {
for (const auto& e : s.second) {
using time_point = chrono::system_clock::time_point;
time_point tmpt{chrono::duration_cast<time_point::duration>(chrono::nanoseconds(e.second.fCreationTime))};
time_t t = chrono::system_clock::to_time_t(tmpt);
uint64_t ms = e.second.fCreationTime % 1000000;
auto tm = localtime(&t);
LOG(info) << "segment: " << setw(3) << setfill(' ') << s.first
<< ", offset: " << setw(12) << setfill(' ') << e.first
<< ", size: " << setw(10) << setfill(' ') << e.second.fSize
<< ", creator PID: " << e.second.fPid << setfill('0')
<< ", at: " << setw(2) << tm->tm_hour << ":" << setw(2) << tm->tm_min << ":" << setw(2) << tm->tm_sec << "." << setw(6) << ms;
}
}
} else {
LOG(info) << "no debug data found";
} }
} catch (bie&) { } catch (bie&) {
LOG(info) << "no segments found"; LOG(info) << "no segments found";
@ -463,11 +468,16 @@ unordered_map<uint16_t, std::vector<BufferDebugInfo>> Monitor::GetDebugInfo(cons
result.reserve(debug->size()); result.reserve(debug->size());
for (const auto& s : *debug) {
result[s.first].reserve(s.second.size()); if (debug) {
for (const auto& e : s.second) { for (const auto& s : *debug) {
result[s.first][e.first] = BufferDebugInfo(e.first, e.second.fPid, e.second.fSize, e.second.fCreationTime); result[s.first].reserve(s.second.size());
for (const auto& e : s.second) {
result[s.first][e.first] = BufferDebugInfo(e.first, e.second.fPid, e.second.fSize, e.second.fCreationTime);
}
} }
} else {
LOG(info) << "no debug data found";
} }
} catch (bie&) { } catch (bie&) {
LOG(info) << "no segments found"; LOG(info) << "no segments found";
@ -701,6 +711,43 @@ void Monitor::ResetContent(const SessionId& sessionId, bool verbose /* = true */
ResetContent(shmId, verbose); ResetContent(shmId, verbose);
} }
void Monitor::ResetContent(const ShmId& shmIdT, const std::vector<SegmentConfig>& segmentCfgs, const std::vector<RegionConfig>& regionCfgs, bool verbose /* = true */)
{
using namespace boost::interprocess;
std::string shmId = shmIdT.shmId;
std::string managementSegmentName("fmq_" + shmId + "_mng");
// reset managed segments
ResetContent(shmIdT, verbose);
// delete management segment
Remove<bipc::shared_memory_object>(managementSegmentName, verbose);
// recreate management segment
managed_shared_memory mngSegment(create_only, managementSegmentName.c_str(), kManagementSegmentSize);
// fill management segment with segment & region infos
for (const auto& s : segmentCfgs) {
if (s.allocationAlgorithm == "rbtree_best_fit") {
Segment::Register(shmId, s.id, AllocationAlgorithm::rbtree_best_fit);
} else if (s.allocationAlgorithm == "simple_seq_fit") {
Segment::Register(shmId, s.id, AllocationAlgorithm::simple_seq_fit);
} else {
LOG(error) << "Unknown allocation algorithm provided: " << s.allocationAlgorithm;
throw MonitorError("Unknown allocation algorithm provided: " + s.allocationAlgorithm);
}
}
for (const auto& r : regionCfgs) {
fair::mq::shmem::UnmanagedRegion::Register(shmId, r);
}
}
void Monitor::ResetContent(const SessionId& sessionId, const std::vector<SegmentConfig>& segmentCfgs, const std::vector<RegionConfig>& regionCfgs, bool verbose /* = true */)
{
ShmId shmId{makeShmIdStr(sessionId.sessionId)};
if (verbose) {
cout << "ResetContent called with session id '" << sessionId.sessionId << "', translating to shared memory id '" << shmId.shmId << "'" << endl;
}
ResetContent(shmId, segmentCfgs, regionCfgs, verbose);
}
Monitor::~Monitor() Monitor::~Monitor()
{ {
if (fSignalThread.joinable()) { if (fSignalThread.joinable()) {

View File

@ -8,6 +8,8 @@
#ifndef FAIR_MQ_SHMEM_MONITOR_H_ #ifndef FAIR_MQ_SHMEM_MONITOR_H_
#define FAIR_MQ_SHMEM_MONITOR_H_ #define FAIR_MQ_SHMEM_MONITOR_H_
#include <fairmq/UnmanagedRegion.h>
#include <fairlogger/Logger.h> #include <fairlogger/Logger.h>
#include <thread> #include <thread>
@ -49,6 +51,13 @@ struct BufferDebugInfo
uint64_t fCreationTime; uint64_t fCreationTime;
}; };
struct SegmentConfig
{
uint16_t id;
uint64_t size;
std::string allocationAlgorithm;
};
class Monitor class Monitor
{ {
public: public:
@ -88,6 +97,14 @@ class Monitor
/// @param sessionId session id /// @param sessionId session id
/// Only call this when segment is not in use /// Only call this when segment is not in use
static void ResetContent(const SessionId& sessionId, bool verbose = true); static void ResetContent(const SessionId& sessionId, bool verbose = true);
/// @brief [EXPERIMENTAL] cleanup the content of the shem segment, without recreating it
/// @param shmId shared memory id
/// Only call this when segment is not in use
static void ResetContent(const ShmId& shmId, const std::vector<SegmentConfig>& segmentCfgs, const std::vector<RegionConfig>& regionCfgs, bool verbose = true);
/// @brief [EXPERIMENTAL] cleanup the content of the shem segment, without recreating it
/// @param sessionId session id
/// Only call this when segment is not in use
static void ResetContent(const SessionId& sessionId, const std::vector<SegmentConfig>& segmentCfgs, const std::vector<RegionConfig>& regionCfgs, bool verbose = true);
/// @brief Outputs list of messages in shmem (if compiled with FAIRMQ_DEBUG_MODE=ON) /// @brief Outputs list of messages in shmem (if compiled with FAIRMQ_DEBUG_MODE=ON)
/// @param shmId shmem id /// @param shmId shmem id

View File

@ -26,6 +26,8 @@ static const RBTreeBestFit rbTreeBestFit = RBTreeBestFit();
struct Segment struct Segment
{ {
friend class Monitor;
Segment(const std::string& shmId, uint16_t id, size_t size, SimpleSeqFit) Segment(const std::string& shmId, uint16_t id, size_t size, SimpleSeqFit)
: fSegment(SimpleSeqFitSegment(boost::interprocess::open_or_create, : fSegment(SimpleSeqFitSegment(boost::interprocess::open_or_create,
std::string("fmq_" + shmId + "_m_" + std::to_string(id)).c_str(), std::string("fmq_" + shmId + "_m_" + std::to_string(id)).c_str(),
@ -66,15 +68,12 @@ struct Segment
static void Register(const std::string& shmId, uint16_t id, AllocationAlgorithm allocAlgo) static void Register(const std::string& shmId, uint16_t id, AllocationAlgorithm allocAlgo)
{ {
using namespace boost::interprocess; using namespace boost::interprocess;
managed_shared_memory mngSegment(open_or_create, std::string("fmq_" + shmId + "_mng").c_str(), 6553600); managed_shared_memory mngSegment(open_or_create, std::string("fmq_" + shmId + "_mng").c_str(), kManagementSegmentSize);
VoidAlloc alloc(mngSegment.get_segment_manager()); VoidAlloc alloc(mngSegment.get_segment_manager());
Uint16SegmentInfoHashMap* shmSegments = mngSegment.find_or_construct<Uint16SegmentInfoHashMap>(unique_instance)(alloc); Uint16SegmentInfoHashMap* shmSegments = mngSegment.find_or_construct<Uint16SegmentInfoHashMap>(unique_instance)(alloc);
EventCounter* eventCounter = mngSegment.find<EventCounter>(unique_instance).first; EventCounter* eventCounter = mngSegment.find_or_construct<EventCounter>(unique_instance)(0);
if (!eventCounter) {
eventCounter = mngSegment.construct<EventCounter>(unique_instance)(0);
}
bool newSegmentRegistered = shmSegments->emplace(id, allocAlgo).second; bool newSegmentRegistered = shmSegments->emplace(id, allocAlgo).second;
if (newSegmentRegistered) { if (newSegmentRegistered) {

View File

@ -41,6 +41,7 @@ struct UnmanagedRegion
{ {
friend class Message; friend class Message;
friend class Manager; friend class Manager;
friend class Monitor;
UnmanagedRegion(const std::string& shmId, uint16_t id, uint64_t size) UnmanagedRegion(const std::string& shmId, uint16_t id, uint64_t size)
: UnmanagedRegion(shmId, size, false, makeRegionConfig(id)) : UnmanagedRegion(shmId, size, false, makeRegionConfig(id))
@ -50,6 +51,10 @@ struct UnmanagedRegion
: UnmanagedRegion(shmId, size, false, std::move(cfg)) : UnmanagedRegion(shmId, size, false, std::move(cfg))
{} {}
UnmanagedRegion(const std::string& shmId, RegionConfig cfg)
: UnmanagedRegion(shmId, cfg.size, false, std::move(cfg))
{}
UnmanagedRegion(const std::string& shmId, uint64_t size, bool remote, RegionConfig cfg) UnmanagedRegion(const std::string& shmId, uint64_t size, bool remote, RegionConfig cfg)
: fRemote(remote) : fRemote(remote)
, fRemoveOnDestruction(cfg.removeOnDestruction) , fRemoveOnDestruction(cfg.removeOnDestruction)
@ -66,6 +71,9 @@ struct UnmanagedRegion
{ {
using namespace boost::interprocess; using namespace boost::interprocess;
// TODO: refactor this
cfg.size = size;
if (!cfg.path.empty()) { if (!cfg.path.empty()) {
fName = std::string(cfg.path + fName); fName = std::string(cfg.path + fName);
@ -119,7 +127,7 @@ struct UnmanagedRegion
} }
if (!remote) { if (!remote) {
Register(shmId, cfg, size); Register(shmId, cfg);
} }
LOG(trace) << "shmem: initialized region: " << fName << " (" << (remote ? "remote" : "local") << ")"; LOG(trace) << "shmem: initialized region: " << fName << " (" << (remote ? "remote" : "local") << ")";
@ -223,20 +231,17 @@ struct UnmanagedRegion
return regionCfg; return regionCfg;
} }
static void Register(const std::string& shmId, RegionConfig& cfg, uint64_t size) static void Register(const std::string& shmId, const RegionConfig& cfg)
{ {
using namespace boost::interprocess; using namespace boost::interprocess;
managed_shared_memory mngSegment(open_or_create, std::string("fmq_" + shmId + "_mng").c_str(), 6553600); managed_shared_memory mngSegment(open_or_create, std::string("fmq_" + shmId + "_mng").c_str(), kManagementSegmentSize);
VoidAlloc alloc(mngSegment.get_segment_manager()); VoidAlloc alloc(mngSegment.get_segment_manager());
Uint16RegionInfoHashMap* shmRegions = mngSegment.find_or_construct<Uint16RegionInfoHashMap>(unique_instance)(alloc); Uint16RegionInfoHashMap* shmRegions = mngSegment.find_or_construct<Uint16RegionInfoHashMap>(unique_instance)(alloc);
EventCounter* eventCounter = mngSegment.find<EventCounter>(unique_instance).first; EventCounter* eventCounter = mngSegment.find_or_construct<EventCounter>(unique_instance)(0);
if (!eventCounter) {
eventCounter = mngSegment.construct<EventCounter>(unique_instance)(0);
}
bool newShmRegionCreated = shmRegions->emplace(cfg.id.value(), RegionInfo(cfg.path.c_str(), cfg.creationFlags, cfg.userFlags, size, alloc)).second; bool newShmRegionCreated = shmRegions->emplace(cfg.id.value(), RegionInfo(cfg.path.c_str(), cfg.creationFlags, cfg.userFlags, cfg.size, alloc)).second;
if (newShmRegionCreated) { if (newShmRegionCreated) {
(eventCounter->fCount)++; (eventCounter->fCount)++;
} }