From 29da090bf5cb2e4cac31f887e619922af04dc6dc Mon Sep 17 00:00:00 2001
From: Gvozden Neskovic
Date: Thu, 6 May 2021 11:04:01 +0200
Subject: [PATCH] use thread local cache to avoid interprocess lock on shm GetData

---
 fairmq/shmem/Manager.h | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/fairmq/shmem/Manager.h b/fairmq/shmem/Manager.h
index c6608603..de85ae40 100644
--- a/fairmq/shmem/Manager.h
+++ b/fairmq/shmem/Manager.h
@@ -314,6 +314,7 @@ class Manager
 
                 (fEventCounter->fCount)++;
             }
+            fRegionsGen += 1; // signal TL cache invalidation
             fRegionEventsCV.notify_all();
 
             return result;
@@ -327,8 +328,34 @@ class Manager
 
     Region* GetRegion(const uint16_t id)
     {
+        // NOTE: gcc optimizations. Prevent loading tls addresses many times in the fast path
+        const auto &lTlCache = fTlRegionCache;
+        const auto &lTlCacheVec = lTlCache.fRegionsTLCache;
+        const auto lTlCacheGen = lTlCache.fRegionsTLCacheGen;
+
+        // fast path: cached entries are only valid while the generation is unchanged;
+        // load the shared generation once instead of once per cached entry
+        if (lTlCacheGen == fRegionsGen) {
+            for (const auto &lEntry : lTlCacheVec) {
+                // the cache is static, so an entry must also belong to this Manager
+                if ((lEntry.fId == id) && (lEntry.fOwner == this)) {
+                    return lEntry.fRegion;
+                }
+            }
+        }
+
         boost::interprocess::scoped_lock lock(fShmMtx);
-        return GetRegionUnsafe(id);
+        // slow path: read the generation exactly once. It is an atomic that other threads may
+        // bump concurrently; two reads could disagree and stamp stale entries as current.
+        const unsigned long lRegionsGen = fRegionsGen;
+        if (lTlCacheGen != lRegionsGen) {
+            fTlRegionCache.fRegionsTLCache.clear();
+        }
+
+        auto *lRegion = GetRegionUnsafe(id);
+        fTlRegionCache.fRegionsTLCache.push_back({lRegion, id, this});
+        fTlRegionCache.fRegionsTLCacheGen = lRegionsGen;
+        return lRegion;
     }
 
     Region* GetRegionUnsafe(const uint16_t id)
@@ -366,6 +393,7 @@ class Manager
             fShmRegions->at(id).fDestroyed = true;
             (fEventCounter->fCount)++;
         }
+        fRegionsGen += 1; // signal TL cache invalidation
         fRegionEventsCV.notify_all();
     }
 
@@ -613,6 +641,7 @@ class Manager
         using namespace boost::interprocess;
         bool lastRemoved = false;
 
+        fRegionsGen += 1; // signal TL cache invalidation
         UnsubscribeFromRegionEvents();
 
         {
@@ -665,6 +694,19 @@ class Manager
     Uint16SegmentInfoHashMap* fShmSegments;
     Uint16RegionInfoHashMap* fShmRegions;
     std::unordered_map> fRegions;
+    // generation counter, incremented to signal TL cache invalidation
+    // make sure this is alone in the cache line: mostly read
+    alignas(128) inline static std::atomic<unsigned long> fRegionsGen = 0ul;
+    // one cached lookup result; fOwner tells apart Manager instances sharing the static cache
+    struct ManagerTLCacheEntry {
+        Region* fRegion;
+        uint16_t fId;
+        const Manager* fOwner;
+    };
+    inline static thread_local struct ManagerTLCache {
+        unsigned long fRegionsTLCacheGen; // value of fRegionsGen the entries below belong to
+        std::vector<ManagerTLCacheEntry> fRegionsTLCache;
+    } fTlRegionCache;
 
     std::atomic fInterrupted;
     std::atomic fMsgCounter; // TODO: find a better lifetime solution instead of the counter