From 1597999aeda2f13598bf6af180ca72ca6919885e Mon Sep 17 00:00:00 2001 From: Alexey Rybalchenko Date: Wed, 10 Jun 2026 15:09:47 +0200 Subject: [PATCH] fix(shmem): don't cache nullptr in GetRegionFromCache A failed region lookup was inserted into the thread-local cache as nullptr, making the failure permanent for the lifetime of the cache generation - retrying never healed because the fast path would return nullptr without calling GetRegion again. Skip the cache insert on failure so subsequent calls retry the slow path. --- fairmq/shmem/Manager.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fairmq/shmem/Manager.h b/fairmq/shmem/Manager.h index 60cee0df..7d06fe15 100644 --- a/fairmq/shmem/Manager.h +++ b/fairmq/shmem/Manager.h @@ -390,8 +390,10 @@ class Manager } auto* lRegion = GetRegion(id); - fTlRegionCache.fRegionsTLCache.emplace_back(lRegion, id, fShmId64); - fTlRegionCache.fRegionsTLCacheGen = fRegionsGen; + if (lRegion) { + fTlRegionCache.fRegionsTLCache.emplace_back(lRegion, id, fShmId64); + fTlRegionCache.fRegionsTLCacheGen = fRegionsGen; + } return lRegion; }