FairMQ/fairmq/plugins/PMIx/PMIxPlugin.cxx
Dennis Klein cda7282422 feat!: Remove deprecated components sdk, sdk_commands, dds_plugin
BREAKING CHANGE: Components have been moved to ODC project, see
https://github.com/FairRootGroup/FairMQ/discussions/392 for details.
2022-08-11 15:30:25 +02:00

206 lines
8.3 KiB
C++

/********************************************************************************
* Copyright (C) 2019-2022 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH *
* *
* This software is distributed under the terms of the *
* GNU Lesser General Public Licence (LGPL) version 3, *
* copied verbatim in the file "LICENSE" *
********************************************************************************/
#include "PMIxPlugin.h"
#include <fairmq/tools/Strings.h>
#include <sstream>
#include <stdexcept>
#include <cstdint> // UINT32_MAX
using namespace std;
namespace fair::mq::plugins
{
/// Constructs the PMIx plugin.
///
/// PMIx is initialized through Init() while the members are being set up, so
/// the constructor body already runs with a valid fProcess. The body then
/// calls TakeDeviceControl(), sets the "id" property to
/// "<nspace>_<rank>", performs a fence labelled "pmix::init" and finally
/// registers a device state-change callback.
///
/// @param name            forwarded to the Plugin base class
/// @param version         forwarded to the Plugin base class
/// @param maintainer      forwarded to the Plugin base class
/// @param homepage        forwarded to the Plugin base class
/// @param pluginServices  forwarded to the Plugin base class
PMIxPlugin::PMIxPlugin(const string& name,
                       const Plugin::Version version,
                       const string& maintainer,
                       const string& homepage,
                       PluginServices* pluginServices)
    : Plugin(name, version, maintainer, homepage, pluginServices)
    , fProcess(Init())
    , fPid(getpid())
    , fPMIxClient(tools::ToString("PMIx client(pid=", fPid, ") "))
    , fDeviceId(string(fProcess.nspace) + "_" + to_string(fProcess.rank))
    // , fLastExternalController(UINT32_MAX)
    , fExitingAckedByLastExternalController(false)
    , fCurrentState(DeviceState::Idle)
    , fLastState(DeviceState::Idle)
{
    TakeDeviceControl();
    LOG(debug) << PMIxClient()
               << "pmix::init() OK: " << fProcess
               << ", version=" << pmix::get_version();
    SetProperty<string>("id", fDeviceId);
    Fence("pmix::init");

    // Disabled diagnostics, kept as a reference of the PMIx keys that can be queried.
    // LOG(info) << "PMIX_EXTERNAL_ERR_BASE: " << PMIX_EXTERNAL_ERR_BASE;
    // job level infos
    // LOG(info) << "PMIX_SESSION_ID: " << pmix::getInfo(PMIX_SESSION_ID, fProcess);
    // LOG(info) << "PMIX_UNIV_SIZE: " << pmix::getInfo(PMIX_UNIV_SIZE, fProcess);
    // LOG(info) << "PMIX_JOB_SIZE: " << pmix::getInfo(PMIX_JOB_SIZE, fProcess);
    // LOG(info) << "PMIX_JOB_NUM_APPS: " << pmix::getInfo(PMIX_JOB_NUM_APPS, fProcess);
    // LOG(info) << "PMIX_APP_SIZE: " << pmix::getInfo(PMIX_APP_SIZE, fProcess);
    // LOG(info) << "PMIX_MAX_PROCS: " << pmix::getInfo(PMIX_MAX_PROCS, fProcess);
    // LOG(info) << "PMIX_NUM_NODES: " << pmix::getInfo(PMIX_NUM_NODES, fProcess);
    // LOG(info) << "PMIX_CLUSTER_ID: " << pmix::getInfo(PMIX_CLUSTER_ID, fProcess);
    // LOG(info) << "PMIX_NSPACE: " << pmix::getInfo(PMIX_NSPACE, fProcess);
    // LOG(info) << "PMIX_JOBID: " << pmix::getInfo(PMIX_JOBID, fProcess);
    // LOG(info) << "PMIX_NODE_LIST: " << pmix::getInfo(PMIX_NODE_LIST, fProcess);
    // LOG(info) << "PMIX_ALLOCATED_NODELIST: " << pmix::getInfo(PMIX_ALLOCATED_NODELIST, fProcess);
    // LOG(info) << "PMIX_NPROC_OFFSET: " << pmix::getInfo(PMIX_NPROC_OFFSET, fProcess);
    // LOG(info) << "PMIX_LOCALLDR: " << pmix::getInfo(PMIX_LOCALLDR, fProcess);
    // LOG(info) << "PMIX_APPLDR: " << pmix::getInfo(PMIX_APPLDR, fProcess);
    // // per-node information
    // LOG(info) << "PMIX_NODE_SIZE: " << pmix::getInfo(PMIX_NODE_SIZE, fProcess);
    // LOG(info) << "PMIX_LOCAL_SIZE: " << pmix::getInfo(PMIX_LOCAL_SIZE, fProcess);
    // LOG(info) << "PMIX_AVAIL_PHYS_MEMORY: " << pmix::getInfo(PMIX_AVAIL_PHYS_MEMORY, fProcess);
    // // per-process information
    // LOG(info) << "PMIX_PROCID: " << pmix::getInfo(PMIX_PROCID, fProcess);
    // LOG(info) << "PMIX_APPNUM: " << pmix::getInfo(PMIX_APPNUM, fProcess);
    // LOG(info) << "PMIX_LOCAL_RANK: " << pmix::getInfo(PMIX_LOCAL_RANK, fProcess);
    // LOG(info) << "PMIX_NODE_RANK: " << pmix::getInfo(PMIX_NODE_RANK, fProcess);
    // LOG(info) << "PMIX_RANK: " << pmix::getInfo(PMIX_RANK, fProcess);
    // LOG(info) << "PMIX_GLOBAL_RANK: " << pmix::getInfo(PMIX_GLOBAL_RANK, fProcess);
    // LOG(info) << "PMIX_APP_RANK: " << pmix::getInfo(PMIX_APP_RANK, fProcess);

    SubscribeToDeviceStateChange([this](DeviceState state) {
        // State-specific action first; it runs without holding the mutex.
        if (state == DeviceState::Bound) {
            Publish();
        } else if (state == DeviceState::Connecting) {
            Lookup();
        } else if (state == DeviceState::Exiting) {
            ReleaseDeviceControl();
            UnsubscribeFromDeviceStateChange();
        }

        // Afterwards record the transition under the subscriber mutex.
        lock_guard<mutex> guard{fStateChangeSubscriberMutex};
        fLastState = fCurrentState;
        fCurrentState = state;
        // for (auto subscriberId : fStateChangeSubscribers) {
        // LOG(debug) << "Publishing state-change: " << fLastState << "->" << state << " to " << subscriberId;
        // }
    });
}
/// Tears the plugin down: calls ReleaseDeviceControl() and then calls
/// pmix::finalize() repeatedly for as long as pmix::initialized() reports true.
PMIxPlugin::~PMIxPlugin()
{
    LOG(debug) << "Destroying PMIxPlugin";
    ReleaseDeviceControl();

    // A failing finalize() is only logged; the loop condition alone decides
    // when to stop.
    // NOTE(review): this relies on pmix::initialized() eventually turning
    // false even when finalize() throws - confirm against the PMIx wrapper.
    while (pmix::initialized()) {
        try {
            pmix::finalize();
            LOG(debug) << PMIxClient() << "pmix::finalize() OK";
        } catch (const pmix::runtime_error& err) {
            LOG(debug) << PMIxClient() << "pmix::finalize() failed: " << err.what();
        }
    }
}
/// Initializes PMIx and returns the process handle produced by pmix::init().
///
/// @return the value returned by pmix::init()
/// @throws std::runtime_error if pmix::initialized() already reports true
auto PMIxPlugin::Init() -> pmix::proc
{
    // Refuse to initialize twice.
    if (pmix::initialized()) {
        throw runtime_error("trying to initialize PMIx while it is already initialized");
    }

    return pmix::init();
}
auto PMIxPlugin::Publish() -> void
{
auto channels(GetChannelInfo());
vector<pmix::info> info;
for (const auto& c : channels) {
string methodKey("chans." + c.first + "." + to_string(c.second - 1) + ".method");
if (GetProperty<string>(methodKey) == "bind") {
for (int i = 0; i < c.second; ++i) {
string addressKey("chans." + c.first + "." + to_string(i) + ".address");
info.emplace_back(addressKey, GetProperty<string>(addressKey));
LOG(debug) << PMIxClient() << info.back();
}
}
}
if (info.size() > 0) {
pmix::publish(info);
LOG(debug) << PMIxClient() << "pmix::publish() OK: published " << info.size()
<< " binding channels.";
}
}
auto PMIxPlugin::Fence() -> void
{
pmix::proc all(fProcess);
all.rank = pmix::rank::wildcard;
pmix::fence({all});
}
/// Runs Fence() and, once it has returned, logs a debug message tagged with
/// the given label.
///
/// @param label  text placed between the square brackets of the log message
auto PMIxPlugin::Fence(const std::string& label) -> void
{
    Fence();

    LOG(debug) << PMIxClient()
               << "pmix::fence() [" << label << "] OK";
}
auto PMIxPlugin::Lookup() -> void
{
auto channels(GetChannelInfo());
for (const auto& c : channels) {
string methodKey("chans." + c.first + "." + to_string(c.second - 1) + ".method");
if (GetProperty<string>(methodKey) == "connect") {
for (int i = 0; i < c.second; ++i) {
vector<pmix::pdata> pdata;
string addressKey("chans." + c.first + "." + to_string(i) + ".address");
pdata.emplace_back();
pdata.back().set_key(addressKey);
vector<pmix::info> info;
info.emplace_back(PMIX_WAIT, static_cast<int>(pdata.size()));
if (pdata.size() > 0) {
pmix::lookup(pdata, info);
LOG(debug) << PMIxClient() << "pmix::lookup() OK";
}
for (const auto& p : pdata) {
if (p.value.type == PMIX_UNDEF) {
LOG(debug) << PMIxClient() << "pmix::lookup() not found: key=" << p.key;
} else if (p.value.type == PMIX_STRING) {
LOG(debug) << PMIxClient() << "pmix::lookup() found:"
<< " key=" << p.key << ",value=" << p.value.data.string;
SetProperty<string>(p.key, p.value.data.string);
} else {
LOG(debug) << PMIxClient() << "pmix::lookup() wrong type returned: "
<< "key=" << p.key << ",type=" << p.value.type;
}
}
}
}
}
}
auto PMIxPlugin::WaitForExitingAck() -> void
{
unique_lock<mutex> lock(fStateChangeSubscriberMutex);
fExitingAcked.wait_for(lock, chrono::milliseconds(1000), [this]() {
return fExitingAckedByLastExternalController;
});
}
} // namespace fair::mq::plugins