mirror of
https://github.com/FairRootGroup/FairMQ.git
synced 2025-10-14 09:06:47 +00:00
FairMQ: Fix missing ofi completion events
This commit is contained in:
parent
144aa912d7
commit
c5072ea425
|
@ -122,15 +122,12 @@ auto Context::GetBoostVersion() const -> std::string
|
||||||
return tools::ToString(BOOST_VERSION / 100000, ".", BOOST_VERSION / 100 % 1000, ".", BOOST_VERSION % 100);
|
return tools::ToString(BOOST_VERSION / 100000, ".", BOOST_VERSION / 100 % 1000, ".", BOOST_VERSION % 100);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto Context::InitOfi(ConnectionType type, std::string addr) -> void
|
auto Context::InitOfi(ConnectionType type, Address addr) -> void
|
||||||
{
|
{
|
||||||
auto addr2 = ConvertAddress(addr);
|
|
||||||
if (addr2.Protocol != "tcp")
|
|
||||||
throw ContextError{"Wrong protocol: Supplied address must be in format tcp://ip:port"};
|
|
||||||
|
|
||||||
if (!fOfiInfo) {
|
if (!fOfiInfo) {
|
||||||
sockaddr_in* sa = static_cast<sockaddr_in*>(malloc(sizeof(sockaddr_in)));
|
sockaddr_in* sa = static_cast<sockaddr_in*>(malloc(sizeof(sockaddr_in)));
|
||||||
auto sa2 = ConvertAddress(addr2);
|
addr.Port = 0;
|
||||||
|
auto sa2 = ConvertAddress(addr);
|
||||||
memcpy(sa, &sa2, sizeof(sockaddr_in));
|
memcpy(sa, &sa2, sizeof(sockaddr_in));
|
||||||
|
|
||||||
// Prepare fi_getinfo query
|
// Prepare fi_getinfo query
|
||||||
|
@ -143,16 +140,15 @@ auto Context::InitOfi(ConnectionType type, std::string addr) -> void
|
||||||
ofi_hints->domain_attr->threading = FI_THREAD_SAFE;
|
ofi_hints->domain_attr->threading = FI_THREAD_SAFE;
|
||||||
ofi_hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
|
ofi_hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
|
||||||
ofi_hints->domain_attr->data_progress = FI_PROGRESS_AUTO;
|
ofi_hints->domain_attr->data_progress = FI_PROGRESS_AUTO;
|
||||||
// if (type == ConnectionType::Bind) {
|
ofi_hints->tx_attr->op_flags = FI_COMPLETION;
|
||||||
// ofi_hints->src_addr = sa;
|
ofi_hints->rx_attr->op_flags = FI_COMPLETION;
|
||||||
// ofi_hints->src_addrlen = sizeof(sockaddr_in);
|
ofi_hints->src_addr = sa;
|
||||||
// } else {
|
ofi_hints->src_addrlen = sizeof(sockaddr_in);
|
||||||
// ofi_hints->dest_addr = sa;
|
ofi_hints->dest_addr = nullptr;
|
||||||
// ofi_hints->dest_addrlen = sizeof(sockaddr_in);
|
ofi_hints->dest_addrlen = 0;
|
||||||
// }
|
|
||||||
|
|
||||||
// Query fi_getinfo for fabric to use
|
// Query fi_getinfo for fabric to use
|
||||||
auto res = fi_getinfo(FI_VERSION(1, 5), strdup(addr2.Ip.c_str()), 0, 0, ofi_hints.get(), &fOfiInfo);
|
auto res = fi_getinfo(FI_VERSION(1, 5), nullptr, nullptr, 0, ofi_hints.get(), &fOfiInfo);
|
||||||
if (res != 0) throw ContextError{tools::ToString("Failed querying fi_getinfo, reason: ", fi_strerror(res))};
|
if (res != 0) throw ContextError{tools::ToString("Failed querying fi_getinfo, reason: ", fi_strerror(res))};
|
||||||
if (!fOfiInfo) throw ContextError{"Could not find any ofi compatible fabric."};
|
if (!fOfiInfo) throw ContextError{"Could not find any ofi compatible fabric."};
|
||||||
|
|
||||||
|
@ -286,7 +282,12 @@ auto Context::InsertAddressVector(sockaddr_in address) -> fi_addr_t
|
||||||
if (ret != 1)
|
if (ret != 1)
|
||||||
throw ContextError{tools::ToString("Failed to insert address into ofi address vector")};
|
throw ContextError{tools::ToString("Failed to insert address into ofi address vector")};
|
||||||
|
|
||||||
return ret;
|
return mappedAddress;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto Context::AddressVectorLookup(fi_addr_t address) -> sockaddr_in
|
||||||
|
{
|
||||||
|
throw ContextError("Not yet implemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
auto Context::ConvertAddress(std::string address) -> Address
|
auto Context::ConvertAddress(std::string address) -> Address
|
||||||
|
|
|
@ -41,7 +41,6 @@ class Context
|
||||||
Context(int numberIoThreads = 2);
|
Context(int numberIoThreads = 2);
|
||||||
~Context();
|
~Context();
|
||||||
|
|
||||||
auto InitOfi(ConnectionType type, std::string address) -> void;
|
|
||||||
auto CreateOfiEndpoint() -> fid_ep*;
|
auto CreateOfiEndpoint() -> fid_ep*;
|
||||||
auto CreateOfiCompletionQueue(Direction dir) -> fid_cq*;
|
auto CreateOfiCompletionQueue(Direction dir) -> fid_cq*;
|
||||||
auto GetZmqVersion() const -> std::string;
|
auto GetZmqVersion() const -> std::string;
|
||||||
|
@ -51,12 +50,14 @@ class Context
|
||||||
auto GetZmqContext() const -> void* { return fZmqContext; }
|
auto GetZmqContext() const -> void* { return fZmqContext; }
|
||||||
auto GetIoContext() -> boost::asio::io_service& { return fIoContext; }
|
auto GetIoContext() -> boost::asio::io_service& { return fIoContext; }
|
||||||
auto InsertAddressVector(sockaddr_in address) -> fi_addr_t;
|
auto InsertAddressVector(sockaddr_in address) -> fi_addr_t;
|
||||||
|
auto AddressVectorLookup(fi_addr_t address) -> sockaddr_in;
|
||||||
struct Address {
|
struct Address {
|
||||||
std::string Protocol;
|
std::string Protocol;
|
||||||
std::string Ip;
|
std::string Ip;
|
||||||
unsigned int Port;
|
unsigned int Port;
|
||||||
friend auto operator<<(std::ostream& os, const Address& a) -> std::ostream& { return os << a.Protocol << "://" << a.Ip << ":" << a.Port; }
|
friend auto operator<<(std::ostream& os, const Address& a) -> std::ostream& { return os << a.Protocol << "://" << a.Ip << ":" << a.Port; }
|
||||||
};
|
};
|
||||||
|
auto InitOfi(ConnectionType type, Address address) -> void;
|
||||||
static auto ConvertAddress(std::string address) -> Address;
|
static auto ConvertAddress(std::string address) -> Address;
|
||||||
static auto ConvertAddress(Address address) -> sockaddr_in;
|
static auto ConvertAddress(Address address) -> sockaddr_in;
|
||||||
static auto ConvertAddress(sockaddr_in address) -> Address;
|
static auto ConvertAddress(sockaddr_in address) -> Address;
|
||||||
|
|
|
@ -85,7 +85,7 @@ auto Socket::Bind(const string& address) -> bool
|
||||||
try {
|
try {
|
||||||
auto addr = Context::VerifyAddress(address);
|
auto addr = Context::VerifyAddress(address);
|
||||||
BindControlSocket(addr);
|
BindControlSocket(addr);
|
||||||
fContext.InitOfi(ConnectionType::Bind, address);
|
fContext.InitOfi(ConnectionType::Bind, addr);
|
||||||
InitDataEndpoint();
|
InitDataEndpoint();
|
||||||
fWaitingForControlPeer = true;
|
fWaitingForControlPeer = true;
|
||||||
return true;
|
return true;
|
||||||
|
@ -106,7 +106,7 @@ auto Socket::Connect(const string& address) -> void
|
||||||
{
|
{
|
||||||
auto addr = Context::VerifyAddress(address);
|
auto addr = Context::VerifyAddress(address);
|
||||||
ConnectControlSocket(addr);
|
ConnectControlSocket(addr);
|
||||||
fContext.InitOfi(ConnectionType::Connect, address);
|
fContext.InitOfi(ConnectionType::Connect, addr);
|
||||||
InitDataEndpoint();
|
InitDataEndpoint();
|
||||||
fWaitingForControlPeer = true;
|
fWaitingForControlPeer = true;
|
||||||
}
|
}
|
||||||
|
@ -207,7 +207,7 @@ try {
|
||||||
auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void
|
auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void
|
||||||
{
|
{
|
||||||
assert(fControlSocket);
|
assert(fControlSocket);
|
||||||
LOG(debug) << "About to send control message: " << ctrl->DebugString();
|
// LOG(debug) << "About to send control message: " << ctrl->DebugString();
|
||||||
|
|
||||||
// Serialize
|
// Serialize
|
||||||
string* str = new string();
|
string* str = new string();
|
||||||
|
@ -217,8 +217,10 @@ auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void
|
||||||
assert(ret == 0);
|
assert(ret == 0);
|
||||||
|
|
||||||
// Send
|
// Send
|
||||||
if (zmq_msg_send(&msg, fControlSocket, 0) == -1)
|
if (zmq_msg_send(&msg, fControlSocket, 0) == -1) {
|
||||||
|
zmq_msg_close(&msg);
|
||||||
throw SocketError(tools::ToString("Failed to send control message, reason: ", zmq_strerror(errno)));
|
throw SocketError(tools::ToString("Failed to send control message, reason: ", zmq_strerror(errno)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage>
|
auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage>
|
||||||
|
@ -229,14 +231,17 @@ auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage>
|
||||||
zmq_msg_t msg;
|
zmq_msg_t msg;
|
||||||
auto ret = zmq_msg_init(&msg);
|
auto ret = zmq_msg_init(&msg);
|
||||||
assert(ret == 0);
|
assert(ret == 0);
|
||||||
if (zmq_msg_recv(&msg, fControlSocket, 0) == -1)
|
if (zmq_msg_recv(&msg, fControlSocket, 0) == -1) {
|
||||||
|
zmq_msg_close(&msg);
|
||||||
throw SocketError(tools::ToString("Failed to receive control message, reason: ", zmq_strerror(errno)));
|
throw SocketError(tools::ToString("Failed to receive control message, reason: ", zmq_strerror(errno)));
|
||||||
|
}
|
||||||
|
|
||||||
// Deserialize
|
// Deserialize
|
||||||
auto ctrl = tools::make_unique<ControlMessage>();
|
auto ctrl = tools::make_unique<ControlMessage>();
|
||||||
ctrl->ParseFromArray(zmq_msg_data(&msg), zmq_msg_size(&msg));
|
ctrl->ParseFromArray(zmq_msg_data(&msg), zmq_msg_size(&msg));
|
||||||
|
|
||||||
LOG(debug) << "Received control message: " << ctrl->DebugString();
|
zmq_msg_close(&msg);
|
||||||
|
// LOG(debug) << "Received control message: " << ctrl->DebugString();
|
||||||
return ctrl;
|
return ctrl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -269,6 +274,9 @@ auto Socket::WaitForControlPeer() -> void
|
||||||
string remoteIp(inet_ntoa(remoteAddr.sin_addr));
|
string remoteIp(inet_ntoa(remoteAddr.sin_addr));
|
||||||
int remotePort = ntohs(remoteAddr.sin_port);
|
int remotePort = ntohs(remoteAddr.sin_port);
|
||||||
LOG(debug) << "Accepted control peer connection from " << remoteIp << ":" << remotePort;
|
LOG(debug) << "Accepted control peer connection from " << remoteIp << ":" << remotePort;
|
||||||
|
|
||||||
|
// sucks, but the above event does not guarantee the socket is operational ...
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(200));
|
||||||
} else if (event == ZMQ_EVENT_CONNECTED) {
|
} else if (event == ZMQ_EVENT_CONNECTED) {
|
||||||
LOG(debug) << "Connected successfully to control peer";
|
LOG(debug) << "Connected successfully to control peer";
|
||||||
} else {
|
} else {
|
||||||
|
@ -318,7 +326,7 @@ try {
|
||||||
throw SocketError(tools::ToString("Failed posting ofi send buffer, reason: ", fi_strerror(ret)));
|
throw SocketError(tools::ToString("Failed posting ofi send buffer, reason: ", fi_strerror(ret)));
|
||||||
|
|
||||||
fi_cq_err_entry cqEntry;
|
fi_cq_err_entry cqEntry;
|
||||||
ret = fi_cq_sread(fDataCompletionQueueTx, &cqEntry, 1, nullptr, 1000);
|
ret = fi_cq_sread(fDataCompletionQueueTx, &cqEntry, 1, nullptr, -1);
|
||||||
if (ret != 1)
|
if (ret != 1)
|
||||||
throw SocketError(tools::ToString("Failed reading ofi tx completion queue event, reason: ", fi_strerror(ret)));
|
throw SocketError(tools::ToString("Failed reading ofi tx completion queue event, reason: ", fi_strerror(ret)));
|
||||||
}
|
}
|
||||||
|
@ -350,12 +358,13 @@ try {
|
||||||
assert(ctrl->has_post_buffer());
|
assert(ctrl->has_post_buffer());
|
||||||
auto postBuffer = ctrl->post_buffer();
|
auto postBuffer = ctrl->post_buffer();
|
||||||
auto size = postBuffer.size();
|
auto size = postBuffer.size();
|
||||||
LOG(debug) << "Received post buffer control message with size: " << size;
|
|
||||||
|
|
||||||
// Receive data
|
// Receive data
|
||||||
if (size) {
|
if (size) {
|
||||||
msg->Rebuild(size);
|
msg->Rebuild(size);
|
||||||
auto ret = fi_recv(fDataEndpoint, msg->GetData(), msg->GetSize(), nullptr, fRemoteDataAddr, nullptr);
|
auto buf = msg->GetData();
|
||||||
|
auto size2 = msg->GetSize();
|
||||||
|
auto ret = fi_recv(fDataEndpoint, buf, size2, nullptr, fRemoteDataAddr, nullptr);
|
||||||
if (ret != FI_SUCCESS)
|
if (ret != FI_SUCCESS)
|
||||||
throw SocketError(tools::ToString("Failed posting ofi receive buffer, reason: ", fi_strerror(ret)));
|
throw SocketError(tools::ToString("Failed posting ofi receive buffer, reason: ", fi_strerror(ret)));
|
||||||
|
|
||||||
|
@ -368,9 +377,11 @@ try {
|
||||||
SendControlMessage(move(ctrl2));
|
SendControlMessage(move(ctrl2));
|
||||||
|
|
||||||
fi_cq_err_entry cqEntry;
|
fi_cq_err_entry cqEntry;
|
||||||
ret = fi_cq_sread(fDataCompletionQueueRx, &cqEntry, 1, nullptr, 1000);
|
ret = fi_cq_sread(fDataCompletionQueueRx, &cqEntry, 1, nullptr, -1);
|
||||||
if (ret != 1)
|
if (ret != 1)
|
||||||
throw SocketError(tools::ToString("Failed reading ofi rx completion queue event, reason: ", fi_strerror(ret)));
|
throw SocketError(tools::ToString("Failed reading ofi rx completion queue event, reason: ", fi_strerror(ret)));
|
||||||
|
assert(cqEntry.len == size2);
|
||||||
|
assert(cqEntry.buf == buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
return size;
|
return size;
|
||||||
|
|
|
@ -45,13 +45,17 @@ class PairLeft : public FairMQDevice
|
||||||
|
|
||||||
// Simple message with short text data
|
// Simple message with short text data
|
||||||
auto msg5{NewSimpleMessageFor("data", 0, "testdata1234")};
|
auto msg5{NewSimpleMessageFor("data", 0, "testdata1234")};
|
||||||
LOG(info) << "Will send msg5";
|
|
||||||
if (Send(msg5, "data") >= 0) counter++;
|
if (Send(msg5, "data") >= 0) counter++;
|
||||||
LOG(info) << "Sent msg5";
|
auto msg6{NewMessageFor("data", 0)};
|
||||||
if (counter == 5) LOG(info) << "Simple message with short text data successfull";
|
auto ret = Receive(msg6, "data");
|
||||||
|
if (ret > 0) {
|
||||||
|
auto content = std::string{static_cast<char*>(msg6->GetData()), msg6->GetSize()};
|
||||||
|
LOG(info) << ret << ", " << msg6->GetSize() << ", '" << content << "'";
|
||||||
|
if (msg6->GetSize() == ret && content == "testdata1234") counter++;
|
||||||
|
}
|
||||||
|
if (counter == 6) LOG(info) << "Simple message with short text data successfull";
|
||||||
|
|
||||||
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
assert(counter == 6);
|
||||||
assert(counter == 5);
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -52,9 +52,12 @@ class PairRight : public FairMQDevice
|
||||||
LOG(info) << ret << ", " << msg5->GetSize() << ", '" << content << "'";
|
LOG(info) << ret << ", " << msg5->GetSize() << ", '" << content << "'";
|
||||||
if (msg5->GetSize() == ret && content == "testdata1234") counter++;
|
if (msg5->GetSize() == ret && content == "testdata1234") counter++;
|
||||||
}
|
}
|
||||||
if (counter == 5) LOG(info) << "Simple message with short text data successfull";
|
auto msg6{NewSimpleMessageFor("data", 0, "testdata1234")};
|
||||||
|
if (Send(msg6, "data") >= 0) counter++;
|
||||||
|
if (counter == 6) LOG(info) << "Simple message with short text data successfull";
|
||||||
|
|
||||||
if (counter == 5) LOG(info) << "PAIR test successfull.";
|
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
||||||
|
if (counter == 6) LOG(info) << "PAIR test successfull.";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user