FairMQ: Fix missing ofi completion events

This commit is contained in:
Dennis Klein 2018-03-08 03:07:26 +01:00 committed by Mohammad Al-Turany
parent 144aa912d7
commit c5072ea425
5 changed files with 53 additions and 33 deletions

View File

@ -122,15 +122,12 @@ auto Context::GetBoostVersion() const -> std::string
return tools::ToString(BOOST_VERSION / 100000, ".", BOOST_VERSION / 100 % 1000, ".", BOOST_VERSION % 100);
}
auto Context::InitOfi(ConnectionType type, std::string addr) -> void
auto Context::InitOfi(ConnectionType type, Address addr) -> void
{
auto addr2 = ConvertAddress(addr);
if (addr2.Protocol != "tcp")
throw ContextError{"Wrong protocol: Supplied address must be in format tcp://ip:port"};
if (!fOfiInfo) {
sockaddr_in* sa = static_cast<sockaddr_in*>(malloc(sizeof(sockaddr_in)));
auto sa2 = ConvertAddress(addr2);
addr.Port = 0;
auto sa2 = ConvertAddress(addr);
memcpy(sa, &sa2, sizeof(sockaddr_in));
// Prepare fi_getinfo query
@ -143,16 +140,15 @@ auto Context::InitOfi(ConnectionType type, std::string addr) -> void
ofi_hints->domain_attr->threading = FI_THREAD_SAFE;
ofi_hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
ofi_hints->domain_attr->data_progress = FI_PROGRESS_AUTO;
// if (type == ConnectionType::Bind) {
// ofi_hints->src_addr = sa;
// ofi_hints->src_addrlen = sizeof(sockaddr_in);
// } else {
// ofi_hints->dest_addr = sa;
// ofi_hints->dest_addrlen = sizeof(sockaddr_in);
// }
ofi_hints->tx_attr->op_flags = FI_COMPLETION;
ofi_hints->rx_attr->op_flags = FI_COMPLETION;
ofi_hints->src_addr = sa;
ofi_hints->src_addrlen = sizeof(sockaddr_in);
ofi_hints->dest_addr = nullptr;
ofi_hints->dest_addrlen = 0;
// Query fi_getinfo for fabric to use
auto res = fi_getinfo(FI_VERSION(1, 5), strdup(addr2.Ip.c_str()), 0, 0, ofi_hints.get(), &fOfiInfo);
auto res = fi_getinfo(FI_VERSION(1, 5), nullptr, nullptr, 0, ofi_hints.get(), &fOfiInfo);
if (res != 0) throw ContextError{tools::ToString("Failed querying fi_getinfo, reason: ", fi_strerror(res))};
if (!fOfiInfo) throw ContextError{"Could not find any ofi compatible fabric."};
@ -286,7 +282,12 @@ auto Context::InsertAddressVector(sockaddr_in address) -> fi_addr_t
if (ret != 1)
throw ContextError{tools::ToString("Failed to insert address into ofi address vector")};
return ret;
return mappedAddress;
}
// Placeholder for looking up the socket address behind a fabric address.
// No lookup is performed yet: every call throws ContextError.
auto Context::AddressVectorLookup(fi_addr_t address) -> sockaddr_in
{
    throw ContextError{"Not yet implemented"};
}
auto Context::ConvertAddress(std::string address) -> Address

View File

@ -41,7 +41,6 @@ class Context
Context(int numberIoThreads = 2);
~Context();
auto InitOfi(ConnectionType type, std::string address) -> void;
auto CreateOfiEndpoint() -> fid_ep*;
auto CreateOfiCompletionQueue(Direction dir) -> fid_cq*;
auto GetZmqVersion() const -> std::string;
@ -51,12 +50,14 @@ class Context
auto GetZmqContext() const -> void* { return fZmqContext; }
auto GetIoContext() -> boost::asio::io_service& { return fIoContext; }
auto InsertAddressVector(sockaddr_in address) -> fi_addr_t;
auto AddressVectorLookup(fi_addr_t address) -> sockaddr_in;
// Plain aggregate holding the three components of an endpoint address.
struct Address {
    std::string Protocol;
    std::string Ip;
    unsigned int Port;

    // Writes the address as "<Protocol>://<Ip>:<Port>" and returns the stream.
    friend auto operator<<(std::ostream& out, const Address& addr) -> std::ostream&
    {
        out << addr.Protocol << "://" << addr.Ip << ":" << addr.Port;
        return out;
    }
};
auto InitOfi(ConnectionType type, Address address) -> void;
static auto ConvertAddress(std::string address) -> Address;
static auto ConvertAddress(Address address) -> sockaddr_in;
static auto ConvertAddress(sockaddr_in address) -> Address;

View File

@ -85,7 +85,7 @@ auto Socket::Bind(const string& address) -> bool
try {
auto addr = Context::VerifyAddress(address);
BindControlSocket(addr);
fContext.InitOfi(ConnectionType::Bind, address);
fContext.InitOfi(ConnectionType::Bind, addr);
InitDataEndpoint();
fWaitingForControlPeer = true;
return true;
@ -106,7 +106,7 @@ auto Socket::Connect(const string& address) -> void
{
auto addr = Context::VerifyAddress(address);
ConnectControlSocket(addr);
fContext.InitOfi(ConnectionType::Connect, address);
fContext.InitOfi(ConnectionType::Connect, addr);
InitDataEndpoint();
fWaitingForControlPeer = true;
}
@ -207,7 +207,7 @@ try {
auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void
{
assert(fControlSocket);
LOG(debug) << "About to send control message: " << ctrl->DebugString();
// LOG(debug) << "About to send control message: " << ctrl->DebugString();
// Serialize
string* str = new string();
@ -217,8 +217,10 @@ auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void
assert(ret == 0);
// Send
if (zmq_msg_send(&msg, fControlSocket, 0) == -1)
if (zmq_msg_send(&msg, fControlSocket, 0) == -1) {
zmq_msg_close(&msg);
throw SocketError(tools::ToString("Failed to send control message, reason: ", zmq_strerror(errno)));
}
}
auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage>
@ -229,14 +231,17 @@ auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage>
zmq_msg_t msg;
auto ret = zmq_msg_init(&msg);
assert(ret == 0);
if (zmq_msg_recv(&msg, fControlSocket, 0) == -1)
if (zmq_msg_recv(&msg, fControlSocket, 0) == -1) {
zmq_msg_close(&msg);
throw SocketError(tools::ToString("Failed to receive control message, reason: ", zmq_strerror(errno)));
}
// Deserialize
auto ctrl = tools::make_unique<ControlMessage>();
ctrl->ParseFromArray(zmq_msg_data(&msg), zmq_msg_size(&msg));
LOG(debug) << "Received control message: " << ctrl->DebugString();
zmq_msg_close(&msg);
// LOG(debug) << "Received control message: " << ctrl->DebugString();
return ctrl;
}
@ -269,6 +274,9 @@ auto Socket::WaitForControlPeer() -> void
string remoteIp(inet_ntoa(remoteAddr.sin_addr));
int remotePort = ntohs(remoteAddr.sin_port);
LOG(debug) << "Accepted control peer connection from " << remoteIp << ":" << remotePort;
// Workaround: the accept event above does not guarantee that the socket is operational yet, so wait briefly before continuing.
std::this_thread::sleep_for(std::chrono::milliseconds(200));
} else if (event == ZMQ_EVENT_CONNECTED) {
LOG(debug) << "Connected successfully to control peer";
} else {
@ -318,7 +326,7 @@ try {
throw SocketError(tools::ToString("Failed posting ofi send buffer, reason: ", fi_strerror(ret)));
fi_cq_err_entry cqEntry;
ret = fi_cq_sread(fDataCompletionQueueTx, &cqEntry, 1, nullptr, 1000);
ret = fi_cq_sread(fDataCompletionQueueTx, &cqEntry, 1, nullptr, -1);
if (ret != 1)
throw SocketError(tools::ToString("Failed reading ofi tx completion queue event, reason: ", fi_strerror(ret)));
}
@ -350,12 +358,13 @@ try {
assert(ctrl->has_post_buffer());
auto postBuffer = ctrl->post_buffer();
auto size = postBuffer.size();
LOG(debug) << "Received post buffer control message with size: " << size;
// Receive data
if (size) {
msg->Rebuild(size);
auto ret = fi_recv(fDataEndpoint, msg->GetData(), msg->GetSize(), nullptr, fRemoteDataAddr, nullptr);
auto buf = msg->GetData();
auto size2 = msg->GetSize();
auto ret = fi_recv(fDataEndpoint, buf, size2, nullptr, fRemoteDataAddr, nullptr);
if (ret != FI_SUCCESS)
throw SocketError(tools::ToString("Failed posting ofi receive buffer, reason: ", fi_strerror(ret)));
@ -368,9 +377,11 @@ try {
SendControlMessage(move(ctrl2));
fi_cq_err_entry cqEntry;
ret = fi_cq_sread(fDataCompletionQueueRx, &cqEntry, 1, nullptr, 1000);
ret = fi_cq_sread(fDataCompletionQueueRx, &cqEntry, 1, nullptr, -1);
if (ret != 1)
throw SocketError(tools::ToString("Failed reading ofi rx completion queue event, reason: ", fi_strerror(ret)));
assert(cqEntry.len == size2);
assert(cqEntry.buf == buf);
}
return size;

View File

@ -45,13 +45,17 @@ class PairLeft : public FairMQDevice
// Simple message with short text data
auto msg5{NewSimpleMessageFor("data", 0, "testdata1234")};
LOG(info) << "Will send msg5";
if (Send(msg5, "data") >= 0) counter++;
LOG(info) << "Sent msg5";
if (counter == 5) LOG(info) << "Simple message with short text data successfull";
auto msg6{NewMessageFor("data", 0)};
auto ret = Receive(msg6, "data");
if (ret > 0) {
auto content = std::string{static_cast<char*>(msg6->GetData()), msg6->GetSize()};
LOG(info) << ret << ", " << msg6->GetSize() << ", '" << content << "'";
if (msg6->GetSize() == ret && content == "testdata1234") counter++;
}
if (counter == 6) LOG(info) << "Simple message with short text data successfull";
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
assert(counter == 5);
assert(counter == 6);
};
};

View File

@ -52,9 +52,12 @@ class PairRight : public FairMQDevice
LOG(info) << ret << ", " << msg5->GetSize() << ", '" << content << "'";
if (msg5->GetSize() == ret && content == "testdata1234") counter++;
}
if (counter == 5) LOG(info) << "Simple message with short text data successfull";
auto msg6{NewSimpleMessageFor("data", 0, "testdata1234")};
if (Send(msg6, "data") >= 0) counter++;
if (counter == 6) LOG(info) << "Simple message with short text data successfull";
if (counter == 5) LOG(info) << "PAIR test successfull.";
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
if (counter == 6) LOG(info) << "PAIR test successfull.";
};
};