mirror of
https://git.rwth-aachen.de/acs/public/villas/node/
synced 2025-03-09 00:00:00 +01:00
Implemented request to resolve address and built an rdma_cm_event framework
This commit is contained in:
parent
1528603a88
commit
4220ff8111
2 changed files with 149 additions and 11 deletions
|
@ -40,6 +40,8 @@ struct format_type;
|
|||
|
||||
struct infiniband {
|
||||
struct rdma_cm_id *id;
|
||||
struct rdma_event_channel *ec;
|
||||
|
||||
struct ibv_pd *pd;
|
||||
struct ibv_cq *cq;
|
||||
struct ibv_comp_channel *comp_channel;
|
||||
|
@ -47,14 +49,18 @@ struct infiniband {
|
|||
pthread_t cq_poller_thread;
|
||||
|
||||
struct connection_s {
|
||||
char *src_ip_addr;
|
||||
char *dst_ip_addr;
|
||||
struct addrinfo *src_addr;
|
||||
struct addrinfo *dst_addr;
|
||||
const int timeout;
|
||||
enum rdma_port_space port_space;
|
||||
|
||||
struct ibv_qp *qp;
|
||||
struct ibv_mr *mr_payload;
|
||||
struct r_addr_key_s *r_addr_key;
|
||||
} conn;
|
||||
|
||||
int is_source;
|
||||
|
||||
};
|
||||
|
||||
/** @see node_type::reverse */
|
||||
|
@ -76,7 +82,7 @@ int infiniband_destroy(struct node *n);
|
|||
int infiniband_stop(struct node *n);
|
||||
|
||||
/** @see node_type::init */
|
||||
int infiniband_init();
|
||||
int infiniband_init(struct super_node *n);
|
||||
|
||||
/** @see node_type::deinit */
|
||||
int infiniband_deinit();
|
||||
|
|
|
@ -26,25 +26,54 @@
|
|||
#include <errno.h>
#include <string.h>

#include <villas/plugin.h>
#include <villas/utils.h>
#include <villas/format_type.h>
#include <rdma/rdma_cma.h>
|
||||
|
||||
static void infiniband_log_cb(struct infiniband *ib, void *userdata, int level, const char *str)
|
||||
/**
 * Handler invoked by infiniband_event() for RDMA_CM_EVENT_ADDR_RESOLVED.
 *
 * Currently a stub: the identifier is ignored.
 *
 * @param id Communication identifier the event belongs to (unused).
 * @return Always 0.
 */
static int infiniband_addr_resolved(struct rdma_cm_id *id)
{
	(void) id; /* nothing to do with the identifier yet */

	return 0;
}
|
||||
|
||||
static void infiniband_connect_cb(struct infiniband *ib, void *userdata, int result)
|
||||
/**
 * Handler invoked by infiniband_event() for RDMA_CM_EVENT_ROUTE_RESOLVED.
 *
 * Currently a stub: the identifier is ignored.
 *
 * @param id Communication identifier the event belongs to (unused).
 * @return Always 0.
 */
static int infiniband_route_resolved(struct rdma_cm_id *id)
{
	(void) id; /* nothing to do with the identifier yet */

	return 0;
}
|
||||
|
||||
static void infiniband_disconnect_cb(struct infiniband *ib, void *userdata, int result)
|
||||
/**
 * Stub handler for an incoming connection request.
 *
 * NOTE(review): presumably meant to be called for
 * RDMA_CM_EVENT_CONNECT_REQUEST - confirm against the event dispatcher.
 *
 * @param id Communication identifier the request belongs to (unused).
 * @return Always 0.
 */
static int infiniband_connect_request(struct rdma_cm_id *id)
{
	(void) id; /* nothing to do with the identifier yet */

	return 0;
}
|
||||
|
||||
static void infiniband_message_cb(struct infiniband *ib, void *userdata)
|
||||
static int infiniband_event(struct rdma_cm_event *event)
|
||||
{
|
||||
}
|
||||
int ret = 0;
|
||||
|
||||
static void infiniband_subscribe_cb(struct infiniband *ib, void *userdata, int mid, int qos_count, const int *granted_qos)
|
||||
{
|
||||
switch(event->event)
|
||||
{
|
||||
case RDMA_CM_EVENT_ADDR_RESOLVED:
|
||||
ret = infiniband_addr_resolved(event->id);
|
||||
break;
|
||||
case RDMA_CM_EVENT_ADDR_ERROR:
|
||||
error("Address resolution (rdma_resolve_addr) failed!");
|
||||
case RDMA_CM_EVENT_ROUTE_RESOLVED:
|
||||
ret = infiniband_route_resolved(event->id);
|
||||
break;
|
||||
case RDMA_CM_EVENT_ROUTE_ERROR:
|
||||
error("Route resolution (rdma_resovle_route) failed!");
|
||||
case RDMA_CM_EVENT_CONNECT_REQUEST:
|
||||
break;
|
||||
case RDMA_CM_EVENT_CONNECT_ERROR:
|
||||
error("An error has occurred trying to establish a connection!");
|
||||
case RDMA_CM_EVENT_REJECTED:
|
||||
error("Connection request or response was rejected by the remote end point!");
|
||||
case RDMA_CM_EVENT_ESTABLISHED:
|
||||
ret = 1;
|
||||
break;
|
||||
default:
|
||||
error("Unknown event occurred: %u",
|
||||
event->event);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int infiniband_reverse(struct node *n)
|
||||
|
@ -54,6 +83,57 @@ int infiniband_reverse(struct node *n)
|
|||
|
||||
/**
 * Parse the configuration of an InfiniBand node.
 *
 * Reads the settings "remote", "local", "rdma_port_space" and
 * "resolution_timeout" from cfg, resolves the local (and, if given, the
 * remote) address with getaddrinfo() and translates the port space name
 * into its enum value. A node with a "remote" setting is marked as source.
 *
 * @param n   Node whose private data (n->_vd) receives the parsed settings.
 * @param cfg JSON object with the node configuration.
 * @retval 0 The configuration was parsed.
 * @return A non-zero value if a setting is missing or invalid and the
 *         error reporting function returned to the caller.
 */
int infiniband_parse(struct node *n, json_t *cfg)
{
	struct infiniband *ib = (struct infiniband *) n->_vd;

	int ret;
	const char *local = NULL;
	const char *remote = NULL;
	const char *port_space = NULL;

	/* Must be a writable, initialised int: json_unpack_ex() stores into it
	 * and leaves it untouched when the optional key is absent.
	 *
	 * NOTE(review): the parsed value is not stored in ib->conn.timeout,
	 * because struct connection_s appears to declare that member as
	 * 'const int'. Drop the const there and assign it here - TODO confirm. */
	int timeout = 0;

	json_error_t err;

	ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i}",
		"remote", &remote,
		"local", &local,
		"rdma_port_space", &port_space,
		"resolution_timeout", &timeout
	);
	if (ret) {
		jerror(&err, "Failed to parse configuration of node %s", node_name(n));
		return ret;
	}

	/* All keys are optional for the JSON parser, but the local address is
	 * needed unconditionally, so reject a missing one before it is used. */
	if (!local) {
		error("Missing setting 'local' of node %s", node_name(n));
		return -1;
	}

	/* Translate the local address to a struct addrinfo.
	 * NOTE(review): the whole string is passed as host name and no service
	 * is given, so an "IP:PORT" value is not split here - confirm format. */
	ret = getaddrinfo(local, NULL, NULL, &ib->conn.src_addr);
	if (ret) {
		error("Failed to resolve local address '%s' of node %s: %s",
			local, node_name(n), gai_strerror(ret));
		return ret;
	}

	/* Translate the port space name to its enum value */
	if (!port_space) {
		error("Missing setting 'rdma_port_space' of node %s", node_name(n));
		return -1;
	}
	else if (strcmp(port_space, "RDMA_PS_IPOIB") == 0)
		ib->conn.port_space = RDMA_PS_IPOIB;
	else if (strcmp(port_space, "RDMA_PS_TCP") == 0)
		ib->conn.port_space = RDMA_PS_TCP;
	else if (strcmp(port_space, "RDMA_PS_UDP") == 0)
		ib->conn.port_space = RDMA_PS_UDP;
	else if (strcmp(port_space, "RDMA_PS_IB") == 0)
		ib->conn.port_space = RDMA_PS_IB;
	else {
		/* Adjacent literals instead of a backslash continuation, which
		 * would embed the indentation of the next line in the message. */
		error("Failed to translate rdma_port_space in node %s. %s is not a valid "
			"port space supported by rdma_cma.h!", node_name(n), port_space);
		return -1;
	}

	/* A node with a remote address is a source and connects to a target */
	ib->is_source = remote != NULL;

	if (ib->is_source) {
		/* Translate address info */
		ret = getaddrinfo(remote, NULL, NULL, &ib->conn.dst_addr);
		if (ret) {
			error("Failed to resolve remote address '%s' of node %s: %s",
				remote, node_name(n), gai_strerror(ret));
			return ret;
		}
	}

	return 0;
}
|
||||
|
||||
|
@ -69,6 +149,57 @@ int infiniband_destroy(struct node *n)
|
|||
|
||||
int infiniband_start(struct node *n)
|
||||
{
|
||||
struct infiniband *ib = (struct infiniband *) n->_vd;
|
||||
struct rdma_cm_event *event = NULL;
|
||||
int ret;
|
||||
|
||||
// Create event channel
|
||||
ib->ec = rdma_create_event_channel();
|
||||
if(!ib->ec) {
|
||||
error("Failed to create event channel in node %s!",
|
||||
node_name(n));
|
||||
}
|
||||
|
||||
ret = rdma_create_id(ib->ec, &ib->id, NULL, ib->conn.port_space);
|
||||
if(ret) {
|
||||
error("Failed to create rdma_cm_id of node %s: %s",
|
||||
node_name(n), gai_strerror(ret));
|
||||
}
|
||||
info("Succesfully created CM RDMA ID of node %s",
|
||||
node_name(n));
|
||||
|
||||
// Bind rdma_cm_id to the HCA
|
||||
ret = rdma_bind_addr(ib->id, ib->conn.src_addr->ai_addr);
|
||||
if(ret) {
|
||||
error("Failed to bind to local device of node %s: %s",
|
||||
node_name(n), gai_strerror(ret));
|
||||
}
|
||||
info("Bound to Infiniband device of node %s",
|
||||
node_name(n));
|
||||
|
||||
if(ib->is_source)
|
||||
{
|
||||
// Resolve address
|
||||
ret = rdma_resolve_addr(ib->id, NULL, ib->conn.dst_addr->ai_addr, ib->conn.timeout);
|
||||
if(ret) {
|
||||
error("Failed to resolve remote address after %ims of node %s: %s",
|
||||
ib->conn.timeout, node_name(n), gai_strerror(ret));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Several events should occur on the event channel, to make
|
||||
// sure the nodes are succesfully connected.
|
||||
info("Starting to monitor events on rdma_cm_id.\n");
|
||||
while(rdma_get_cm_event(ib->ec, &event) == 0)
|
||||
{
|
||||
struct rdma_cm_event event_copy;
|
||||
memcpy(&event_copy, event, sizeof(*event));
|
||||
|
||||
if(infiniband_event(&event_copy))
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -77,8 +208,9 @@ int infiniband_stop(struct node *n)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int infiniband_init()
|
||||
/**
 * Global initialisation hook of the InfiniBand node type.
 *
 * Currently a stub: no global state is set up.
 *
 * @param n Super node handed to the hook (unused).
 * @return Always 0.
 */
int infiniband_init(struct super_node *n)
{
	(void) n; /* no global initialisation required yet */

	return 0;
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue