diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h
index 64eee7067..d48cb0aee 100644
--- a/include/villas/nodes/infiniband.h
+++ b/include/villas/nodes/infiniband.h
@@ -40,6 +40,8 @@ struct format_type;
 
 struct infiniband {
 	struct rdma_cm_id *id;
+	struct rdma_event_channel *ec;
+
 	struct ibv_pd *pd;
 	struct ibv_cq *cq;
 	struct ibv_comp_channel *comp_channel;
@@ -47,14 +49,18 @@ struct infiniband {
 	pthread_t cq_poller_thread;
 
 	struct connection_s {
-		char *src_ip_addr;
-		char *dst_ip_addr;
+		struct addrinfo *src_addr;
+		struct addrinfo *dst_addr;
+		int timeout;	/**< Address resolution timeout in ms, handed to rdma_resolve_addr() */
+		enum rdma_port_space port_space;
 
 		struct ibv_qp *qp;
 		struct ibv_mr *mr_payload;
 		struct r_addr_key_s *r_addr_key;
 	} conn;
 
+	int is_source;	/**< 1 if a remote address is configured, 0 otherwise */
+
 };
 
 /** @see node_type::reverse */
@@ -76,7 +82,7 @@ int infiniband_destroy(struct node *n);
 int infiniband_stop(struct node *n);
 
 /** @see node_type::init */
-int infiniband_init();
+int infiniband_init(struct super_node *n);
 
 /** @see node_type::deinit */
 int infiniband_deinit();
diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c
index a56e8fd6e..18b860819 100644
--- a/lib/nodes/infiniband.c
+++ b/lib/nodes/infiniband.c
@@ -26,25 +26,77 @@
 #include
 #include
 #include
+#include
+#include <errno.h>
+#include <string.h>
 
-static void infiniband_log_cb(struct infiniband *ib, void *userdata, int level, const char *str)
+/** Handler for RDMA_CM_EVENT_ADDR_RESOLVED. Placeholder that only reports success. */
+static int infiniband_addr_resolved(struct rdma_cm_id *id)
 {
+	return 0;
 }
 
-static void infiniband_connect_cb(struct infiniband *ib, void *userdata, int result)
+/** Handler for RDMA_CM_EVENT_ROUTE_RESOLVED. Placeholder that only reports success. */
+static int infiniband_route_resolved(struct rdma_cm_id *id)
 {
+	return 0;
 }
 
-static void infiniband_disconnect_cb(struct infiniband *ib, void *userdata, int result)
+/** Handler for RDMA_CM_EVENT_CONNECT_REQUEST. Placeholder that only reports success. */
+static int infiniband_connect_request(struct rdma_cm_id *id)
 {
+	return 0;
 }
 
-static void infiniband_message_cb(struct infiniband *ib, void *userdata)
+/**
+ * Dispatch one connection manager event to its handler.
+ *
+ * Every case ends in an explicit break so that a failure case can never fall
+ * through into the handler of the following case.
+ * NOTE(review): error() is presumably fatal; if it can return, failures are
+ * reported as 0 and the caller keeps waiting for events. Confirm.
+ *
+ * @param event Event to handle; only event->event and event->id are read.
+ * @return 1 once RDMA_CM_EVENT_ESTABLISHED is seen, otherwise the result of
+ *         the handler that ran (the handlers above always return 0).
+ */
+static int infiniband_event(struct rdma_cm_event *event)
 {
-}
+	int ret = 0;
 
-static void infiniband_subscribe_cb(struct infiniband *ib, void *userdata, int mid, int qos_count, const int *granted_qos)
-{
+	switch(event->event)
+	{
+		case RDMA_CM_EVENT_ADDR_RESOLVED:
+			ret = infiniband_addr_resolved(event->id);
+			break;
+		case RDMA_CM_EVENT_ADDR_ERROR:
+			error("Address resolution (rdma_resolve_addr) failed!");
+			break;
+		case RDMA_CM_EVENT_ROUTE_RESOLVED:
+			ret = infiniband_route_resolved(event->id);
+			break;
+		case RDMA_CM_EVENT_ROUTE_ERROR:
+			error("Route resolution (rdma_resolve_route) failed!");
+			break;
+		case RDMA_CM_EVENT_CONNECT_REQUEST:
+			ret = infiniband_connect_request(event->id);
+			break;
+		case RDMA_CM_EVENT_CONNECT_ERROR:
+			error("An error has occurred trying to establish a connection!");
+			break;
+		case RDMA_CM_EVENT_REJECTED:
+			error("Connection request or response was rejected by the remote end point!");
+			break;
+		case RDMA_CM_EVENT_ESTABLISHED:
+			ret = 1;
+			break;
+		default:
+			error("Unknown event occurred: %u",
+				event->event);
+			break;
+	}
+
+	return ret;
 }
 
 int infiniband_reverse(struct node *n)
@@ -54,6 +106,65 @@ int infiniband_reverse(struct node *n)
 
 int infiniband_parse(struct node *n, json_t *cfg)
 {
+	struct infiniband *ib = (struct infiniband *) n->_vd;
+
+	int ret;
+	const char *local = NULL;
+	const char *remote = NULL;
+	// Defaults apply when the optional keys are missing from the configuration
+	const char *port_space = "RDMA_PS_TCP";
+	int timeout = 1000;
+
+	json_error_t err;
+	ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i}",
+		"remote", &remote,
+		"local", &local,
+		"rdma_port_space", &port_space,
+		"resolution_timeout", &timeout
+	);
+	if(ret)
+		jerror(&err, "Failed to parse configuration of node %s", node_name(n));
+
+	// Keep the timeout (in ms) for rdma_resolve_addr() in infiniband_start()
+	ib->conn.timeout = timeout;
+
+	// Resolve the local address. NOTE(review): no service is passed to
+	// getaddrinfo(), so a port given as IP:PORT is not split off here.
+	if(!local)
+		error("Missing setting 'local' for node %s", node_name(n));
+
+	ret = getaddrinfo(local, NULL, NULL, &ib->conn.src_addr);
+	if(ret) {
+		error("Failed to resolve local address '%s' of node %s: %s",
+			local, node_name(n), gai_strerror(ret));
+	}
+
+	// Translate port space; the rdma_cm_id itself is created in infiniband_start()
+	if(strcmp(port_space, "RDMA_PS_IPOIB") == 0)	ib->conn.port_space = RDMA_PS_IPOIB;
+	else if(strcmp(port_space, "RDMA_PS_TCP") == 0)	ib->conn.port_space = RDMA_PS_TCP;
+	else if(strcmp(port_space, "RDMA_PS_UDP") == 0)	ib->conn.port_space = RDMA_PS_UDP;
+	else if(strcmp(port_space, "RDMA_PS_IB") == 0)	ib->conn.port_space = RDMA_PS_IB;
+	else {
+		error("Failed to translate rdma_port_space in node %s. %s is not a valid "
+			"port space supported by rdma_cma.h!", node_name(n), port_space);
+	}
+
+	// A configured remote address marks this node as a source; infiniband_start() then resolves it
+	if(remote)
+	{
+		ib->is_source = 1;
+
+		// Translate address info
+		ret = getaddrinfo(remote, NULL, NULL, &ib->conn.dst_addr);
+		if(ret) {
+			error("Failed to resolve remote address '%s' of node %s: %s",
+				remote, node_name(n), gai_strerror(ret));
+		}
+
+	}
+	else
+		ib->is_source = 0;
+
 	return 0;
 }
 
@@ -69,6 +180,61 @@ int infiniband_destroy(struct node *n)
 
 int infiniband_start(struct node *n)
 {
+	struct infiniband *ib = (struct infiniband *) n->_vd;
+	struct rdma_cm_event *event = NULL;
+	int ret;
+
+	// Create event channel
+	ib->ec = rdma_create_event_channel();
+	if(!ib->ec) {
+		error("Failed to create event channel in node %s!",
+			node_name(n));
+	}
+
+	// rdma_* calls report failure as -1 and set errno, hence strerror(errno)
+	ret = rdma_create_id(ib->ec, &ib->id, NULL, ib->conn.port_space);
+	if(ret) {
+		error("Failed to create rdma_cm_id of node %s: %s",
+			node_name(n), strerror(errno));
+	}
+	info("Succesfully created CM RDMA ID of node %s",
+		node_name(n));
+
+	// Bind rdma_cm_id to the HCA
+	ret = rdma_bind_addr(ib->id, ib->conn.src_addr->ai_addr);
+	if(ret) {
+		error("Failed to bind to local device of node %s: %s",
+			node_name(n), strerror(errno));
+	}
+	info("Bound to Infiniband device of node %s",
+		node_name(n));
+
+	if(ib->is_source)
+	{
+		// Resolve address
+		ret = rdma_resolve_addr(ib->id, NULL, ib->conn.dst_addr->ai_addr, ib->conn.timeout);
+		if(ret) {
+			error("Failed to resolve remote address after %ims of node %s: %s",
+				ib->conn.timeout, node_name(n), strerror(errno));
+		}
+
+	}
+
+	// Several events should occur on the event channel, to make
+	// sure the nodes are successfully connected.
+	info("Starting to monitor events on rdma_cm_id.\n");
+	while(rdma_get_cm_event(ib->ec, &event) == 0)
+	{
+		// Copy the event so it can be acknowledged right away; every event
+		// returned by rdma_get_cm_event() must be released with rdma_ack_cm_event()
+		struct rdma_cm_event event_copy;
+		memcpy(&event_copy, event, sizeof(*event));
+		rdma_ack_cm_event(event);
+
+		if(infiniband_event(&event_copy))
+			break;
+	}
+
 	return 0;
 }
 
@@ -77,8 +243,9 @@ int infiniband_stop(struct node *n)
 	return 0;
 }
 
-int infiniband_init()
+int infiniband_init(struct super_node *n)
 {
+
 	return 0;
 }
 