From a1b7a9cda6d777b45b549cfa8af78b7f475451f2 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Thu, 21 Jun 2018 11:56:28 +0200 Subject: [PATCH 01/35] Initial commit with (empty) infiniband node --- Makefile | 2 +- Makefile.config | 37 ++++----- Makefile.help | 2 +- include/villas/nodes/infiniband.h | 90 +++++++++++++++++++++ lib/nodes/Makefile.inc | 7 ++ lib/nodes/infiniband.c | 127 ++++++++++++++++++++++++++++++ 6 files changed, 245 insertions(+), 20 deletions(-) create mode 100644 include/villas/nodes/infiniband.h create mode 100644 lib/nodes/infiniband.c diff --git a/Makefile b/Makefile index 23cc84d0f..ba1e6d237 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ endif # Common flags LDLIBS = CFLAGS += -std=c11 -MMD -mcx16 -I$(BUILDDIR)/include -I$(SRCDIR)/include -CFLAGS += -Wall -Werror -fdiagnostics-color=auto -D_POSIX_C_SOURCE=200809L -D_GNU_SOURCE=1 +CFLAGS += -Wall -fdiagnostics-color=auto -D_POSIX_C_SOURCE=200809L -D_GNU_SOURCE=1 ifeq ($(PLATFORM),Darwin) CFLAGS += -D_DARWIN_C_SOURCE diff --git a/Makefile.config b/Makefile.config index fbb80c5c9..5ce1fd01d 100644 --- a/Makefile.config +++ b/Makefile.config @@ -54,21 +54,22 @@ else IS_LINUX = 0 endif -WITH_NODE_FPGA ?= $(IS_LINUX) -WITH_NODE_CBUILDER ?= $(IS_LINUX) -WITH_NODE_LOOPBACK ?= $(IS_LINUX) -WITH_NODE_COMEDI ?= $(IS_LINUX) -WITH_NODE_TEST_RTT ?= 1 -WITH_NODE_FILE ?= 1 -WITH_NODE_SIGNAL ?= 1 -WITH_NODE_NGSI ?= 1 -WITH_NODE_WEBSOCKET ?= 1 -WITH_NODE_SOCKET ?= 1 -WITH_NODE_ZEROMQ ?= 1 -WITH_NODE_NANOMSG ?= 1 -WITH_NODE_SHMEM ?= 1 -WITH_NODE_STATS ?= 1 -WITH_NODE_INFLUXDB ?= 1 -WITH_NODE_AMQP ?= 1 -WITH_NODE_IEC61850 ?= 1 -WITH_NODE_MQTT ?= 1 +WITH_NODE_FPGA ?= $(IS_LINUX) +WITH_NODE_CBUILDER ?= $(IS_LINUX) +WITH_NODE_LOOPBACK ?= $(IS_LINUX) +WITH_NODE_COMEDI ?= $(IS_LINUX) +WITH_NODE_TEST_RTT ?= 1 +WITH_NODE_FILE ?= 1 +WITH_NODE_SIGNAL ?= 1 +WITH_NODE_NGSI ?= 1 +WITH_NODE_WEBSOCKET ?= 1 +WITH_NODE_SOCKET ?= 1 +WITH_NODE_ZEROMQ ?= 1 +WITH_NODE_NANOMSG ?= 1 +WITH_NODE_SHMEM ?= 1 +WITH_NODE_STATS ?= 1 +WITH_NODE_INFLUXDB ?= 1 +WITH_NODE_AMQP ?= 1 +WITH_NODE_IEC61850 ?= 1 +WITH_NODE_MQTT ?= 1 +WITH_NODE_INFINIBAND ?= 1 diff --git a/Makefile.help b/Makefile.help index 5b2a17d97..369704f67 100644 --- a/Makefile.help +++ b/Makefile.help @@ -92,7 +92,7 @@ help: $E " WITH_NODE_AMQP = $(WITH_NODE_AMQP)" $E " WITH_NODE_MQTT = $(WITH_NODE_MQTT)" $E " WITH_NODE_IEC61850 = $(WITH_NODE_IEC61850)" - $E " WITH_NODE_MQTT = $(WITH_NODE_MQTT)" + $E " WITH_NODE_INFINIBAND = $(WITH_NODE_INFINIBAND)" $E " WITH_NODE_COMEDI = $(WITH_NODE_COMEDI)" $E $E "Available dependencies: $(LIB_PKGS)" diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h new file mode 100644 index 000000000..64eee7067 --- /dev/null +++ b/include/villas/nodes/infiniband.h @@ -0,0 +1,90 @@ +/** Node type: infiniband + * + * @file + * @author Dennis Potter + * @copyright 2018, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + *********************************************************************************/ + +/** + * @addtogroup infiniband infiniband node type + * @ingroup node + * @{ + */ + +#pragma once + +#include +#include +#include +#include +#include + +/* Forward declarations */ +struct format_type; + +struct infiniband { + struct rdma_cm_id *id; + struct ibv_pd *pd; + struct ibv_cq *cq; + struct ibv_comp_channel *comp_channel; + + pthread_t cq_poller_thread; + + struct connection_s { + char *src_ip_addr; + char *dst_ip_addr; + + struct ibv_qp *qp; + struct ibv_mr *mr_payload; + struct r_addr_key_s *r_addr_key; + } conn; + +}; + +/** @see node_type::reverse */ +int infiniband_reverse(struct node *n); + +/** @see node_type::print */ +char * infiniband_print(struct node *n); + +/** @see node_type::parse */ +int infiniband_parse(struct node *n, json_t *cfg); + +/** @see node_type::open */ +int infiniband_start(struct node *n); + +/** @see node_type::destroy */ +int infiniband_destroy(struct node *n); + +/** @see node_type::close */ +int infiniband_stop(struct node *n); + +/** @see node_type::init */ +int infiniband_init(); + +/** @see node_type::deinit */ +int infiniband_deinit(); + +/** @see node_type::read */ +int infiniband_read(struct node *n, struct sample *smps[], unsigned cnt); + +/** @see node_type::write */ +int infiniband_write(struct node *n, struct sample *smps[], unsigned cnt); + +/** @} */ diff --git a/lib/nodes/Makefile.inc b/lib/nodes/Makefile.inc index 180a98669..a05c6c48c 100644 --- a/lib/nodes/Makefile.inc +++ b/lib/nodes/Makefile.inc @@ -156,6 +156,13 @@ ifneq ($(wildcard /usr/include/mosquitto.h),) endif endif +# Enable Infiniband support +ifeq ($(WITH_NODE_INFINIBAND),1) + LIB_SRCS += lib/nodes/infiniband.c + LIB_NODES += infiniband + WITH_IO = 1 +endif + # Enable Comedi support ifeq ($(WITH_NODE_COMEDI),1) ifeq ($(shell $(PKGCONFIG) comedilib; echo $$?),0) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c new file mode 100644 index 000000000..a56e8fd6e --- /dev/null +++ b/lib/nodes/infiniband.c @@ -0,0 +1,127 @@ +/** Node type: infiniband + * + * @author Dennis Potter + * @copyright 2018, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ *********************************************************************************/ + +#include + +#include +#include +#include +#include + +static void infiniband_log_cb(struct infiniband *ib, void *userdata, int level, const char *str) +{ +} + +static void infiniband_connect_cb(struct infiniband *ib, void *userdata, int result) +{ +} + +static void infiniband_disconnect_cb(struct infiniband *ib, void *userdata, int result) +{ +} + +static void infiniband_message_cb(struct infiniband *ib, void *userdata) +{ +} + +static void infiniband_subscribe_cb(struct infiniband *ib, void *userdata, int mid, int qos_count, const int *granted_qos) +{ +} + +int infiniband_reverse(struct node *n) +{ + return 0; +} + +int infiniband_parse(struct node *n, json_t *cfg) +{ + return 0; +} + +char * infiniband_print(struct node *n) +{ + return 0; +} + +int infiniband_destroy(struct node *n) +{ + return 0; +} + +int infiniband_start(struct node *n) +{ + return 0; +} + +int infiniband_stop(struct node *n) +{ + return 0; +} + +int infiniband_init() +{ + return 0; +} + +int infiniband_deinit() +{ + return 0; +} + +int infiniband_read(struct node *n, struct sample *smps[], unsigned cnt) +{ + return 0; +} + +int infiniband_write(struct node *n, struct sample *smps[], unsigned cnt) +{ + return 0; +} + +int infiniband_fd(struct node *n) +{ + return 0; +} + +static struct plugin p = { + .name = "infiniband", + .description = "Infiniband)", + .type = PLUGIN_TYPE_NODE, + .node = { + .vectorize = 0, + .size = sizeof(struct infiniband), + .reverse = infiniband_reverse, + .parse = infiniband_parse, + .print = infiniband_print, + .start = infiniband_start, + .destroy = infiniband_destroy, + .stop = infiniband_stop, + .init = infiniband_init, + .deinit = infiniband_deinit, + .read = infiniband_read, + .write = infiniband_write, + .fd = infiniband_fd + } +}; + +REGISTER_PLUGIN(&p) +LIST_INIT_STATIC(&p.node.instances) From 1528603a88fef5fd2a5b400eab2de32b47654887 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Fri, 22 Jun 2018 13:01:52 +0200 Subject: [PATCH 02/35] Added missing libraries to Makefile --- lib/nodes/Makefile.inc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/nodes/Makefile.inc b/lib/nodes/Makefile.inc index a05c6c48c..47a329242 100644 --- a/lib/nodes/Makefile.inc +++ b/lib/nodes/Makefile.inc @@ -159,7 +159,9 @@ endif # Enable Infiniband support ifeq ($(WITH_NODE_INFINIBAND),1) LIB_SRCS += lib/nodes/infiniband.c - LIB_NODES += infiniband + LIB_NODES += infiniband + LIB_LDLIBS += -libverbs + LIB_LDLIBS += -lrdmacm WITH_IO = 1 endif From 4220ff8111ec932a788e9a69a31ab95fa344fd40 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Fri, 22 Jun 2018 13:02:41 +0200 Subject: [PATCH 03/35] Implemented request to resolve address and built an rdma_cm_event framework --- include/villas/nodes/infiniband.h | 12 ++- lib/nodes/infiniband.c | 148 ++++++++++++++++++++++++++++-- 2 files changed, 149 insertions(+), 11 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index 64eee7067..d48cb0aee 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -40,6 +40,8 @@ struct format_type; struct infiniband { struct rdma_cm_id *id; + struct rdma_event_channel *ec; + struct ibv_pd *pd; struct ibv_cq *cq; struct ibv_comp_channel *comp_channel; @@ -47,14 +49,18 @@ struct infiniband { pthread_t cq_poller_thread; struct connection_s { - char *src_ip_addr; - char *dst_ip_addr; + struct addrinfo *src_addr; + struct addrinfo *dst_addr; + 
const int timeout; + enum rdma_port_space port_space; struct ibv_qp *qp; struct ibv_mr *mr_payload; struct r_addr_key_s *r_addr_key; } conn; + int is_source; + }; /** @see node_type::reverse */ @@ -76,7 +82,7 @@ int infiniband_destroy(struct node *n); int infiniband_stop(struct node *n); /** @see node_type::init */ -int infiniband_init(); +int infiniband_init(struct super_node *n); /** @see node_type::deinit */ int infiniband_deinit(); diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index a56e8fd6e..18b860819 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -26,25 +26,54 @@ #include #include #include +#include -static void infiniband_log_cb(struct infiniband *ib, void *userdata, int level, const char *str) +static int infiniband_addr_resolved(struct rdma_cm_id *id) { + return 0; } -static void infiniband_connect_cb(struct infiniband *ib, void *userdata, int result) +static int infiniband_route_resolved(struct rdma_cm_id *id) { + return 0; } -static void infiniband_disconnect_cb(struct infiniband *ib, void *userdata, int result) +static int infiniband_connect_request(struct rdma_cm_id *id) { + return 0; } -static void infiniband_message_cb(struct infiniband *ib, void *userdata) +static int infiniband_event(struct rdma_cm_event *event) { -} + int ret = 0; -static void infiniband_subscribe_cb(struct infiniband *ib, void *userdata, int mid, int qos_count, const int *granted_qos) -{ + switch(event->event) + { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ret = infiniband_addr_resolved(event->id); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + error("Address resolution (rdma_resolve_addr) failed!"); + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ret = infiniband_route_resolved(event->id); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + error("Route resolution (rdma_resovle_route) failed!"); + case RDMA_CM_EVENT_CONNECT_REQUEST: + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + error("An error has occurred trying to establish a connection!"); + case RDMA_CM_EVENT_REJECTED: + error("Connection request or response was rejected by the remote end point!"); + case RDMA_CM_EVENT_ESTABLISHED: + ret = 1; + break; + default: + error("Unknown event occurred: %u", + event->event); + } + + return ret; } int infiniband_reverse(struct node *n) @@ -54,6 +83,57 @@ int infiniband_reverse(struct node *n) int infiniband_parse(struct node *n, json_t *cfg) { + struct infiniband *ib = (struct infiniband *) n->_vd; + + int ret; + const char *local = NULL; + const char *remote = NULL; + const char *port_space = NULL; + const int timeout; + + json_error_t err; + ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i}", + "remote", &remote, + "local", &local, + "rdma_port_space", &port_space, + "resolution_timeout", &timeout + ); + if(ret) + jerror(&err, "Failed to parse configuration of node %s", node_name(n)); + + // Translate IP:PORT to a struct addrinfo + ret = getaddrinfo(local, NULL, NULL, &ib->conn.src_addr); + if(ret) { + error("Failed to resolve local address '%s' of node %s: %s", + local, node_name(n), gai_strerror(ret)); + } + + // Translate port space and create rdma_cm_id object + if(strcmp(port_space, "RDMA_PS_IPOIB") == 0) ib->conn.port_space = RDMA_PS_IPOIB; + else if(strcmp(port_space, "RDMA_PS_TCP") == 0) ib->conn.port_space = RDMA_PS_TCP; + else if(strcmp(port_space, "RDMA_PS_UDP") == 0) ib->conn.port_space = RDMA_PS_UDP; + else if(strcmp(port_space, "RDMA_PS_IB") == 0) ib->conn.port_space = RDMA_PS_IB; + else { + error("Failed to translate rdma_port_space in node %s. 
%s is not a valid \ + port space supported by rdma_cma.h!", node_name(n), port_space); + } + + //Check if node is a source and connect to target + if(remote) + { + ib->is_source = 1; + + // Translate address info + ret = getaddrinfo(remote, NULL, NULL, &ib->conn.dst_addr); + if(ret) { + error("Failed to resolve remote address '%s' of node %s: %s", + remote, node_name(n), gai_strerror(ret)); + } + + } + else + ib->is_source = 0; + return 0; } @@ -69,6 +149,57 @@ int infiniband_destroy(struct node *n) int infiniband_start(struct node *n) { + struct infiniband *ib = (struct infiniband *) n->_vd; + struct rdma_cm_event *event = NULL; + int ret; + + // Create event channel + ib->ec = rdma_create_event_channel(); + if(!ib->ec) { + error("Failed to create event channel in node %s!", + node_name(n)); + } + + ret = rdma_create_id(ib->ec, &ib->id, NULL, ib->conn.port_space); + if(ret) { + error("Failed to create rdma_cm_id of node %s: %s", + node_name(n), gai_strerror(ret)); + } + info("Succesfully created CM RDMA ID of node %s", + node_name(n)); + + // Bind rdma_cm_id to the HCA + ret = rdma_bind_addr(ib->id, ib->conn.src_addr->ai_addr); + if(ret) { + error("Failed to bind to local device of node %s: %s", + node_name(n), gai_strerror(ret)); + } + info("Bound to Infiniband device of node %s", + node_name(n)); + + if(ib->is_source) + { + // Resolve address + ret = rdma_resolve_addr(ib->id, NULL, ib->conn.dst_addr->ai_addr, ib->conn.timeout); + if(ret) { + error("Failed to resolve remote address after %ims of node %s: %s", + ib->conn.timeout, node_name(n), gai_strerror(ret)); + } + + } + + // Several events should occur on the event channel, to make + // sure the nodes are succesfully connected. + info("Starting to monitor events on rdma_cm_id.\n"); + while(rdma_get_cm_event(ib->ec, &event) == 0) + { + struct rdma_cm_event event_copy; + memcpy(&event_copy, event, sizeof(*event)); + + if(infiniband_event(&event_copy)) + break; + } + return 0; } @@ -77,8 +208,9 @@ int infiniband_stop(struct node *n) return 0; } -int infiniband_init() +int infiniband_init(struct super_node *n) { + return 0; } From ae22048e0d1c67f55a47bfbf48f78496360c2a7d Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Sat, 23 Jun 2018 13:19:31 +0200 Subject: [PATCH 04/35] Updated Dockerfile with IB Verbs and RDMA CM dependencies --- packaging/docker/Dockerfile.dev | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/packaging/docker/Dockerfile.dev b/packaging/docker/Dockerfile.dev index a20728ea1..b7f9a80f9 100644 --- a/packaging/docker/Dockerfile.dev +++ b/packaging/docker/Dockerfile.dev @@ -80,6 +80,29 @@ RUN dnf -y install \ mosquitto-devel \ comedilib-devel +# IB Verbs Dependencies +RUN dnf -y install \ + libibverbs-utils \ + libibverbs-devel \ + libibverbs-devel-static \ + libmlx4 \ + libmlx5 \ + ibutils \ + libibcm \ + libibcommon \ + libibmad \ + libibumad + +# RDMA CM Dependencies +RUN dnf -y install \ + librdmacm-utils \ + librdmacm-devel \ + librdmacm \ + libibumad-devel \ + perftest + + + # Build & Install Criterion RUN cd /tmp && \ git clone --recursive https://github.com/Snaipe/Criterion && \ From a0f0410e73937585a344f373b22633be70e264a2 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Sat, 23 Jun 2018 14:53:37 +0200 Subject: [PATCH 05/35] This commit should be able to create a connection between two nodes. This commit isn't checked for bugs and no memory management is done. Furthermore, no poll threads are implemented. 
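
For reference, the active (source) side of the rdma_cm sequence implemented by ib_start() and ib_event() in this patch roughly follows the sketch below. It is a minimal illustration only: error handling is left out, the queue pair that must exist before rdma_connect() (set up in ib_build_ibv()) is not shown, and the helper name is invented.

    #include <string.h>
    #include <netdb.h>
    #include <rdma/rdma_cma.h>

    /* Minimal sketch of the active-side connection flow (no error handling). */
    static int ib_connect_active(struct addrinfo *src, struct addrinfo *dst, int timeout_ms)
    {
        struct rdma_event_channel *ec = rdma_create_event_channel();
        struct rdma_cm_event *event;
        struct rdma_cm_id *id;

        rdma_create_id(ec, &id, NULL, RDMA_PS_TCP);
        rdma_bind_addr(id, src->ai_addr);                /* select the local HCA/port */
        rdma_resolve_addr(id, NULL, dst->ai_addr, timeout_ms);

        /* Every step above completes asynchronously via an event on the channel. */
        while (rdma_get_cm_event(ec, &event) == 0) {
            enum rdma_cm_event_type type = event->event;
            rdma_ack_cm_event(event);

            if (type == RDMA_CM_EVENT_ADDR_RESOLVED)
                rdma_resolve_route(id, timeout_ms);
            else if (type == RDMA_CM_EVENT_ROUTE_RESOLVED) {
                struct rdma_conn_param cm_params;
                memset(&cm_params, 0, sizeof(cm_params));
                rdma_connect(id, &cm_params);            /* requires a QP on 'id' */
            }
            else if (type == RDMA_CM_EVENT_ESTABLISHED)
                break;                                   /* connection is usable */
        }

        return 0;
    }

Splitting the connection state machine across these RDMA_CM events is what ib_event() does in the diff below; the listening side is completed in the following patches.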
--- include/villas/nodes/infiniband.h | 28 +++- lib/nodes/infiniband.c | 245 +++++++++++++++++++++++++----- packaging/docker/Dockerfile.dev | 2 - 3 files changed, 230 insertions(+), 45 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index d48cb0aee..6ebe37c8d 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -38,15 +38,26 @@ /* Forward declarations */ struct format_type; +enum poll_mode_e +{ + EVENT, + BUSY +}; + struct infiniband { struct rdma_cm_id *id; struct rdma_event_channel *ec; - struct ibv_pd *pd; - struct ibv_cq *cq; - struct ibv_comp_channel *comp_channel; + struct context_s { + struct ibv_pd *pd; + struct ibv_cq *cq; + struct ibv_comp_channel *comp_channel; + } ctx; - pthread_t cq_poller_thread; + struct poll_s { + enum poll_mode_e poll_mode; + pthread_t cq_poller_thread; + } poll; struct connection_s { struct addrinfo *src_addr; @@ -54,13 +65,18 @@ struct infiniband { const int timeout; enum rdma_port_space port_space; - struct ibv_qp *qp; struct ibv_mr *mr_payload; struct r_addr_key_s *r_addr_key; } conn; - int is_source; + struct init_s { + int cq_size; + enum ibv_qp_type qp_type; + int max_send_wr; + int max_recv_wr; + } init; + int is_source; }; /** @see node_type::reverse */ diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 18b860819..da0456781 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -28,38 +28,169 @@ #include #include -static int infiniband_addr_resolved(struct rdma_cm_id *id) +static void ib_create_busy_poll(struct node *n, struct rdma_cm_id *id) { + struct infiniband *ib = (struct infiniband *) n->_vd; + + // Create completion queue and bind to channel + ib->ctx.cq = ibv_create_cq(ib->id->verbs, ib->init.cq_size, NULL, NULL, 0); + if(!ib->ctx.cq) + error("Could not create completion queue in node %s.", node_name(n)); + + //ToDo: Create poll pthread +} + +static void ib_create_event(struct node *n, struct rdma_cm_id *id) +{ + int ret; + struct infiniband *ib = (struct infiniband *) n->_vd; + + // Create completion channel + ib->ctx.comp_channel = ibv_create_comp_channel(ib->id->verbs); + if(!ib->ctx.comp_channel) + error("Could not create completion channel in node %s.", node_name(n)); + + // Create completion queue and bind to channel + ib->ctx.cq = ibv_create_cq(ib->id->verbs, + ib->init.cq_size, + NULL, + ib->ctx.comp_channel, + 0); + if(!ib->ctx.cq) + error("Could not create completion queue in node %s.", node_name(n)); + + // Request notifications from completion queue + ret = ibv_req_notify_cq(ib->ctx.cq, 0); + if(ret) + error("Failed to request notifiy CQ in node %s: %s", + node_name(n), gai_strerror(ret)); + + //ToDo: Create poll pthread +} + +static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) +{ + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; + + //Allocate protection domain + ib->ctx.pd = ibv_alloc_pd(ib->id->verbs); + if(!ib->ctx.pd) + error("Could not allocate protection domain in node %s.", node_name(n)); + + // Initiate poll mode + switch(ib->poll.poll_mode) + { + case EVENT: + ib_create_event(n, id); + break; + case BUSY: + ib_create_busy_poll(n, id); + break; + } + + // Prepare Queue Pair (QP) attributes + struct ibv_qp_init_attr qp_attr; + qp_attr.send_cq = ib->ctx.cq; + qp_attr.recv_cq = ib->ctx.cq; + qp_attr.qp_type = ib->init.qp_type; + + qp_attr.cap.max_send_wr = ib->init.max_send_wr; + qp_attr.cap.max_recv_wr = ib->init.max_recv_wr; + qp_attr.cap.max_send_sge = 1; + 
qp_attr.cap.max_recv_sge = 1; + + //ToDo: Set maximum inline data + + // Create the actual QP + ret = rdma_create_qp(id, ib->ctx.pd, &qp_attr); + if(ret) + error("Failed to create Queue Pair in node %s.", node_name(n)); + + info("Successfully created Queue Pair in node %s.", node_name(n)); +} + +static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) +{ + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; + + // Build all components from IB Verbs + ib_build_ibv(n, id); + + // Resolve address + ret = rdma_resolve_route(id, ib->conn.timeout); + if(ret) + error("Failed to resolve route in node %s.", node_name(n)); + + info("Successfully resolved address node %s", node_name(n)); + + //ToDo: create check if data can be send inline + return 0; } -static int infiniband_route_resolved(struct rdma_cm_id *id) +static int ib_route_resolved(struct node *n, struct rdma_cm_id *id) { + int ret; + + ib_build_ibv(n, id); + + //ToDo: Post receive WRs + + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); + + // Send connection request + ret = rdma_connect(id, &cm_params); + if(ret) + error("Failed to connect in node %s.", node_name(n)); + + info("Route resolved and called rdma_connect"); + return 0; } -static int infiniband_connect_request(struct rdma_cm_id *id) +static int ib_connect_request(struct node *n, struct rdma_cm_id *id) { + int ret; + info("Received a connection request!"); + + ib_build_ibv(n, id); + + //ToDo: Post receive WRs + + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); + + // Accept connection request + ret = rdma_accept(id, &cm_params); + if(ret) + error("Failed to connect in node %s.", node_name(n)); + + info("Successfully accepted connection request."); + return 0; } -static int infiniband_event(struct rdma_cm_event *event) +static int ib_event(struct node *n, struct rdma_cm_event *event) { int ret = 0; switch(event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = infiniband_addr_resolved(event->id); + ret = ib_addr_resolved(n, event->id); break; case RDMA_CM_EVENT_ADDR_ERROR: error("Address resolution (rdma_resolve_addr) failed!"); case RDMA_CM_EVENT_ROUTE_RESOLVED: - ret = infiniband_route_resolved(event->id); + ret = ib_route_resolved(n, event->id); break; case RDMA_CM_EVENT_ROUTE_ERROR: error("Route resolution (rdma_resovle_route) failed!"); case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = ib_connect_request(n, event->id); break; case RDMA_CM_EVENT_CONNECT_ERROR: error("An error has occurred trying to establish a connection!"); @@ -76,27 +207,38 @@ static int infiniband_event(struct rdma_cm_event *event) return ret; } -int infiniband_reverse(struct node *n) +int ib_reverse(struct node *n) { return 0; } -int infiniband_parse(struct node *n, json_t *cfg) +int ib_parse(struct node *n, json_t *cfg) { struct infiniband *ib = (struct infiniband *) n->_vd; int ret; const char *local = NULL; const char *remote = NULL; - const char *port_space = NULL; - const int timeout; + const char *port_space = "RDMA_PC_TCP"; + const char *poll_mode = "BUSY"; + const char *qp_type = "IBV_QPT_RC"; + int timeout = 1000; + int cq_size = 10; + int max_send_wr = 100; + int max_recv_wr = 100; json_error_t err; - ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i}", + ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i, \ + s?: s, s?: i, s?: s, s?: i, s?: i}", "remote", &remote, "local", &local, "rdma_port_space", &port_space, - "resolution_timeout", &timeout + "resolution_timeout", &timeout, + 
"poll_mode", &poll_mode, + "cq_size", &cq_size, + "qp_type", &qp_type, + "max_send_wr", &max_send_wr, + "max_recv_wr", &max_recv_wr ); if(ret) jerror(&err, "Failed to parse configuration of node %s", node_name(n)); @@ -108,7 +250,7 @@ int infiniband_parse(struct node *n, json_t *cfg) local, node_name(n), gai_strerror(ret)); } - // Translate port space and create rdma_cm_id object + // Translate port space if(strcmp(port_space, "RDMA_PS_IPOIB") == 0) ib->conn.port_space = RDMA_PS_IPOIB; else if(strcmp(port_space, "RDMA_PS_TCP") == 0) ib->conn.port_space = RDMA_PS_TCP; else if(strcmp(port_space, "RDMA_PS_UDP") == 0) ib->conn.port_space = RDMA_PS_UDP; @@ -117,7 +259,31 @@ int infiniband_parse(struct node *n, json_t *cfg) error("Failed to translate rdma_port_space in node %s. %s is not a valid \ port space supported by rdma_cma.h!", node_name(n), port_space); } + + // Translate poll mode + if(strcmp(poll_mode, "EVENT") == 0) ib->poll.poll_mode = EVENT; + else if(strcmp(poll_mode, "BUSY") == 0) ib->poll.poll_mode = BUSY; + else { + error("Failed to translate poll_mode in node %s. %s is not a valid \ + poll mode!", node_name(n), poll_mode); + } + // Set completion queue size + ib->init.cq_size = cq_size; + + // Translate QP type + if(strcmp(qp_type, "IBV_QPT_RC") == 0) ib->init.qp_type = IBV_QPT_RC; + else if(strcmp(qp_type, "IBV_QPT_UC") == 0) ib->init.qp_type = IBV_QPT_UC; + else if(strcmp(qp_type, "IBV_QPT_UD") == 0) ib->init.qp_type = IBV_QPT_UD; + else { + error("Failed to translate qp_type in node %s. %s is not a valid \ + qp_type!", node_name(n), qp_type); + } + + //Set max. send and receive Work Requests + ib->init.max_send_wr = max_send_wr; + ib->init.max_recv_wr = max_recv_wr; + //Check if node is a source and connect to target if(remote) { @@ -137,17 +303,17 @@ int infiniband_parse(struct node *n, json_t *cfg) return 0; } -char * infiniband_print(struct node *n) +char * ib_print(struct node *n) { return 0; } -int infiniband_destroy(struct node *n) +int ib_destroy(struct node *n) { return 0; } -int infiniband_start(struct node *n) +int ib_start(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; struct rdma_cm_event *event = NULL; @@ -180,7 +346,10 @@ int infiniband_start(struct node *n) if(ib->is_source) { // Resolve address - ret = rdma_resolve_addr(ib->id, NULL, ib->conn.dst_addr->ai_addr, ib->conn.timeout); + ret = rdma_resolve_addr(ib->id, + NULL, + ib->conn.dst_addr->ai_addr, + ib->conn.timeout); if(ret) { error("Failed to resolve remote address after %ims of node %s: %s", ib->conn.timeout, node_name(n), gai_strerror(ret)); @@ -190,68 +359,70 @@ int infiniband_start(struct node *n) // Several events should occur on the event channel, to make // sure the nodes are succesfully connected. 
- info("Starting to monitor events on rdma_cm_id.\n"); + info("Starting to monitor events on rdma_cm_id on node %s.", + node_name(n)); + while(rdma_get_cm_event(ib->ec, &event) == 0) { struct rdma_cm_event event_copy; memcpy(&event_copy, event, sizeof(*event)); - if(infiniband_event(&event_copy)) + if(ib_event(n, &event_copy)) break; } return 0; } -int infiniband_stop(struct node *n) +int ib_stop(struct node *n) { return 0; } -int infiniband_init(struct super_node *n) +int ib_init(struct super_node *n) { return 0; } -int infiniband_deinit() +int ib_deinit() { return 0; } -int infiniband_read(struct node *n, struct sample *smps[], unsigned cnt) +int ib_read(struct node *n, struct sample *smps[], unsigned cnt) { return 0; } -int infiniband_write(struct node *n, struct sample *smps[], unsigned cnt) +int ib_write(struct node *n, struct sample *smps[], unsigned cnt) { return 0; } -int infiniband_fd(struct node *n) +int ib_fd(struct node *n) { return 0; } static struct plugin p = { .name = "infiniband", - .description = "Infiniband)", + .description = "Infiniband", .type = PLUGIN_TYPE_NODE, .node = { .vectorize = 0, .size = sizeof(struct infiniband), - .reverse = infiniband_reverse, - .parse = infiniband_parse, - .print = infiniband_print, - .start = infiniband_start, - .destroy = infiniband_destroy, - .stop = infiniband_stop, - .init = infiniband_init, - .deinit = infiniband_deinit, - .read = infiniband_read, - .write = infiniband_write, - .fd = infiniband_fd + .reverse = ib_reverse, + .parse = ib_parse, + .print = ib_print, + .start = ib_start, + .destroy = ib_destroy, + .stop = ib_stop, + .init = ib_init, + .deinit = ib_deinit, + .read = ib_read, + .write = ib_write, + .fd = ib_fd } }; diff --git a/packaging/docker/Dockerfile.dev b/packaging/docker/Dockerfile.dev index b7f9a80f9..6f10d1736 100644 --- a/packaging/docker/Dockerfile.dev +++ b/packaging/docker/Dockerfile.dev @@ -101,8 +101,6 @@ RUN dnf -y install \ libibumad-devel \ perftest - - # Build & Install Criterion RUN cd /tmp && \ git clone --recursive https://github.com/Snaipe/Criterion && \ From 4f6c2543b5ee99920b2c90f215a3ba1ca100c70c Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Sat, 23 Jun 2018 19:05:33 +0200 Subject: [PATCH 06/35] Fixed bugs in connection. A source node is now able to connect to a target node. 
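
The passive (target) side that this patch completes can be summarised by the sketch below: bind a listening rdma_cm_id, wait for RDMA_CM_EVENT_CONNECT_REQUEST and accept it on the new per-connection id carried in the event. This is illustrative only (no error handling, invented helper name) and mirrors what rdma_listen() and ib_connect_request() do in the diff below.

    #include <string.h>
    #include <rdma/rdma_cma.h>

    /* Minimal sketch of the passive-side accept path (no error handling). */
    static int ib_accept_passive(struct rdma_event_channel *ec, struct rdma_cm_id *listen_id)
    {
        struct rdma_cm_event *event;

        rdma_listen(listen_id, 10);                        /* backlog of 10, as in ib_start() */

        while (rdma_get_cm_event(ec, &event) == 0) {
            struct rdma_cm_event event_copy;
            memcpy(&event_copy, event, sizeof(*event));
            rdma_ack_cm_event(event);

            if (event_copy.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
                struct rdma_conn_param cm_params;
                memset(&cm_params, 0, sizeof(cm_params));

                /* event_copy.id is a new id for this connection; a queue pair
                 * has to be created on it before calling rdma_accept(). */
                rdma_accept(event_copy.id, &cm_params);
            }
            else if (event_copy.event == RDMA_CM_EVENT_ESTABLISHED)
                break;
        }

        return 0;
    }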
--- include/villas/nodes/infiniband.h | 10 ++-- lib/nodes/infiniband.c | 83 +++++++++++++++++-------------- 2 files changed, 50 insertions(+), 43 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index 6ebe37c8d..ae0928eb6 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -62,21 +62,17 @@ struct infiniband { struct connection_s { struct addrinfo *src_addr; struct addrinfo *dst_addr; - const int timeout; + int timeout; enum rdma_port_space port_space; struct ibv_mr *mr_payload; struct r_addr_key_s *r_addr_key; } conn; - struct init_s { - int cq_size; - enum ibv_qp_type qp_type; - int max_send_wr; - int max_recv_wr; - } init; + struct ibv_qp_init_attr qp_init; int is_source; + int cq_size; }; /** @see node_type::reverse */ diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index da0456781..e51c1bd90 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -33,7 +33,7 @@ static void ib_create_busy_poll(struct node *n, struct rdma_cm_id *id) struct infiniband *ib = (struct infiniband *) n->_vd; // Create completion queue and bind to channel - ib->ctx.cq = ibv_create_cq(ib->id->verbs, ib->init.cq_size, NULL, NULL, 0); + ib->ctx.cq = ibv_create_cq(ib->id->verbs, ib->cq_size, NULL, NULL, 0); if(!ib->ctx.cq) error("Could not create completion queue in node %s.", node_name(n)); @@ -52,7 +52,7 @@ static void ib_create_event(struct node *n, struct rdma_cm_id *id) // Create completion queue and bind to channel ib->ctx.cq = ibv_create_cq(ib->id->verbs, - ib->init.cq_size, + ib->cq_size, NULL, ib->ctx.comp_channel, 0); @@ -89,25 +89,18 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) break; } - // Prepare Queue Pair (QP) attributes - struct ibv_qp_init_attr qp_attr; - qp_attr.send_cq = ib->ctx.cq; - qp_attr.recv_cq = ib->ctx.cq; - qp_attr.qp_type = ib->init.qp_type; - - qp_attr.cap.max_send_wr = ib->init.max_send_wr; - qp_attr.cap.max_recv_wr = ib->init.max_recv_wr; - qp_attr.cap.max_send_sge = 1; - qp_attr.cap.max_recv_sge = 1; + // Prepare remaining Queue Pair (QP) attributes + ib->qp_init.send_cq = ib->ctx.cq; + ib->qp_init.recv_cq = ib->ctx.cq; //ToDo: Set maximum inline data // Create the actual QP - ret = rdma_create_qp(id, ib->ctx.pd, &qp_attr); + ret = rdma_create_qp(id, ib->ctx.pd, &ib->qp_init); if(ret) error("Failed to create Queue Pair in node %s.", node_name(n)); - info("Successfully created Queue Pair in node %s.", node_name(n)); + info("Successfully created Queue Pair."); } static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) @@ -115,6 +108,8 @@ static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) struct infiniband *ib = (struct infiniband *) n->_vd; int ret; + info("Successfully resolved address."); + // Build all components from IB Verbs ib_build_ibv(n, id); @@ -123,8 +118,6 @@ static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) if(ret) error("Failed to resolve route in node %s.", node_name(n)); - info("Successfully resolved address node %s", node_name(n)); - //ToDo: create check if data can be send inline return 0; @@ -134,7 +127,7 @@ static int ib_route_resolved(struct node *n, struct rdma_cm_id *id) { int ret; - ib_build_ibv(n, id); + info("Successfully resolved route."); //ToDo: Post receive WRs @@ -146,7 +139,7 @@ static int ib_route_resolved(struct node *n, struct rdma_cm_id *id) if(ret) error("Failed to connect in node %s.", node_name(n)); - info("Route resolved and called rdma_connect"); + info("Called 
rdma_connect."); return 0; } @@ -197,6 +190,7 @@ static int ib_event(struct node *n, struct rdma_cm_event *event) case RDMA_CM_EVENT_REJECTED: error("Connection request or response was rejected by the remote end point!"); case RDMA_CM_EVENT_ESTABLISHED: + info("Connection established!"); ret = 1; break; default: @@ -244,7 +238,7 @@ int ib_parse(struct node *n, json_t *cfg) jerror(&err, "Failed to parse configuration of node %s", node_name(n)); // Translate IP:PORT to a struct addrinfo - ret = getaddrinfo(local, NULL, NULL, &ib->conn.src_addr); + ret = getaddrinfo(local, (char *)"13337", NULL, &ib->conn.src_addr); if(ret) { error("Failed to resolve local address '%s' of node %s: %s", local, node_name(n), gai_strerror(ret)); @@ -259,6 +253,9 @@ int ib_parse(struct node *n, json_t *cfg) error("Failed to translate rdma_port_space in node %s. %s is not a valid \ port space supported by rdma_cma.h!", node_name(n), port_space); } + + // Set timeout + ib->conn.timeout = timeout; // Translate poll mode if(strcmp(poll_mode, "EVENT") == 0) ib->poll.poll_mode = EVENT; @@ -269,33 +266,36 @@ int ib_parse(struct node *n, json_t *cfg) } // Set completion queue size - ib->init.cq_size = cq_size; + ib->cq_size = cq_size; // Translate QP type - if(strcmp(qp_type, "IBV_QPT_RC") == 0) ib->init.qp_type = IBV_QPT_RC; - else if(strcmp(qp_type, "IBV_QPT_UC") == 0) ib->init.qp_type = IBV_QPT_UC; - else if(strcmp(qp_type, "IBV_QPT_UD") == 0) ib->init.qp_type = IBV_QPT_UD; + if(strcmp(qp_type, "IBV_QPT_RC") == 0) ib->qp_init.qp_type = IBV_QPT_RC; + else if(strcmp(qp_type, "IBV_QPT_UC") == 0) ib->qp_init.qp_type = IBV_QPT_UC; + else if(strcmp(qp_type, "IBV_QPT_UD") == 0) ib->qp_init.qp_type = IBV_QPT_UD; else { error("Failed to translate qp_type in node %s. %s is not a valid \ qp_type!", node_name(n), qp_type); } - //Set max. send and receive Work Requests - ib->init.max_send_wr = max_send_wr; - ib->init.max_recv_wr = max_recv_wr; - + // Set max. 
send and receive Work Requests + ib->qp_init.cap.max_send_wr = max_send_wr; + ib->qp_init.cap.max_recv_wr = max_recv_wr; + + // Set remaining QP attributes + ib->qp_init.cap.max_send_sge = 1; + ib->qp_init.cap.max_recv_sge = 1; + //Check if node is a source and connect to target if(remote) { ib->is_source = 1; // Translate address info - ret = getaddrinfo(remote, NULL, NULL, &ib->conn.dst_addr); + ret = getaddrinfo(remote, (char *)"13337", NULL, &ib->conn.dst_addr); if(ret) { error("Failed to resolve remote address '%s' of node %s: %s", remote, node_name(n), gai_strerror(ret)); } - } else ib->is_source = 0; @@ -331,8 +331,7 @@ int ib_start(struct node *n) error("Failed to create rdma_cm_id of node %s: %s", node_name(n), gai_strerror(ret)); } - info("Succesfully created CM RDMA ID of node %s", - node_name(n)); + info("Succesfully created rdma_cm_id."); // Bind rdma_cm_id to the HCA ret = rdma_bind_addr(ib->id, ib->conn.src_addr->ai_addr); @@ -340,8 +339,7 @@ int ib_start(struct node *n) error("Failed to bind to local device of node %s: %s", node_name(n), gai_strerror(ret)); } - info("Bound to Infiniband device of node %s", - node_name(n)); + info("Bound rdma_cm_id to Infiniband device."); if(ib->is_source) { @@ -354,19 +352,27 @@ int ib_start(struct node *n) error("Failed to resolve remote address after %ims of node %s: %s", ib->conn.timeout, node_name(n), gai_strerror(ret)); } - + } + else + { + // Listen on rdma_cm_id for events + ret = rdma_listen(ib->id, 10); + if(ret) { + error("Failed to listen to rdma_cm_id on node %s", node_name(n)); + } } // Several events should occur on the event channel, to make // sure the nodes are succesfully connected. - info("Starting to monitor events on rdma_cm_id on node %s.", - node_name(n)); + info("Starting to monitor events on rdma_cm_id."); while(rdma_get_cm_event(ib->ec, &event) == 0) { struct rdma_cm_event event_copy; memcpy(&event_copy, event, sizeof(*event)); + rdma_ack_cm_event(event); + if(ib_event(n, &event_copy)) break; } @@ -397,6 +403,11 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) int ib_write(struct node *n, struct sample *smps[], unsigned cnt) { + + for(int i = 0; i < smps[0]->length; i++) + { + printf("Sample %i: %f\n", i, smps[0]->data[i].f); + } return 0; } From 077355d8ba5edbdc8de9b078d5810357f5c9b558 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Sun, 24 Jun 2018 13:02:04 +0200 Subject: [PATCH 07/35] Started to implement memory managemen and registration. Not yet working and in an early stage. 
Still some debugging and testing printfs --- include/villas/nodes/infiniband.h | 18 +++++++- lib/nodes/infiniband.c | 74 +++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index ae0928eb6..f072613d3 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -44,6 +44,15 @@ enum poll_mode_e BUSY }; +struct payload_s { + int data; +}; + +struct r_addr_key_s { + uint64_t remote_addr; + uint32_t rkey; +}; + struct infiniband { struct rdma_cm_id *id; struct rdma_event_channel *ec; @@ -65,7 +74,6 @@ struct infiniband { int timeout; enum rdma_port_space port_space; - struct ibv_mr *mr_payload; struct r_addr_key_s *r_addr_key; } conn; @@ -73,6 +81,14 @@ struct infiniband { int is_source; int cq_size; + + struct ib_memory { + struct pool p_recv; + struct pool p_send; + + struct ibv_mr *mr_recv; + struct ibv_mr *mr_send; + } mem; }; /** @see node_type::reverse */ diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index e51c1bd90..d68a3507d 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -26,6 +26,9 @@ #include #include #include +#include +#include + #include static void ib_create_busy_poll(struct node *n, struct rdma_cm_id *id) @@ -101,6 +104,75 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) error("Failed to create Queue Pair in node %s.", node_name(n)); info("Successfully created Queue Pair."); + + // Allocate memory + ib->mem.p_recv.state = STATE_DESTROYED; + ib->mem.p_recv.queue.state = STATE_DESTROYED; + + // Set pool size to maximum size of Receive Queue + pool_init(&ib->mem.p_recv, + //ib->qp_init.cap.max_recv_wr, + 1, + sizeof(struct payload_s), + &memtype_heap); + if(ret) { + error("Failed to init recv memory pool of node %s: %s", + node_name(n), gai_strerror(ret)); + } + + //ToDo: initialize r_addr_key struct if mode is RDMA + struct payload_s* test; + test = pool_get(&ib->mem.p_recv); + + printf("Address value: %p\n", test); + printf("Address pool: %p\n", &ib->mem.p_recv); + printf("Address calculated: %p\n", &ib->mem.p_recv+ib->mem.p_recv.buffer_off); + + printf("Offset: %li\n", ib->mem.p_recv.buffer_off); + printf("Size of struct: %lu\n", sizeof(struct payload_s)); + printf("Size of block: %lu\n", ib->mem.p_recv.blocksz); + + // Register memory for IB Device. Not necessary if data is send + // exclusively inline + ib->mem.mr_recv = ibv_reg_mr( + ib->ctx.pd, + &ib->mem.p_recv+ib->mem.p_recv.buffer_off, + ib->mem.p_recv.len*ib->mem.p_recv.blocksz, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if(!ib->mem.mr_recv) { + error("Failed to register mr_recv with ibv_reg_mr of node %s.", + node_name(n)); + } + + if(ib->is_source) + { + ib->mem.p_send.state = STATE_DESTROYED; + ib->mem.p_send.queue.state = STATE_DESTROYED; + + // Set pool size to maximum size of Receive Queue + pool_init(&ib->mem.p_send, + ib->qp_init.cap.max_send_wr, + sizeof(struct payload_s), + &memtype_heap); + if(ret) { + error("Failed to init send memory of node %s: %s", + node_name(n), gai_strerror(ret)); + } + + //ToDo: initialize r_addr_key struct if mode is RDMA + + // Register memory for IB Device. 
Not necessary if data is send + // exclusively inline + ib->mem.mr_send = ibv_reg_mr( + ib->ctx.pd, + &ib->mem.p_send+ib->mem.p_send.buffer_off, + ib->mem.p_send.len*ib->mem.p_send.blocksz, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if(!ib->mem.mr_send) { + error("Failed to register mr_send with ibv_reg_mr of node %s.", + node_name(n)); + } + } } static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) @@ -238,6 +310,7 @@ int ib_parse(struct node *n, json_t *cfg) jerror(&err, "Failed to parse configuration of node %s", node_name(n)); // Translate IP:PORT to a struct addrinfo + //ToDo: Fix fixed port ret = getaddrinfo(local, (char *)"13337", NULL, &ib->conn.src_addr); if(ret) { error("Failed to resolve local address '%s' of node %s: %s", @@ -291,6 +364,7 @@ int ib_parse(struct node *n, json_t *cfg) ib->is_source = 1; // Translate address info + //ToDo: Fix fixed port ret = getaddrinfo(remote, (char *)"13337", NULL, &ib->conn.dst_addr); if(ret) { error("Failed to resolve remote address '%s' of node %s: %s", From 4fef5d67e5c1ee5e749c926a7a7b157549960d45 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Mon, 25 Jun 2018 18:21:44 +0200 Subject: [PATCH 08/35] Memory is registrated succesfully. Created framework to send data. At this moment, the receiver doesn't prepare Receive Work Requests. This is the first thing to fix after this commit --- lib/nodes/infiniband.c | 87 +++++++++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 18 deletions(-) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index d68a3507d..8ff29cdba 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -111,8 +111,7 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) // Set pool size to maximum size of Receive Queue pool_init(&ib->mem.p_recv, - //ib->qp_init.cap.max_recv_wr, - 1, + ib->qp_init.cap.max_recv_wr, sizeof(struct payload_s), &memtype_heap); if(ret) { @@ -121,28 +120,19 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) } //ToDo: initialize r_addr_key struct if mode is RDMA - struct payload_s* test; - test = pool_get(&ib->mem.p_recv); - - printf("Address value: %p\n", test); - printf("Address pool: %p\n", &ib->mem.p_recv); - printf("Address calculated: %p\n", &ib->mem.p_recv+ib->mem.p_recv.buffer_off); - - printf("Offset: %li\n", ib->mem.p_recv.buffer_off); - printf("Size of struct: %lu\n", sizeof(struct payload_s)); - printf("Size of block: %lu\n", ib->mem.p_recv.blocksz); // Register memory for IB Device. 
Not necessary if data is send // exclusively inline ib->mem.mr_recv = ibv_reg_mr( ib->ctx.pd, - &ib->mem.p_recv+ib->mem.p_recv.buffer_off, - ib->mem.p_recv.len*ib->mem.p_recv.blocksz, + (char*)&ib->mem.p_recv+ib->mem.p_recv.buffer_off, + ib->mem.p_recv.len, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); if(!ib->mem.mr_recv) { error("Failed to register mr_recv with ibv_reg_mr of node %s.", node_name(n)); } + info("Allocated receive memory."); if(ib->is_source) { @@ -165,13 +155,14 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) // exclusively inline ib->mem.mr_send = ibv_reg_mr( ib->ctx.pd, - &ib->mem.p_send+ib->mem.p_send.buffer_off, - ib->mem.p_send.len*ib->mem.p_send.blocksz, + (char*)&ib->mem.p_send+ib->mem.p_send.buffer_off, + ib->mem.p_send.len, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); if(!ib->mem.mr_send) { error("Failed to register mr_send with ibv_reg_mr of node %s.", node_name(n)); } + info("Allocated send memory."); } } @@ -472,17 +463,77 @@ int ib_deinit() int ib_read(struct node *n, struct sample *smps[], unsigned cnt) { - return 0; + //Create separate thread for polling! This impelemtation is just + //for testing purposes + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; + struct ibv_wc wc[100]; + + ret = ibv_poll_cq(ib->ctx.cq, 100, wc); + + + + + return ret; } int ib_write(struct node *n, struct sample *smps[], unsigned cnt) { + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; + struct ibv_send_wr wr; + struct ibv_send_wr *bad_wr = NULL; + struct ibv_sge sg_list; + + memset(&wr, 0, sizeof(wr)); + + struct payload_s *payl; + payl = pool_get(&ib->mem.p_send); + + payl->data = 1337; + + // If data is send inline, it is not necessary to copy data to protected + // memory region first. + if(1) + { + //sg_list.addr = (uint64_t)smps[0]->data; + //sg_list.length = smps[0]->length-1; + sg_list.addr = (uintptr_t)payl; + sg_list.length = 1; + // lkey not necessary + } + else + { + //- copy value to send_region + //- give pointer to start of array + } + + // Set Send Work Request + wr.wr_id = 123; //ToDo: set this to a useful value + wr.sg_list = &sg_list; + wr.num_sge = 1; //ToDo: Right now only smps[0] is sg_list. This can be extended + //furthermore we should break the transaction up if inline mode + //is selected + wr.next = NULL; + wr.send_flags = IBV_SEND_SIGNALED; + wr.imm_data = htonl(0); //ToDo: set this to a useful value + wr.opcode = IBV_WR_SEND_WITH_IMM; for(int i = 0; i < smps[0]->length; i++) { printf("Sample %i: %f\n", i, smps[0]->data[i].f); } - return 0; + + ret = ibv_post_send(ib->id->qp, &wr, &bad_wr); + if(ret) + { + error("Failed to send message in node %s: %s", + node_name(n), gai_strerror(ret)); + + return -ret; + } + + return cnt; } int ib_fd(struct node *n) From 9e5836001d9f649b84f4d84456771e0c24bd2346 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 27 Jun 2018 10:37:46 +0200 Subject: [PATCH 09/35] Node is able to send messages. A lot is hardcoded and this was only meant to be a first setup of a working node. 
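
The send path added below builds one work request per sample value and sets the inline bit via send_inline<<3, which happens to equal IBV_SEND_INLINE. As a point of reference, a single inline send can be posted as in the sketch below (hypothetical helper, not part of this patch); note that inline sends only succeed if the queue pair was created with a sufficiently large cap.max_inline_data, which is still an open ToDo in ib_build_ibv().

    #include <stdint.h>
    #include <arpa/inet.h>
    #include <infiniband/verbs.h>

    /* Sketch: post one signalled, inline send of a single double (no error handling). */
    static int ib_post_inline_send(struct ibv_qp *qp, double *value)
    {
        struct ibv_send_wr *bad_wr = NULL;
        struct ibv_sge sge = {
            .addr   = (uintptr_t) value,
            .length = sizeof(*value),
            /* .lkey is ignored for inline sends */
        };
        struct ibv_send_wr wr = {
            .wr_id      = 0,
            .next       = NULL,
            .sg_list    = &sge,
            .num_sge    = 1,
            .opcode     = IBV_WR_SEND_WITH_IMM,
            .send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE,
            .imm_data   = htonl(0),
        };

        return ibv_post_send(qp, &wr, &bad_wr);
    }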
--- include/villas/nodes/infiniband.h | 5 +- lib/nodes/infiniband.c | 153 ++++++++++++++++++++++-------- 2 files changed, 112 insertions(+), 46 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index f072613d3..c0a8a825a 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -44,16 +44,13 @@ enum poll_mode_e BUSY }; -struct payload_s { - int data; -}; - struct r_addr_key_s { uint64_t remote_addr; uint32_t rkey; }; struct infiniband { + struct rdma_cm_id *listen_id; struct rdma_cm_id *id; struct rdma_event_channel *ec; diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 8ff29cdba..7044d1eb7 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -31,6 +31,30 @@ #include +int ib_post_recv_wrs(struct node *n) +{ + struct infiniband *ib = (struct infiniband *) n->_vd; + struct ibv_recv_wr wr, *bad_wr = NULL; + int ret; + struct ibv_sge sge; + + // Prepare receive Scatter/Gather element + sge.addr = (uintptr_t)pool_get(&ib->mem.p_recv); + sge.length = ib->mem.p_recv.blocksz; + sge.lkey = ib->mem.mr_recv->lkey; + + // Prepare a receive Work Request + wr.wr_id = (uintptr_t)sge.addr; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + + // Post Work Request + ret = ibv_post_recv(ib->id->qp, &wr, &bad_wr); + + return ret; +} + static void ib_create_busy_poll(struct node *n, struct rdma_cm_id *id) { struct infiniband *ib = (struct infiniband *) n->_vd; @@ -103,7 +127,7 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) if(ret) error("Failed to create Queue Pair in node %s.", node_name(n)); - info("Successfully created Queue Pair."); + info("Created Queue Pair."); // Allocate memory ib->mem.p_recv.state = STATE_DESTROYED; @@ -112,7 +136,7 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) // Set pool size to maximum size of Receive Queue pool_init(&ib->mem.p_recv, ib->qp_init.cap.max_recv_wr, - sizeof(struct payload_s), + sizeof(double), &memtype_heap); if(ret) { error("Failed to init recv memory pool of node %s: %s", @@ -142,7 +166,7 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) // Set pool size to maximum size of Receive Queue pool_init(&ib->mem.p_send, ib->qp_init.cap.max_send_wr, - sizeof(struct payload_s), + sizeof(double), &memtype_heap); if(ret) { error("Failed to init send memory of node %s: %s", @@ -163,7 +187,20 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) node_name(n)); } info("Allocated send memory."); + } + + // Post Receive Work Requests to be able to receive data + // Fill complete Receive Queue during initialization + for(int i=0; iqp_init.cap.max_recv_wr; i++) + { + ret = ib_post_recv_wrs(n); + if(ret) { + error("Failed to post initial receive Work Requests of node %s.", + node_name(n)); + } + } + info("Filled the complete Receive Queue."); } static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) @@ -209,9 +246,11 @@ static int ib_route_resolved(struct node *n, struct rdma_cm_id *id) static int ib_connect_request(struct node *n, struct rdma_cm_id *id) { + struct infiniband *ib = (struct infiniband *) n->_vd; int ret; info("Received a connection request!"); + ib->id = id; ib_build_ibv(n, id); //ToDo: Post receive WRs @@ -398,6 +437,9 @@ int ib_start(struct node *n) } info("Succesfully created rdma_cm_id."); + // The ID will be overwritten for the target + ib->listen_id = ib->id; + // Bind rdma_cm_id to the HCA ret = rdma_bind_addr(ib->id, ib->conn.src_addr->ai_addr); if(ret) { @@ -421,7 
+463,7 @@ int ib_start(struct node *n) else { // Listen on rdma_cm_id for events - ret = rdma_listen(ib->id, 10); + ret = rdma_listen(ib->listen_id, 10); if(ret) { error("Failed to listen to rdma_cm_id on node %s", node_name(n)); } @@ -467,64 +509,80 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) //for testing purposes struct infiniband *ib = (struct infiniband *) n->_vd; int ret; - struct ibv_wc wc[100]; + struct ibv_wc wc[cnt]; + union { + double f; + int64_t i; + } *data; - ret = ibv_poll_cq(ib->ctx.cq, 100, wc); + ret = ibv_poll_cq(ib->ctx.cq, cnt, wc); + if(ret) + { + data = malloc(ret*sizeof(double)); - + for(int i=0; ilength = ret; + smps[0]->capacity = cnt; + memcpy(smps[0]->data, data, ret*sizeof(double)); + } return ret; } int ib_write(struct node *n, struct sample *smps[], unsigned cnt) { + /* Send pool is not used at this moment! */ struct infiniband *ib = (struct infiniband *) n->_vd; int ret; - struct ibv_send_wr wr; - struct ibv_send_wr *bad_wr = NULL; - struct ibv_sge sg_list; + struct ibv_send_wr wr[smps[0]->length], *bad_wr = NULL; + struct ibv_sge sge[smps[0]->length]; memset(&wr, 0, sizeof(wr)); - struct payload_s *payl; - payl = pool_get(&ib->mem.p_send); + //ToDo: Place this into configuration and create checks if settings are valid + int send_inline = 1; - payl->data = 1337; - - // If data is send inline, it is not necessary to copy data to protected - // memory region first. - if(1) + for(int i=0; ilength; i++) { - //sg_list.addr = (uint64_t)smps[0]->data; - //sg_list.length = smps[0]->length-1; - sg_list.addr = (uintptr_t)payl; - sg_list.length = 1; - // lkey not necessary - } - else - { - //- copy value to send_region - //- give pointer to start of array - } + // If data is send inline, it is not necessary to copy data to protected + // memory region first. + if(send_inline) + { + sge[i].addr = (uint64_t)&smps[0]->data[i].f; + sge[i].length = sizeof(double); + } + else + { + //- copy value to send_region + //- give pointer to start of array + } - // Set Send Work Request - wr.wr_id = 123; //ToDo: set this to a useful value - wr.sg_list = &sg_list; - wr.num_sge = 1; //ToDo: Right now only smps[0] is sg_list. This can be extended - //furthermore we should break the transaction up if inline mode - //is selected - wr.next = NULL; - wr.send_flags = IBV_SEND_SIGNALED; - wr.imm_data = htonl(0); //ToDo: set this to a useful value - wr.opcode = IBV_WR_SEND_WITH_IMM; + // Set Send Work Request + wr[i].wr_id = 0; //ToDo: set this to a useful value + wr[i].sg_list = &sge[i]; + wr[i].num_sge = 1; //ToDo: Right now only smps[0] is sg_list. 
This can be extended + //furthermore we should break the transaction up if inline mode + //is selected + + if(i == (smps[0]->length-1)) + wr[i].next = NULL; + else + wr[i].next = &wr[i+1]; + wr[i].send_flags = IBV_SEND_SIGNALED | (send_inline<<3); + wr[i].imm_data = htonl(0); //ToDo: set this to a useful value + wr[i].opcode = IBV_WR_SEND_WITH_IMM; - for(int i = 0; i < smps[0]->length; i++) - { - printf("Sample %i: %f\n", i, smps[0]->data[i].f); } - ret = ibv_post_send(ib->id->qp, &wr, &bad_wr); + //Send linked list of Work Requests + ret = ibv_post_send(ib->id->qp, wr, &bad_wr); if(ret) { error("Failed to send message in node %s: %s", @@ -532,6 +590,17 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) return -ret; } + + /* Debugging */ + struct ibv_wc wc[5]; + int size; + while(1) + { + size = ibv_poll_cq(ib->ctx.cq, 5, wc); + if(size) + for(int j=0; j Date: Wed, 27 Jun 2018 11:21:28 +0200 Subject: [PATCH 10/35] Refactored the rdma_cm_id structure --- include/villas/nodes/infiniband.h | 2 +- lib/nodes/infiniband.c | 41 ++++++++++++++++--------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index c0a8a825a..122521a80 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -68,8 +68,8 @@ struct infiniband { struct connection_s { struct addrinfo *src_addr; struct addrinfo *dst_addr; - int timeout; enum rdma_port_space port_space; + int timeout; struct r_addr_key_s *r_addr_key; } conn; diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 7044d1eb7..436412c77 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -55,7 +55,7 @@ int ib_post_recv_wrs(struct node *n) return ret; } -static void ib_create_busy_poll(struct node *n, struct rdma_cm_id *id) +static void ib_create_busy_poll(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; @@ -67,7 +67,7 @@ static void ib_create_busy_poll(struct node *n, struct rdma_cm_id *id) //ToDo: Create poll pthread } -static void ib_create_event(struct node *n, struct rdma_cm_id *id) +static void ib_create_event(struct node *n) { int ret; struct infiniband *ib = (struct infiniband *) n->_vd; @@ -95,7 +95,7 @@ static void ib_create_event(struct node *n, struct rdma_cm_id *id) //ToDo: Create poll pthread } -static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) +static void ib_build_ibv(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; int ret; @@ -109,10 +109,10 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) switch(ib->poll.poll_mode) { case EVENT: - ib_create_event(n, id); + ib_create_event(n); break; case BUSY: - ib_create_busy_poll(n, id); + ib_create_busy_poll(n); break; } @@ -123,7 +123,7 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) //ToDo: Set maximum inline data // Create the actual QP - ret = rdma_create_qp(id, ib->ctx.pd, &ib->qp_init); + ret = rdma_create_qp(ib->id, ib->ctx.pd, &ib->qp_init); if(ret) error("Failed to create Queue Pair in node %s.", node_name(n)); @@ -203,7 +203,7 @@ static void ib_build_ibv(struct node *n, struct rdma_cm_id *id) info("Filled the complete Receive Queue."); } -static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) +static int ib_addr_resolved(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; int ret; @@ -211,10 +211,10 @@ static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) info("Successfully resolved address."); 
// Build all components from IB Verbs - ib_build_ibv(n, id); + ib_build_ibv(n); // Resolve address - ret = rdma_resolve_route(id, ib->conn.timeout); + ret = rdma_resolve_route(ib->id, ib->conn.timeout); if(ret) error("Failed to resolve route in node %s.", node_name(n)); @@ -223,8 +223,9 @@ static int ib_addr_resolved(struct node *n, struct rdma_cm_id *id) return 0; } -static int ib_route_resolved(struct node *n, struct rdma_cm_id *id) +static int ib_route_resolved(struct node *n) { + struct infiniband *ib = (struct infiniband *) n->_vd; int ret; info("Successfully resolved route."); @@ -235,7 +236,7 @@ static int ib_route_resolved(struct node *n, struct rdma_cm_id *id) memset(&cm_params, 0, sizeof(cm_params)); // Send connection request - ret = rdma_connect(id, &cm_params); + ret = rdma_connect(ib->id, &cm_params); if(ret) error("Failed to connect in node %s.", node_name(n)); @@ -251,15 +252,13 @@ static int ib_connect_request(struct node *n, struct rdma_cm_id *id) info("Received a connection request!"); ib->id = id; - ib_build_ibv(n, id); + ib_build_ibv(n); - //ToDo: Post receive WRs - struct rdma_conn_param cm_params; memset(&cm_params, 0, sizeof(cm_params)); // Accept connection request - ret = rdma_accept(id, &cm_params); + ret = rdma_accept(ib->id, &cm_params); if(ret) error("Failed to connect in node %s.", node_name(n)); @@ -275,12 +274,12 @@ static int ib_event(struct node *n, struct rdma_cm_event *event) switch(event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = ib_addr_resolved(n, event->id); + ret = ib_addr_resolved(n); break; case RDMA_CM_EVENT_ADDR_ERROR: error("Address resolution (rdma_resolve_addr) failed!"); case RDMA_CM_EVENT_ROUTE_RESOLVED: - ret = ib_route_resolved(n, event->id); + ret = ib_route_resolved(n); break; case RDMA_CM_EVENT_ROUTE_ERROR: error("Route resolution (rdma_resovle_route) failed!"); @@ -437,9 +436,6 @@ int ib_start(struct node *n) } info("Succesfully created rdma_cm_id."); - // The ID will be overwritten for the target - ib->listen_id = ib->id; - // Bind rdma_cm_id to the HCA ret = rdma_bind_addr(ib->id, ib->conn.src_addr->ai_addr); if(ret) { @@ -462,6 +458,11 @@ int ib_start(struct node *n) } else { + // The ID will be overwritten for the target. If the event type is + // RDMA_CM_EVENT_CONNECT_REQUEST, >then this references a new id for + // that communication. + ib->listen_id = ib->id; + // Listen on rdma_cm_id for events ret = rdma_listen(ib->listen_id, 10); if(ret) { From 1df18da3f2e9bddb4c06b180dc0ea5870ce95b42 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 27 Jun 2018 17:01:47 +0200 Subject: [PATCH 11/35] Added Work Completion threads. Send Completion thread checks for errors, Receive Completion Queue is still empty. 
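
For comparison, the canonical event-driven completion loop (cf. the ibv_get_cq_event(3) man page) looks roughly like the sketch below; acknowledging and re-arming the CQ before draining it closes the window in which a completion could otherwise arrive unnoticed. The thread added in this patch follows a similar structure (names here are illustrative).

    #include <stdio.h>
    #include <infiniband/verbs.h>

    /* Sketch of an event-driven completion-queue poller (runs as a pthread). */
    static void * ib_cq_event_loop(void *arg)
    {
        struct ibv_comp_channel *channel = arg;       /* the node's completion channel */
        struct ibv_wc wc[16];
        struct ibv_cq *cq;
        void *cq_ctx;
        int n;

        while (1) {
            ibv_get_cq_event(channel, &cq, &cq_ctx);  /* blocks until the CQ signals */
            ibv_ack_cq_events(cq, 1);                 /* acknowledge the event       */
            ibv_req_notify_cq(cq, 0);                 /* re-arm, then drain          */

            while ((n = ibv_poll_cq(cq, 16, wc)) > 0) {
                for (int i = 0; i < n; i++) {
                    if (wc[i].status != IBV_WC_SUCCESS)
                        fprintf(stderr, "Work Completion failed: %s\n",
                                ibv_wc_status_str(wc[i].status));
                }
            }
        }

        return NULL;
    }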
--- include/villas/nodes/infiniband.h | 38 +++-- lib/nodes/infiniband.c | 242 +++++++++++++++++++----------- 2 files changed, 184 insertions(+), 96 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index 122521a80..cb0034711 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -35,9 +35,11 @@ #include #include -/* Forward declarations */ -struct format_type; +/* Function poitner typedefs */ +typedef void (*ib_on_completion)(struct node*, struct ibv_wc*, int*); +typedef void* (*ib_poll_function)(void*); +/* Enums */ enum poll_mode_e { EVENT, @@ -50,21 +52,33 @@ struct r_addr_key_s { }; struct infiniband { - struct rdma_cm_id *listen_id; - struct rdma_cm_id *id; - struct rdma_event_channel *ec; + /* IBV/RDMA CM structs */ struct context_s { + struct rdma_cm_id *listen_id; + struct rdma_cm_id *id; + struct rdma_event_channel *ec; + struct ibv_pd *pd; struct ibv_cq *cq; struct ibv_comp_channel *comp_channel; } ctx; + /* Work Completion related */ struct poll_s { enum poll_mode_e poll_mode; + + /* On completion function */ + ib_on_completion on_compl; + + /* Busy poll or Event function */ + ib_poll_function poll_func; + + /* Poll thread */ pthread_t cq_poller_thread; } poll; + /* Connection specific variables */ struct connection_s { struct addrinfo *src_addr; struct addrinfo *dst_addr; @@ -74,11 +88,7 @@ struct infiniband { struct r_addr_key_s *r_addr_key; } conn; - struct ibv_qp_init_attr qp_init; - - int is_source; - int cq_size; - + /* Memory related variables */ struct ib_memory { struct pool p_recv; struct pool p_send; @@ -86,6 +96,14 @@ struct infiniband { struct ibv_mr *mr_recv; struct ibv_mr *mr_send; } mem; + + /* Queue Pair init variables */ + struct ibv_qp_init_attr qp_init; + + /* Misc settings */ + int is_source; + int cq_size; + }; /** @see node_type::reverse */ diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 436412c77..ea248fac4 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -31,6 +31,7 @@ #include + int ib_post_recv_wrs(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; @@ -50,35 +51,82 @@ int ib_post_recv_wrs(struct node *n) wr.num_sge = 1; // Post Work Request - ret = ibv_post_recv(ib->id->qp, &wr, &bad_wr); + ret = ibv_post_recv(ib->ctx.id->qp, &wr, &bad_wr); return ret; } -static void ib_create_busy_poll(struct node *n) +void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size) { - struct infiniband *ib = (struct infiniband *) n->_vd; - - // Create completion queue and bind to channel - ib->ctx.cq = ibv_create_cq(ib->id->verbs, ib->cq_size, NULL, NULL, 0); - if(!ib->ctx.cq) - error("Could not create completion queue in node %s.", node_name(n)); - - //ToDo: Create poll pthread + //ToDo: No implementation yet. 
This is still handled in ib_read } -static void ib_create_event(struct node *n) +void ib_completion_source(struct node* n, struct ibv_wc* wc, int* size) +{ + for(int i=0; i<*size; i++) + { + if(wc[i].status != IBV_WC_SUCCESS) + error("Work Completion status was not IBV_WC_SUCCES in node %s: %s", + node_name(n), gai_strerror(wc[i].status)); + } + +} + +void * ib_event_thread(void *n) +{ + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct ibv_wc wc[ib->cq_size]; + int size; + + while(1) + { + // Function blocks, until an event occurs + ibv_get_cq_event(ib->ctx.comp_channel, &ib->ctx.cq, NULL); + + // Poll as long as WCs are available + while((size = ibv_poll_cq(ib->ctx.cq, ib->cq_size, wc))) + ib->poll.on_compl(n, wc, &size); + + // Request a new event in the CQ and acknowledge event + ibv_req_notify_cq(ib->ctx.cq, 0); + ibv_ack_cq_events(ib->ctx.cq, 1); + } +} + +void * ib_busy_poll_thread(void *n) +{ + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct ibv_wc wc[ib->cq_size]; + int size; + + while(1) + { + //ToDo: Implement stopThreads variable + if(0) + return NULL; + + // Poll as long as WCs are available + while((size = ibv_poll_cq(ib->ctx.cq, ib->cq_size, wc))) + ib->poll.on_compl(n, wc, &size); + } +} + +static void ib_init_wc_poll(struct node *n) { int ret; struct infiniband *ib = (struct infiniband *) n->_vd; + ib->ctx.comp_channel = NULL; - // Create completion channel - ib->ctx.comp_channel = ibv_create_comp_channel(ib->id->verbs); - if(!ib->ctx.comp_channel) - error("Could not create completion channel in node %s.", node_name(n)); + if(ib->poll.poll_mode == EVENT) + { + // Create completion channel + ib->ctx.comp_channel = ibv_create_comp_channel(ib->ctx.id->verbs); + if(!ib->ctx.comp_channel) + error("Could not create completion channel in node %s.", node_name(n)); + } - // Create completion queue and bind to channel - ib->ctx.cq = ibv_create_cq(ib->id->verbs, + // Create completion queue and bind to channel (or NULL) + ib->ctx.cq = ibv_create_cq(ib->ctx.id->verbs, ib->cq_size, NULL, ib->ctx.comp_channel, @@ -86,13 +134,26 @@ static void ib_create_event(struct node *n) if(!ib->ctx.cq) error("Could not create completion queue in node %s.", node_name(n)); - // Request notifications from completion queue - ret = ibv_req_notify_cq(ib->ctx.cq, 0); - if(ret) - error("Failed to request notifiy CQ in node %s: %s", - node_name(n), gai_strerror(ret)); + if(ib->poll.poll_mode == EVENT) + { + // Request notifications from completion queue + ret = ibv_req_notify_cq(ib->ctx.cq, 0); + if(ret) + error("Failed to request notifiy CQ in node %s: %s", + node_name(n), gai_strerror(ret)); + } - //ToDo: Create poll pthread + // Initialize polling pthread + //ToDo: Remove if(is_source) + if(ib->is_source) + { + ret = pthread_create(&ib->poll.cq_poller_thread, NULL, ib->poll.poll_func, n); + if(ret) + { + error("Failed to create poll thread of node %s: %s", + node_name(n), gai_strerror(ret)); + } + } } static void ib_build_ibv(struct node *n) @@ -101,20 +162,13 @@ static void ib_build_ibv(struct node *n) int ret; //Allocate protection domain - ib->ctx.pd = ibv_alloc_pd(ib->id->verbs); + ib->ctx.pd = ibv_alloc_pd(ib->ctx.id->verbs); if(!ib->ctx.pd) error("Could not allocate protection domain in node %s.", node_name(n)); + info("Allocated Protection Domain"); // Initiate poll mode - switch(ib->poll.poll_mode) - { - case EVENT: - ib_create_event(n); - break; - case BUSY: - ib_create_busy_poll(n); - break; - } + ib_init_wc_poll(n); // Prepare 
remaining Queue Pair (QP) attributes ib->qp_init.send_cq = ib->ctx.cq; @@ -123,7 +177,7 @@ static void ib_build_ibv(struct node *n) //ToDo: Set maximum inline data // Create the actual QP - ret = rdma_create_qp(ib->id, ib->ctx.pd, &ib->qp_init); + ret = rdma_create_qp(ib->ctx.id, ib->ctx.pd, &ib->qp_init); if(ret) error("Failed to create Queue Pair in node %s.", node_name(n)); @@ -138,7 +192,8 @@ static void ib_build_ibv(struct node *n) ib->qp_init.cap.max_recv_wr, sizeof(double), &memtype_heap); - if(ret) { + if(ret) + { error("Failed to init recv memory pool of node %s: %s", node_name(n), gai_strerror(ret)); } @@ -168,7 +223,8 @@ static void ib_build_ibv(struct node *n) ib->qp_init.cap.max_send_wr, sizeof(double), &memtype_heap); - if(ret) { + if(ret) + { error("Failed to init send memory of node %s: %s", node_name(n), gai_strerror(ret)); } @@ -195,7 +251,8 @@ static void ib_build_ibv(struct node *n) for(int i=0; iqp_init.cap.max_recv_wr; i++) { ret = ib_post_recv_wrs(n); - if(ret) { + if(ret) + { error("Failed to post initial receive Work Requests of node %s.", node_name(n)); } @@ -214,7 +271,7 @@ static int ib_addr_resolved(struct node *n) ib_build_ibv(n); // Resolve address - ret = rdma_resolve_route(ib->id, ib->conn.timeout); + ret = rdma_resolve_route(ib->ctx.id, ib->conn.timeout); if(ret) error("Failed to resolve route in node %s.", node_name(n)); @@ -236,7 +293,7 @@ static int ib_route_resolved(struct node *n) memset(&cm_params, 0, sizeof(cm_params)); // Send connection request - ret = rdma_connect(ib->id, &cm_params); + ret = rdma_connect(ib->ctx.id, &cm_params); if(ret) error("Failed to connect in node %s.", node_name(n)); @@ -251,14 +308,14 @@ static int ib_connect_request(struct node *n, struct rdma_cm_id *id) int ret; info("Received a connection request!"); - ib->id = id; + ib->ctx.id = id; ib_build_ibv(n); struct rdma_conn_param cm_params; memset(&cm_params, 0, sizeof(cm_params)); // Accept connection request - ret = rdma_accept(ib->id, &cm_params); + ret = rdma_accept(ib->ctx.id, &cm_params); if(ret) error("Failed to connect in node %s.", node_name(n)); @@ -341,7 +398,8 @@ int ib_parse(struct node *n, json_t *cfg) // Translate IP:PORT to a struct addrinfo //ToDo: Fix fixed port ret = getaddrinfo(local, (char *)"13337", NULL, &ib->conn.src_addr); - if(ret) { + if(ret) + { error("Failed to resolve local address '%s' of node %s: %s", local, node_name(n), gai_strerror(ret)); } @@ -360,9 +418,19 @@ int ib_parse(struct node *n, json_t *cfg) ib->conn.timeout = timeout; // Translate poll mode - if(strcmp(poll_mode, "EVENT") == 0) ib->poll.poll_mode = EVENT; - else if(strcmp(poll_mode, "BUSY") == 0) ib->poll.poll_mode = BUSY; - else { + if(strcmp(poll_mode, "EVENT") == 0) + { + ib->poll.poll_mode = EVENT; + ib->poll.poll_func = ib_event_thread; + + } + else if(strcmp(poll_mode, "BUSY") == 0) + { + ib->poll.poll_mode = BUSY; + ib->poll.poll_func = ib_busy_poll_thread; + } + else + { error("Failed to translate poll_mode in node %s. 
%s is not a valid \ poll mode!", node_name(n), poll_mode); } @@ -395,13 +463,22 @@ int ib_parse(struct node *n, json_t *cfg) // Translate address info //ToDo: Fix fixed port ret = getaddrinfo(remote, (char *)"13337", NULL, &ib->conn.dst_addr); - if(ret) { + if(ret) + { error("Failed to resolve remote address '%s' of node %s: %s", remote, node_name(n), gai_strerror(ret)); } + + // Set correct Work Completion function + ib->poll.on_compl = ib_completion_source; } else + { ib->is_source = 0; + + // Set correct Work Completion function + ib->poll.on_compl = ib_completion_target; + } return 0; } @@ -423,22 +500,24 @@ int ib_start(struct node *n) int ret; // Create event channel - ib->ec = rdma_create_event_channel(); - if(!ib->ec) { + ib->ctx.ec = rdma_create_event_channel(); + if(!ib->ctx.ec) { error("Failed to create event channel in node %s!", node_name(n)); } - ret = rdma_create_id(ib->ec, &ib->id, NULL, ib->conn.port_space); - if(ret) { + ret = rdma_create_id(ib->ctx.ec, &ib->ctx.id, NULL, ib->conn.port_space); + if(ret) + { error("Failed to create rdma_cm_id of node %s: %s", node_name(n), gai_strerror(ret)); } info("Succesfully created rdma_cm_id."); // Bind rdma_cm_id to the HCA - ret = rdma_bind_addr(ib->id, ib->conn.src_addr->ai_addr); - if(ret) { + ret = rdma_bind_addr(ib->ctx.id, ib->conn.src_addr->ai_addr); + if(ret) + { error("Failed to bind to local device of node %s: %s", node_name(n), gai_strerror(ret)); } @@ -447,11 +526,12 @@ int ib_start(struct node *n) if(ib->is_source) { // Resolve address - ret = rdma_resolve_addr(ib->id, + ret = rdma_resolve_addr(ib->ctx.id, NULL, ib->conn.dst_addr->ai_addr, ib->conn.timeout); - if(ret) { + if(ret) + { error("Failed to resolve remote address after %ims of node %s: %s", ib->conn.timeout, node_name(n), gai_strerror(ret)); } @@ -461,11 +541,12 @@ int ib_start(struct node *n) // The ID will be overwritten for the target. If the event type is // RDMA_CM_EVENT_CONNECT_REQUEST, >then this references a new id for // that communication. - ib->listen_id = ib->id; + ib->ctx.listen_id = ib->ctx.id; // Listen on rdma_cm_id for events - ret = rdma_listen(ib->listen_id, 10); - if(ret) { + ret = rdma_listen(ib->ctx.listen_id, 10); + if(ret) + { error("Failed to listen to rdma_cm_id on node %s", node_name(n)); } } @@ -474,7 +555,7 @@ int ib_start(struct node *n) // sure the nodes are succesfully connected. info("Starting to monitor events on rdma_cm_id."); - while(rdma_get_cm_event(ib->ec, &event) == 0) + while(rdma_get_cm_event(ib->ctx.ec, &event) == 0) { struct rdma_cm_event event_copy; memcpy(&event_copy, event, sizeof(*event)); @@ -583,7 +664,7 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) } //Send linked list of Work Requests - ret = ibv_post_send(ib->id->qp, wr, &bad_wr); + ret = ibv_post_send(ib->ctx.id->qp, wr, &bad_wr); if(ret) { error("Failed to send message in node %s: %s", @@ -591,17 +672,6 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) return -ret; } - - /* Debugging */ - struct ibv_wc wc[5]; - int size; - while(1) - { - size = ibv_poll_cq(ib->ctx.cq, 5, wc); - if(size) - for(int j=0; j Date: Thu, 28 Jun 2018 12:46:16 +0200 Subject: [PATCH 12/35] Implemented disconnect function on source and target side. The cleanup function doesn't go through completely yet, probably because rdma_destroy_id blocks because not everything in the rdma_cm_id is destroyed yet. 
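A note on the blocking mentioned above: rdma_destroy_id() expects the QP created on the id to be destroyed first and blocks until every CM event reported for that id has been acknowledged. The sketch below shows one teardown order that satisfies these constraints; it borrows the field names used in this patch series but is illustrative, not the ib_cleanup() implemented here, and leaves out error handling.

#include <rdma/rdma_cma.h>

void ib_teardown_sketch(struct rdma_cm_id *id, struct rdma_event_channel *ec,
                        struct ibv_cq *cq, struct ibv_comp_channel *comp_channel,
                        struct ibv_pd *pd)
{
    struct rdma_cm_event *event;

    rdma_disconnect(id);

    /* Wait for the DISCONNECTED event and acknowledge it; unacknowledged
     * events keep a reference on the id and make rdma_destroy_id() block. */
    while (rdma_get_cm_event(ec, &event) == 0) {
        enum rdma_cm_event_type type = event->event;
        rdma_ack_cm_event(event);
        if (type == RDMA_CM_EVENT_DISCONNECTED)
            break;
    }

    rdma_destroy_qp(id);                    /* QP before the CQ it uses */
    ibv_destroy_cq(cq);                     /* CQ before its completion channel */
    if (comp_channel)
        ibv_destroy_comp_channel(comp_channel);
    ibv_dealloc_pd(pd);                     /* PD only after all MRs are deregistered */

    rdma_destroy_id(id);
    rdma_destroy_event_channel(ec);
}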
--- include/villas/nodes/infiniband.h | 7 +- lib/nodes/infiniband.c | 117 +++++++++++++++++++++++++++--- 2 files changed, 113 insertions(+), 11 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index cb0034711..ebb443394 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -63,7 +63,6 @@ struct infiniband { struct ibv_cq *cq; struct ibv_comp_channel *comp_channel; } ctx; - /* Work Completion related */ struct poll_s { enum poll_mode_e poll_mode; @@ -76,6 +75,8 @@ struct infiniband { /* Poll thread */ pthread_t cq_poller_thread; + + int stopThread; } poll; /* Connection specific variables */ @@ -86,6 +87,9 @@ struct infiniband { int timeout; struct r_addr_key_s *r_addr_key; + + pthread_t stop_thread; + int rdma_disconnect_called; } conn; /* Memory related variables */ @@ -103,7 +107,6 @@ struct infiniband { /* Misc settings */ int is_source; int cq_size; - }; /** @see node_type::reverse */ diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index ea248fac4..b3faf33e0 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -31,6 +31,31 @@ #include +int ib_cleanup(struct node *n) +{ + struct infiniband *ib = (struct infiniband *) n->_vd; + info("Starting to clean up"); + + // Destroy QP + rdma_destroy_qp(ib->ctx.id); + info("Destroyed QP"); + + // Deregister memory regions + ibv_dereg_mr(ib->mem.mr_recv); + if(ib->is_source) + ibv_dereg_mr(ib->mem.mr_send); + info("Deregistered memory regions"); + + // Destroy pools + pool_destroy(&ib->mem.p_recv); + pool_destroy(&ib->mem.p_send); + info("Destroyed memory pools"); + + rdma_destroy_id(ib->ctx.id); + info("Destroyed rdma_cm_id"); + + return 0; +} int ib_post_recv_wrs(struct node *n) { @@ -63,11 +88,21 @@ void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size) void ib_completion_source(struct node* n, struct ibv_wc* wc, int* size) { + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + for(int i=0; i<*size; i++) { + //On disconnect, the QP set to error state and will be flushed + if(wc[i].status == IBV_WC_WR_FLUSH_ERR) + { + ib->poll.stopThread = 1; + return; + } + if(wc[i].status != IBV_WC_SUCCESS) - error("Work Completion status was not IBV_WC_SUCCES in node %s: %s", - node_name(n), gai_strerror(wc[i].status)); + warn("Work Completion status was not IBV_WC_SUCCES in node %s: %i", + node_name(n), wc[i].status); + } } @@ -101,13 +136,12 @@ void * ib_busy_poll_thread(void *n) while(1) { - //ToDo: Implement stopThreads variable - if(0) - return NULL; - // Poll as long as WCs are available while((size = ibv_poll_cq(ib->ctx.cq, ib->cq_size, wc))) ib->poll.on_compl(n, wc, &size); + + if(ib->poll.stopThread) + return NULL; } } @@ -351,6 +385,9 @@ static int ib_event(struct node *n, struct rdma_cm_event *event) info("Connection established!"); ret = 1; break; + case RDMA_CM_EVENT_DISCONNECTED: + ret = ib_cleanup(n); + break; default: error("Unknown event occurred: %u", event->event); @@ -448,6 +485,7 @@ int ib_parse(struct node *n, json_t *cfg) } // Set max. 
send and receive Work Requests + //ToDo: Set hint that max_*_wr can only be a value 1<< ib->qp_init.cap.max_send_wr = max_send_wr; ib->qp_init.cap.max_recv_wr = max_recv_wr; @@ -493,6 +531,23 @@ int ib_destroy(struct node *n) return 0; } +void * ib_stop_thread(void *n) +{ + struct node *node = (struct node *)n; + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct rdma_cm_event *event; + while(rdma_get_cm_event(ib->ctx.ec, &event) == 0) + { + if(event->event == RDMA_CM_EVENT_DISCONNECTED) + { + ib->conn.rdma_disconnect_called = 1; + node_stop(node); + return NULL; + } + } + return NULL; +} + int ib_start(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; @@ -566,11 +621,49 @@ int ib_start(struct node *n) break; } + ret = pthread_create(&ib->conn.stop_thread, NULL, ib_stop_thread, n); + if(ret) + { + error("Failed to create thread to monitor disconnects in node %s: %s", + node_name(n), gai_strerror(ret)); + } + return 0; } int ib_stop(struct node *n) { + struct infiniband *ib = (struct infiniband *) n->_vd; + struct rdma_cm_event *event = NULL; + int ret; + + // Call RDMA disconnect function + // Will flush all outstanding WRs to the Completion Queue and + // will call RDMA_CM_EVENT_DISCONNECTED if that is done. + ret = rdma_disconnect(ib->ctx.id); + if(ret) + { + error("Error while calling rdma_disconnect in node %s: %s", + node_name(n), gai_strerror(ret)); + } + info("Called rdma_disconnect."); + + // If disconnected event already occured, directly call cleanup function + if(ib->conn.rdma_disconnect_called) + { + ib_cleanup(n); + } + // Else, wait for event to occur + else + { + ib->conn.rdma_disconnect_called = 1; + rdma_get_cm_event(ib->ctx.ec, &event); + + ib_event(n, event); + + rdma_ack_cm_event(event); + } + return 0; } @@ -605,10 +698,16 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) for(int i=0; ilength = ret; smps[0]->capacity = cnt; From 95393eeb35f03a99df6cc2f0004e5346615150b7 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Thu, 28 Jun 2018 14:43:08 +0200 Subject: [PATCH 13/35] Forgot to acknowledge events which caused the rdma_cm_id to block. 
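Background for this fix: every rdma_cm_event returned by rdma_get_cm_event() pins its rdma_cm_id until rdma_ack_cm_event() is called, so a single forgotten acknowledgement is enough to make rdma_destroy_id() hang. ib_start() already copies the event into event_copy before processing it; the sketch below shows that copy-then-acknowledge idiom in isolation. The handler callback is illustrative only (this node type dispatches to ib_event() instead), and error handling is omitted.

#include <string.h>
#include <rdma/rdma_cma.h>

int handle_cm_events_sketch(struct rdma_event_channel *ec,
                            int (*handler)(struct rdma_cm_event *ev))
{
    struct rdma_cm_event *event, copy;

    while (rdma_get_cm_event(ec, &event) == 0) {
        /* Copy first, so the original can be acknowledged (released)
         * before the handler possibly tears down the id it refers to.
         * Note: pointers inside the copy (e.g. private_data) are not
         * deep-copied and must not be used after the acknowledgement. */
        memcpy(&copy, event, sizeof(copy));
        rdma_ack_cm_event(event);

        if (handler(&copy))
            break;
    }

    return 0;
}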
--- lib/nodes/infiniband.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index b3faf33e0..4783403fe 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -51,9 +51,14 @@ int ib_cleanup(struct node *n) pool_destroy(&ib->mem.p_send); info("Destroyed memory pools"); + // Destroy RDMA CM ID rdma_destroy_id(ib->ctx.id); info("Destroyed rdma_cm_id"); + // Destroy event channel + rdma_destroy_event_channel(ib->ctx.ec); + info("Destroyed event channel"); + return 0; } @@ -531,16 +536,19 @@ int ib_destroy(struct node *n) return 0; } -void * ib_stop_thread(void *n) +void * ib_disconnect_thread(void *n) { struct node *node = (struct node *)n; struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; struct rdma_cm_event *event; + while(rdma_get_cm_event(ib->ctx.ec, &event) == 0) { if(event->event == RDMA_CM_EVENT_DISCONNECTED) { + rdma_ack_cm_event(event); ib->conn.rdma_disconnect_called = 1; + node_stop(node); return NULL; } @@ -621,7 +629,7 @@ int ib_start(struct node *n) break; } - ret = pthread_create(&ib->conn.stop_thread, NULL, ib_stop_thread, n); + ret = pthread_create(&ib->conn.stop_thread, NULL, ib_disconnect_thread, n); if(ret) { error("Failed to create thread to monitor disconnects in node %s: %s", @@ -659,9 +667,9 @@ int ib_stop(struct node *n) ib->conn.rdma_disconnect_called = 1; rdma_get_cm_event(ib->ctx.ec, &event); - ib_event(n, event); - rdma_ack_cm_event(event); + + ib_event(n, event); } return 0; From 1e2d4158f022de5423ab90ef65e2c0b744b4951a Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Thu, 28 Jun 2018 17:24:28 +0200 Subject: [PATCH 14/35] Added possibility to add port dynamically, added some config warnings and added a WR refresh function to ib_read --- include/villas/nodes/infiniband.h | 2 ++ lib/nodes/infiniband.c | 58 ++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index ebb443394..6f54f04f7 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -90,6 +90,8 @@ struct infiniband { pthread_t stop_thread; int rdma_disconnect_called; + + int used_recv_wrs; } conn; /* Memory related variables */ diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 4783403fe..416449e73 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -21,6 +21,7 @@ *********************************************************************************/ #include +#include #include #include @@ -220,7 +221,8 @@ static void ib_build_ibv(struct node *n) if(ret) error("Failed to create Queue Pair in node %s.", node_name(n)); - info("Created Queue Pair."); + info("Created Queue Pair with %i receive and %i send elements.", + ib->qp_init.cap.max_recv_wr, ib->qp_init.cap.max_send_wr); // Allocate memory ib->mem.p_recv.state = STATE_DESTROYED; @@ -314,8 +316,6 @@ static int ib_addr_resolved(struct node *n) if(ret) error("Failed to resolve route in node %s.", node_name(n)); - //ToDo: create check if data can be send inline - return 0; } @@ -411,15 +411,15 @@ int ib_parse(struct node *n, json_t *cfg) struct infiniband *ib = (struct infiniband *) n->_vd; int ret; - const char *local = NULL; - const char *remote = NULL; + char *local = NULL; + char *remote = NULL; const char *port_space = "RDMA_PC_TCP"; const char *poll_mode = "BUSY"; const char *qp_type = "IBV_QPT_RC"; int timeout = 1000; - int cq_size = 10; - int max_send_wr = 100; - int 
max_recv_wr = 100; + int cq_size = 128; + int max_send_wr = 128; + int max_recv_wr = 128; json_error_t err; ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i, \ @@ -438,8 +438,9 @@ int ib_parse(struct node *n, json_t *cfg) jerror(&err, "Failed to parse configuration of node %s", node_name(n)); // Translate IP:PORT to a struct addrinfo - //ToDo: Fix fixed port - ret = getaddrinfo(local, (char *)"13337", NULL, &ib->conn.src_addr); + char* ip_adr = strtok(local, ":"); + char* port = strtok(NULL, ":"); + ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.src_addr); if(ret) { error("Failed to resolve local address '%s' of node %s: %s", @@ -490,10 +491,24 @@ int ib_parse(struct node *n, json_t *cfg) } // Set max. send and receive Work Requests - //ToDo: Set hint that max_*_wr can only be a value 1<< + // First check if the set value is a power of 2, and warn the user if this is not the case + int max_send_pow = (int) pow(2, ceil(log2(max_send_wr))); + int max_recv_pow = (int) pow(2, ceil(log2(max_recv_wr))); + + if(max_send_wr != max_send_pow) + warn("Max. number of send WRs (%i) is not a power of 2! The HCA will change this to a power of 2: %i", + max_send_wr, max_send_pow); + + if(max_recv_wr != max_recv_pow) + warn("Max. number of recv WRs (%i) is not a power of 2! The HCA will change this to a power of 2: %i", + max_recv_wr, max_recv_pow); + ib->qp_init.cap.max_send_wr = max_send_wr; ib->qp_init.cap.max_recv_wr = max_recv_wr; + // Set used receive Work Requests to 0 + ib->conn.used_recv_wrs = 0; + // Set remaining QP attributes ib->qp_init.cap.max_send_sge = 1; ib->qp_init.cap.max_recv_sge = 1; @@ -504,8 +519,9 @@ int ib_parse(struct node *n, json_t *cfg) ib->is_source = 1; // Translate address info - //ToDo: Fix fixed port - ret = getaddrinfo(remote, (char *)"13337", NULL, &ib->conn.dst_addr); + char* ip_adr = strtok(remote, ":"); + char* port = strtok(NULL, ":"); + ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.dst_addr); if(ret) { error("Failed to resolve remote address '%s' of node %s: %s", @@ -702,6 +718,8 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) if(ret) { + ib->conn.used_recv_wrs += ret; + data = malloc(ret*sizeof(double)); for(int i=0; imem.p_recv, (double*)(wc[i].wr_id)); + } } smps[0]->length = ret; smps[0]->capacity = cnt; memcpy(smps[0]->data, data, ret*sizeof(double)); } + else + { + //No data received? 
Put new receive Work Requests to Receive Queue + for(int i=0; iconn.used_recv_wrs; i++) + ib_post_recv_wrs(n); + + ib->conn.used_recv_wrs = 0; + } return ret; } From 9ecdb54832fdf6ddeb42b5b3856f6057b1522e74 Mon Sep 17 00:00:00 2001 From: Steffen Vogel Date: Fri, 29 Jun 2018 17:32:07 +0200 Subject: [PATCH 15/35] node: add new function node_memtype() to get node specific memory allocators --- include/villas/node.h | 3 +++ include/villas/node_type.h | 4 ++++ lib/node.c | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/include/villas/node.h b/include/villas/node.h index 06c0dcc09..29c312124 100644 --- a/include/villas/node.h +++ b/include/villas/node.h @@ -158,4 +158,7 @@ int node_write(struct node *n, struct sample *smps[], unsigned cnt); int node_fd(struct node *n); +struct memtype * node_memtype(struct node *n, struct memtype *parent); + + /** @} */ diff --git a/include/villas/node_type.h b/include/villas/node_type.h index 5e6a47d4f..6de269f52 100644 --- a/include/villas/node_type.h +++ b/include/villas/node_type.h @@ -30,6 +30,7 @@ #include "list.h" #include "common.h" +#include "memory.h" /* Forward declarations */ struct node; @@ -149,6 +150,9 @@ struct node_type { /** Return a file descriptor which can be used by poll / select to detect the availability of new data. */ int (*fd)(struct node *n); + + /** */ + struct memtype * (*memtype)(struct node *n, struct memtype *parent); }; /** Initialize all registered node type subsystems. diff --git a/lib/node.c b/lib/node.c index 7a8c02f99..eae2583e9 100644 --- a/lib/node.c +++ b/lib/node.c @@ -544,6 +544,11 @@ int node_fd(struct node *n) return n->_vt->fd ? n->_vt->fd(n) : -1; } +struct memtype * node_memtype(struct node *n, struct memtype *parent) +{ + return n->_vt->memtype ? n->_vt->memtype(n) : &memtype_huge; +} + int node_parse_list(struct list *list, json_t *cfg, struct list *all) { struct node *node; From c01393bc82d0f1e87c484c5b9e101446ecb232be Mon Sep 17 00:00:00 2001 From: Steffen Vogel Date: Fri, 29 Jun 2018 17:37:10 +0200 Subject: [PATCH 16/35] infiniband: add new memory type --- lib/memory_ib.c | 96 ++++++++++++++++++++++++++++++++++++++++++ lib/nodes/infiniband.c | 3 +- src/pipe.c | 4 +- 3 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 lib/memory_ib.c diff --git a/lib/memory_ib.c b/lib/memory_ib.c new file mode 100644 index 000000000..80f2c868c --- /dev/null +++ b/lib/memory_ib.c @@ -0,0 +1,96 @@ +/** Memory allocators. + * + * @author Steffen Vogel + * @copyright 2017, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ *********************************************************************************/ + +#include + +struct memory_ib { + struct ibv_pd *pd; + struct memtype *parent; +}; + +struct ibv_mr * memory_ib_mr(void *ptr) +{ + struct ibv_mr *mr = (struct ibv_mr *) ptr; + + return (mr - 1); +} + +void * memory_ib_alloc(struct memtype *m, size_t len, size_t alignment) +{ + struct memtype_ib *mi = (struct memtype_ib *) m->_vd; + + struct ibv_mr **mr = memory_alloc_aligned(m->parent, len + sizeof(struct ibv_mr *), alignment); + char *ptr = (char *) (mr + 1); + + *mr = ibv_reg_mr(mi->pd, ptr, len, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if(!*mr) { + free(ptr); + return NULL; + } + + return ptr; +} + +int memory_ib_free(struct memtype *m, void *ptr, size_t len) +{ + struct ibv_mr *mr = memory_ib_mr(ptr); + + ibv_dereg_mr(mr); + + ptr -= sizeof(struct ibv_mr *); + len += sizeof(struct ibv_mr *); + + memory_free(m->parent, ptr, len); + + return 0; +} + +struct memtype * ib_memtype(struct node *n, struct memtype *parent) +{ + struct infiniband *i = (struct infiniband *) n->_vd; + struct memtype *mt = alloc(struct memtype); + + mt->name = "ib"; + mt->flags = 0; + mt->alloc = memory_ib_alloc; + mt->free = memory_ib_free; + mt->alignment = 1; + + mt->_vd = malloc(sizeof(struct memory_ib)); + + struct memory_ib *mi = (struct memory_ib *) mt->_vd; + + mi->pd = i->ctx.pd; + mi->parent = parent; + + return mt; +} + +/* Ausserhalb von lib/nodes/infiniband.c */ +struct pool p = { .state = STATE_DESTROYED }; +struct node *n = ..; + +pool_init(&p, 100, 32, node_get_memtype(n)); + + + diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 416449e73..963c26376 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -837,7 +837,8 @@ static struct plugin p = { .deinit = ib_deinit, .read = ib_read, .write = ib_write, - .fd = ib_fd + .fd = ib_fd, + .memtype = ib_memtype } }; diff --git a/src/pipe.c b/src/pipe.c index ee27df078..b9e967d97 100644 --- a/src/pipe.c +++ b/src/pipe.c @@ -132,7 +132,7 @@ static void * send_loop(void *ctx) struct sample *smps[node->out.vectorize]; /* Initialize memory */ - ret = pool_init(&sendd.pool, LOG2_CEIL(node->out.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), &memtype_hugepage); + ret = pool_init(&sendd.pool, LOG2_CEIL(node->out.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node)); if (ret < 0) error("Failed to allocate memory for receive pool."); @@ -196,7 +196,7 @@ static void * recv_loop(void *ctx) struct sample *smps[node->in.vectorize]; /* Initialize memory */ - ret = pool_init(&recvv.pool, LOG2_CEIL(node->in.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), &memtype_hugepage); + ret = pool_init(&recvv.pool, LOG2_CEIL(node->in.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node)); if (ret < 0) error("Failed to allocate memory for receive pool."); From c70dbe2263fb6b692f79b415a8cbb193b5c93e1b Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Sat, 30 Jun 2018 18:20:30 +0200 Subject: [PATCH 17/35] Resolved memory_ib related compile and link errors --- lib/Makefile.villas.inc | 2 +- lib/memory_ib.c | 25 +++++---- lib/node.c | 3 +- lib/nodes/infiniband.c | 113 ++++++++++++++++++++-------------------- src/pipe.c | 4 +- 5 files changed, 74 insertions(+), 73 deletions(-) diff --git a/lib/Makefile.villas.inc b/lib/Makefile.villas.inc index dc0a3ca0a..ff375a743 100644 --- a/lib/Makefile.villas.inc +++ b/lib/Makefile.villas.inc @@ -28,7 +28,7 @@ LIB = $(BUILDDIR)/$(LIB_NAME).so.$(LIB_ABI_VERSION) LIB_SRCS += $(addprefix lib/kernel/, kernel.c rt.c) \ 
$(addprefix lib/, sample.c path.c node.c hook.c log.c log_config.c \ utils.c super_node.c hist.c timing.c pool.c list.c queue.c \ - queue_signalled.c memory.c advio.c plugin.c node_type.c stats.c \ + queue_signalled.c memory.c memory_ib.c advio.c plugin.c node_type.c stats.c \ mapping.c shmem.c config_helper.c crypt.c compat.c \ log_helper.c task.c buffer.c table.c bitset.c signal.c \ ) diff --git a/lib/memory_ib.c b/lib/memory_ib.c index 80f2c868c..38e9f0f0d 100644 --- a/lib/memory_ib.c +++ b/lib/memory_ib.c @@ -20,12 +20,9 @@ * along with this program. If not, see . *********************************************************************************/ -#include - -struct memory_ib { - struct ibv_pd *pd; - struct memtype *parent; -}; +#include +#include +#include struct ibv_mr * memory_ib_mr(void *ptr) { @@ -36,14 +33,13 @@ struct ibv_mr * memory_ib_mr(void *ptr) void * memory_ib_alloc(struct memtype *m, size_t len, size_t alignment) { - struct memtype_ib *mi = (struct memtype_ib *) m->_vd; + struct memory_ib *mi = (struct memory_ib *) m->_vd; - struct ibv_mr **mr = memory_alloc_aligned(m->parent, len + sizeof(struct ibv_mr *), alignment); + struct ibv_mr **mr = memory_alloc_aligned(mi->parent, len + sizeof(struct ibv_mr *), alignment); char *ptr = (char *) (mr + 1); - *mr = ibv_reg_mr(mi->pd, ptr, len, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); - if(!*mr) { + *mr = ibv_reg_mr(mi->pd, ptr, len, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if(!*mr) { free(ptr); return NULL; } @@ -53,6 +49,7 @@ void * memory_ib_alloc(struct memtype *m, size_t len, size_t alignment) int memory_ib_free(struct memtype *m, void *ptr, size_t len) { + struct memory_ib *mi = (struct memory_ib *) m->_vd; struct ibv_mr *mr = memory_ib_mr(ptr); ibv_dereg_mr(mr); @@ -60,7 +57,7 @@ int memory_ib_free(struct memtype *m, void *ptr, size_t len) ptr -= sizeof(struct ibv_mr *); len += sizeof(struct ibv_mr *); - memory_free(m->parent, ptr, len); + memory_free(mi->parent, ptr, len); return 0; } @@ -68,7 +65,7 @@ int memory_ib_free(struct memtype *m, void *ptr, size_t len) struct memtype * ib_memtype(struct node *n, struct memtype *parent) { struct infiniband *i = (struct infiniband *) n->_vd; - struct memtype *mt = alloc(struct memtype); + struct memtype *mt = malloc(sizeof(struct memtype)); mt->name = "ib"; mt->flags = 0; @@ -87,10 +84,12 @@ struct memtype * ib_memtype(struct node *n, struct memtype *parent) } /* Ausserhalb von lib/nodes/infiniband.c */ +/* struct pool p = { .state = STATE_DESTROYED }; struct node *n = ..; pool_init(&p, 100, 32, node_get_memtype(n)); +*/ diff --git a/lib/node.c b/lib/node.c index eae2583e9..866cbe576 100644 --- a/lib/node.c +++ b/lib/node.c @@ -32,6 +32,7 @@ #include #include #include +#include static int node_direction_init(struct node_direction *nd, struct node *n) { @@ -546,7 +547,7 @@ int node_fd(struct node *n) struct memtype * node_memtype(struct node *n, struct memtype *parent) { - return n->_vt->memtype ? n->_vt->memtype(n) : &memtype_huge; + return n->_vt->memtype(n, parent) ? 
n->_vt->memtype(n, parent) : &memtype_hugepage; } int node_parse_list(struct list *list, json_t *cfg, struct list *all) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 963c26376..1a847365d 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -36,7 +37,7 @@ int ib_cleanup(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; info("Starting to clean up"); - + // Destroy QP rdma_destroy_qp(ib->ctx.id); info("Destroyed QP"); @@ -46,7 +47,7 @@ int ib_cleanup(struct node *n) if(ib->is_source) ibv_dereg_mr(ib->mem.mr_send); info("Deregistered memory regions"); - + // Destroy pools pool_destroy(&ib->mem.p_recv); pool_destroy(&ib->mem.p_send); @@ -84,7 +85,7 @@ int ib_post_recv_wrs(struct node *n) // Post Work Request ret = ibv_post_recv(ib->ctx.id->qp, &wr, &bad_wr); - return ret; + return ret; } void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size) @@ -93,7 +94,7 @@ void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size) } void ib_completion_source(struct node* n, struct ibv_wc* wc, int* size) -{ +{ struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; for(int i=0; i<*size; i++) @@ -130,7 +131,7 @@ void * ib_event_thread(void *n) // Request a new event in the CQ and acknowledge event ibv_req_notify_cq(ib->ctx.cq, 0); - ibv_ack_cq_events(ib->ctx.cq, 1); + ibv_ack_cq_events(ib->ctx.cq, 1); } } @@ -166,10 +167,10 @@ static void ib_init_wc_poll(struct node *n) } // Create completion queue and bind to channel (or NULL) - ib->ctx.cq = ibv_create_cq(ib->ctx.id->verbs, - ib->cq_size, - NULL, - ib->ctx.comp_channel, + ib->ctx.cq = ibv_create_cq(ib->ctx.id->verbs, + ib->cq_size, + NULL, + ib->ctx.comp_channel, 0); if(!ib->ctx.cq) error("Could not create completion queue in node %s.", node_name(n)); @@ -200,7 +201,7 @@ static void ib_build_ibv(struct node *n) { struct infiniband *ib = (struct infiniband *) n->_vd; int ret; - + //Allocate protection domain ib->ctx.pd = ibv_alloc_pd(ib->ctx.id->verbs); if(!ib->ctx.pd) @@ -218,7 +219,7 @@ static void ib_build_ibv(struct node *n) // Create the actual QP ret = rdma_create_qp(ib->ctx.id, ib->ctx.pd, &ib->qp_init); - if(ret) + if(ret) error("Failed to create Queue Pair in node %s.", node_name(n)); info("Created Queue Pair with %i receive and %i send elements.", @@ -229,18 +230,18 @@ static void ib_build_ibv(struct node *n) ib->mem.p_recv.queue.state = STATE_DESTROYED; // Set pool size to maximum size of Receive Queue - pool_init(&ib->mem.p_recv, - ib->qp_init.cap.max_recv_wr, - sizeof(double), + pool_init(&ib->mem.p_recv, + ib->qp_init.cap.max_recv_wr, + sizeof(double), &memtype_heap); - if(ret) + if(ret) { error("Failed to init recv memory pool of node %s: %s", node_name(n), gai_strerror(ret)); } //ToDo: initialize r_addr_key struct if mode is RDMA - + // Register memory for IB Device. Not necessary if data is send // exclusively inline ib->mem.mr_recv = ibv_reg_mr( @@ -260,18 +261,18 @@ static void ib_build_ibv(struct node *n) ib->mem.p_send.queue.state = STATE_DESTROYED; // Set pool size to maximum size of Receive Queue - pool_init(&ib->mem.p_send, - ib->qp_init.cap.max_send_wr, - sizeof(double), + pool_init(&ib->mem.p_send, + ib->qp_init.cap.max_send_wr, + sizeof(double), &memtype_heap); - if(ret) + if(ret) { error("Failed to init send memory of node %s: %s", node_name(n), gai_strerror(ret)); } //ToDo: initialize r_addr_key struct if mode is RDMA - + // Register memory for IB Device. 
Not necessary if data is send // exclusively inline ib->mem.mr_send = ibv_reg_mr( @@ -286,13 +287,13 @@ static void ib_build_ibv(struct node *n) info("Allocated send memory."); } - + // Post Receive Work Requests to be able to receive data // Fill complete Receive Queue during initialization for(int i=0; iqp_init.cap.max_recv_wr; i++) { ret = ib_post_recv_wrs(n); - if(ret) + if(ret) { error("Failed to post initial receive Work Requests of node %s.", node_name(n)); @@ -313,7 +314,7 @@ static int ib_addr_resolved(struct node *n) // Resolve address ret = rdma_resolve_route(ib->ctx.id, ib->conn.timeout); - if(ret) + if(ret) error("Failed to resolve route in node %s.", node_name(n)); return 0; @@ -327,17 +328,17 @@ static int ib_route_resolved(struct node *n) info("Successfully resolved route."); //ToDo: Post receive WRs - + struct rdma_conn_param cm_params; memset(&cm_params, 0, sizeof(cm_params)); // Send connection request ret = rdma_connect(ib->ctx.id, &cm_params); - if(ret) + if(ret) error("Failed to connect in node %s.", node_name(n)); info("Called rdma_connect."); - + return 0; } @@ -349,17 +350,17 @@ static int ib_connect_request(struct node *n, struct rdma_cm_id *id) ib->ctx.id = id; ib_build_ibv(n); - + struct rdma_conn_param cm_params; memset(&cm_params, 0, sizeof(cm_params)); // Accept connection request ret = rdma_accept(ib->ctx.id, &cm_params); - if(ret) + if(ret) error("Failed to connect in node %s.", node_name(n)); info("Successfully accepted connection request."); - + return 0; } @@ -441,7 +442,7 @@ int ib_parse(struct node *n, json_t *cfg) char* ip_adr = strtok(local, ":"); char* port = strtok(NULL, ":"); ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.src_addr); - if(ret) + if(ret) { error("Failed to resolve local address '%s' of node %s: %s", local, node_name(n), gai_strerror(ret)); @@ -459,20 +460,20 @@ int ib_parse(struct node *n, json_t *cfg) // Set timeout ib->conn.timeout = timeout; - + // Translate poll mode - if(strcmp(poll_mode, "EVENT") == 0) + if(strcmp(poll_mode, "EVENT") == 0) { ib->poll.poll_mode = EVENT; ib->poll.poll_func = ib_event_thread; } - else if(strcmp(poll_mode, "BUSY") == 0) + else if(strcmp(poll_mode, "BUSY") == 0) { ib->poll.poll_mode = BUSY; ib->poll.poll_func = ib_busy_poll_thread; } - else + else { error("Failed to translate poll_mode in node %s. 
%s is not a valid \ poll mode!", node_name(n), poll_mode); @@ -522,7 +523,7 @@ int ib_parse(struct node *n, json_t *cfg) char* ip_adr = strtok(remote, ":"); char* port = strtok(NULL, ":"); ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.dst_addr); - if(ret) + if(ret) { error("Failed to resolve remote address '%s' of node %s: %s", remote, node_name(n), gai_strerror(ret)); @@ -534,7 +535,7 @@ int ib_parse(struct node *n, json_t *cfg) else { ib->is_source = 0; - + // Set correct Work Completion function ib->poll.on_compl = ib_completion_target; } @@ -569,7 +570,7 @@ void * ib_disconnect_thread(void *n) return NULL; } } - return NULL; + return NULL; } int ib_start(struct node *n) @@ -586,7 +587,7 @@ int ib_start(struct node *n) } ret = rdma_create_id(ib->ctx.ec, &ib->ctx.id, NULL, ib->conn.port_space); - if(ret) + if(ret) { error("Failed to create rdma_cm_id of node %s: %s", node_name(n), gai_strerror(ret)); @@ -595,21 +596,21 @@ int ib_start(struct node *n) // Bind rdma_cm_id to the HCA ret = rdma_bind_addr(ib->ctx.id, ib->conn.src_addr->ai_addr); - if(ret) + if(ret) { error("Failed to bind to local device of node %s: %s", node_name(n), gai_strerror(ret)); } info("Bound rdma_cm_id to Infiniband device."); - + if(ib->is_source) { // Resolve address - ret = rdma_resolve_addr(ib->ctx.id, - NULL, - ib->conn.dst_addr->ai_addr, - ib->conn.timeout); - if(ret) + ret = rdma_resolve_addr(ib->ctx.id, + NULL, + ib->conn.dst_addr->ai_addr, + ib->conn.timeout); + if(ret) { error("Failed to resolve remote address after %ims of node %s: %s", ib->conn.timeout, node_name(n), gai_strerror(ret)); @@ -617,14 +618,14 @@ int ib_start(struct node *n) } else { - // The ID will be overwritten for the target. If the event type is - // RDMA_CM_EVENT_CONNECT_REQUEST, >then this references a new id for - // that communication. + // The ID will be overwritten for the target. If the event type is + // RDMA_CM_EVENT_CONNECT_REQUEST, >then this references a new id for + // that communication. ib->ctx.listen_id = ib->ctx.id; // Listen on rdma_cm_id for events ret = rdma_listen(ib->ctx.listen_id, 10); - if(ret) + if(ret) { error("Failed to listen to rdma_cm_id on node %s", node_name(n)); } @@ -667,11 +668,11 @@ int ib_stop(struct node *n) ret = rdma_disconnect(ib->ctx.id); if(ret) { - error("Error while calling rdma_disconnect in node %s: %s", + error("Error while calling rdma_disconnect in node %s: %s", node_name(n), gai_strerror(ret)); } info("Called rdma_disconnect."); - + // If disconnected event already occured, directly call cleanup function if(ib->conn.rdma_disconnect_called) { @@ -693,7 +694,7 @@ int ib_stop(struct node *n) int ib_init(struct super_node *n) { - + return 0; } @@ -712,7 +713,7 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) union { double f; int64_t i; - } *data; + } *data; ret = ibv_poll_cq(ib->ctx.cq, cnt, wc); @@ -791,7 +792,7 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) wr[i].num_sge = 1; //ToDo: Right now only smps[0] is sg_list. 
This can be extended //furthermore we should break the transaction up if inline mode //is selected - + if(i == (smps[0]->length-1)) wr[i].next = NULL; else @@ -801,7 +802,7 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) wr[i].opcode = IBV_WR_SEND_WITH_IMM; } - + //Send linked list of Work Requests ret = ibv_post_send(ib->ctx.id->qp, wr, &bad_wr); if(ret) diff --git a/src/pipe.c b/src/pipe.c index b9e967d97..2b0260398 100644 --- a/src/pipe.c +++ b/src/pipe.c @@ -132,7 +132,7 @@ static void * send_loop(void *ctx) struct sample *smps[node->out.vectorize]; /* Initialize memory */ - ret = pool_init(&sendd.pool, LOG2_CEIL(node->out.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node)); + ret = pool_init(&sendd.pool, LOG2_CEIL(node->out.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_hugepage)); if (ret < 0) error("Failed to allocate memory for receive pool."); @@ -196,7 +196,7 @@ static void * recv_loop(void *ctx) struct sample *smps[node->in.vectorize]; /* Initialize memory */ - ret = pool_init(&recvv.pool, LOG2_CEIL(node->in.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node)); + ret = pool_init(&recvv.pool, LOG2_CEIL(node->in.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_hugepage)); if (ret < 0) error("Failed to allocate memory for receive pool."); From c055010be1666e7f73f85f7dee0e72caccc47edd Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Sun, 1 Jul 2018 12:56:03 +0200 Subject: [PATCH 18/35] ib_write() now takes data directly from the super pool instead of copying it. ib_read() still copies data and the code needs cleanup after zero-copy is implemented --- lib/memory_ib.c | 11 --- lib/nodes/infiniband.c | 210 ++++++++++++++++++++--------------------- 2 files changed, 105 insertions(+), 116 deletions(-) diff --git a/lib/memory_ib.c b/lib/memory_ib.c index 38e9f0f0d..9a1432aaf 100644 --- a/lib/memory_ib.c +++ b/lib/memory_ib.c @@ -82,14 +82,3 @@ struct memtype * ib_memtype(struct node *n, struct memtype *parent) return mt; } - -/* Ausserhalb von lib/nodes/infiniband.c */ -/* -struct pool p = { .state = STATE_DESTROYED }; -struct node *n = ..; - -pool_init(&p, 100, 32, node_get_memtype(n)); - -*/ - - diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 1a847365d..efe372816 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -95,23 +95,30 @@ void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size) void ib_completion_source(struct node* n, struct ibv_wc* wc, int* size) { - struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct sample* smpl; - for(int i=0; i<*size; i++) - { - //On disconnect, the QP set to error state and will be flushed - if(wc[i].status == IBV_WC_WR_FLUSH_ERR) - { - ib->poll.stopThread = 1; - return; - } - - if(wc[i].status != IBV_WC_SUCCESS) - warn("Work Completion status was not IBV_WC_SUCCES in node %s: %i", - node_name(n), wc[i].status); - - } + for(int i=0; i<*size; i++) + { + //On disconnect, the QP set to error state and will be flushed + if(wc[i].status == IBV_WC_WR_FLUSH_ERR) + { + ib->poll.stopThread = 1; + return; + } + if(wc[i].status != IBV_WC_SUCCESS) + { + warn("Work Completion status was not IBV_WC_SUCCES in node %s: %i", + node_name(n), wc[i].status); + } + else + { + // Release sample + smpl = (struct sample*)wc[i].wr_id; + sample_put(smpl); + } + } } void * ib_event_thread(void *n) @@ -232,7 +239,7 @@ static void 
ib_build_ibv(struct node *n) // Set pool size to maximum size of Receive Queue pool_init(&ib->mem.p_recv, ib->qp_init.cap.max_recv_wr, - sizeof(double), + 64*sizeof(double), &memtype_heap); if(ret) { @@ -705,115 +712,108 @@ int ib_deinit() int ib_read(struct node *n, struct sample *smps[], unsigned cnt) { - //Create separate thread for polling! This impelemtation is just - //for testing purposes - struct infiniband *ib = (struct infiniband *) n->_vd; - int ret; - struct ibv_wc wc[cnt]; - union { - double f; - int64_t i; - } *data; + //Create separate thread for polling! This impelemtation is just + //for testing purposes + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; + struct ibv_wc wc[cnt]; + char *ptr; - ret = ibv_poll_cq(ib->ctx.cq, cnt, wc); + ret = ibv_poll_cq(ib->ctx.cq, cnt, wc); - if(ret) - { - ib->conn.used_recv_wrs += ret; + if(ret) + { + ib->conn.used_recv_wrs += ret; - data = malloc(ret*sizeof(double)); + for(int i=0; imem.p_recv, (double*)(wc[i].wr_id)); + } + smps[i]->length = wc[i].byte_len/sizeof(double); + smps[i]->capacity = 64; + memcpy(smps[i]->data, ptr, wc[i].byte_len); + } + } + else + { + //No data received? Put new receive Work Requests to Receive Queue + for(int i=0; iconn.used_recv_wrs; i++) + ib_post_recv_wrs(n); - //Release memory - pool_put(&ib->mem.p_recv, (double*)(wc[i].wr_id)); - } - } - smps[0]->length = ret; - smps[0]->capacity = cnt; - memcpy(smps[0]->data, data, ret*sizeof(double)); - } - else - { - //No data received? Put new receive Work Requests to Receive Queue - for(int i=0; iconn.used_recv_wrs; i++) - ib_post_recv_wrs(n); + ib->conn.used_recv_wrs = 0; + } - ib->conn.used_recv_wrs = 0; - } - - return ret; + return ret; } int ib_write(struct node *n, struct sample *smps[], unsigned cnt) { - /* Send pool is not used at this moment! */ - struct infiniband *ib = (struct infiniband *) n->_vd; - int ret; - struct ibv_send_wr wr[smps[0]->length], *bad_wr = NULL; - struct ibv_sge sge[smps[0]->length]; + struct infiniband *ib = (struct infiniband *) n->_vd; + struct ibv_send_wr wr[cnt], *bad_wr = NULL; + struct ibv_sge sge[cnt]; + struct pool *p; + struct ibv_mr ** mr; + int ret; - memset(&wr, 0, sizeof(wr)); + memset(&wr, 0, sizeof(wr)); - //ToDo: Place this into configuration and create checks if settings are valid - int send_inline = 1; + //ToDo: Place this into configuration and create checks if settings are valid + int send_inline = 0; - for(int i=0; ilength; i++) - { - // If data is send inline, it is not necessary to copy data to protected - // memory region first. - if(send_inline) - { - sge[i].addr = (uint64_t)&smps[0]->data[i].f; - sge[i].length = sizeof(double); - } - else - { - //- copy value to send_region - //- give pointer to start of array - } + // Get Memory Region + p = sample_pool(smps[0]); + mr = (struct ibv_mr **)((char *)(p)+p->buffer_off-8); - // Set Send Work Request - wr[i].wr_id = 0; //ToDo: set this to a useful value - wr[i].sg_list = &sge[i]; - wr[i].num_sge = 1; //ToDo: Right now only smps[0] is sg_list. 
This can be extended - //furthermore we should break the transaction up if inline mode - //is selected + for(int i=0; ilength-1)) - wr[i].next = NULL; - else - wr[i].next = &wr[i+1]; - wr[i].send_flags = IBV_SEND_SIGNALED | (send_inline<<3); - wr[i].imm_data = htonl(0); //ToDo: set this to a useful value - wr[i].opcode = IBV_WR_SEND_WITH_IMM; + //Set Scatter/Gather element to data of sample + sge[i].addr = (uint64_t)&smps[i]->data->f; + sge[i].length = smps[i]->length*sizeof(double); + sge[i].lkey = (*mr)->lkey; - } + // Set Send Work Request + wr[i].wr_id = (uint64_t)&smps[i]; //This way the sample can be release in WC + wr[i].sg_list = &sge[i]; + wr[i].num_sge = 1; - //Send linked list of Work Requests - ret = ibv_post_send(ib->ctx.id->qp, wr, &bad_wr); - if(ret) - { - error("Failed to send message in node %s: %s", - node_name(n), gai_strerror(ret)); + if(i == (cnt-1)) + wr[i].next = NULL; + else + wr[i].next = &wr[i+1]; - return -ret; - } + wr[i].send_flags = IBV_SEND_SIGNALED | (send_inline<<3); + wr[i].imm_data = htonl(0); //ToDo: set this to a useful value + wr[i].opcode = IBV_WR_SEND_WITH_IMM; + } - return cnt; + //Send linked list of Work Requests + ret = ibv_post_send(ib->ctx.id->qp, wr, &bad_wr); + if(ret) + { + error("Failed to send message in node %s: %s", + node_name(n), gai_strerror(ret)); + + return -ret; + } + + return cnt; } int ib_fd(struct node *n) From 97e25fb2b2dd52d99467538f924a63b3c493724b Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Mon, 2 Jul 2018 14:20:32 +0200 Subject: [PATCH 19/35] Added memory_ib.h --- include/villas/memory_ib.h | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 include/villas/memory_ib.h diff --git a/include/villas/memory_ib.h b/include/villas/memory_ib.h new file mode 100644 index 000000000..5dfdf854e --- /dev/null +++ b/include/villas/memory_ib.h @@ -0,0 +1,40 @@ +/** Memory allocators. + * + * @file + * @author Dennis Potter + * @copyright 2018, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ *********************************************************************************/ + +#include +#include + +struct memory_ib { + struct ibv_pd *pd; + struct memtype *parent; +}; + +struct ibv_mr* memory_ib_mr(void*); +void* memory_ib_alloc(struct memtype*, size_t, size_t); +int memory_ib_free(struct memtype*, void*, size_t); +struct memtype* ib_memtype(struct node*, struct memtype*); + + + + + From bb70be0b2ce9b419242f496b27d982923d3a7e1d Mon Sep 17 00:00:00 2001 From: Steffen Vogel Date: Mon, 2 Jul 2018 14:17:50 +0200 Subject: [PATCH 20/35] memory: refactored memory subsystem --- include/villas/memory.h | 42 +----- include/villas/memory_type.h | 69 +++++++++ include/villas/node.h | 2 +- include/villas/node_type.h | 2 +- include/villas/pool.h | 4 +- include/villas/queue.h | 6 +- include/villas/queue_signalled.h | 2 +- lib/Makefile.villas.inc | 3 +- lib/api.c | 2 +- lib/api/session.c | 4 +- lib/memory.c | 245 +------------------------------ lib/memory/heap.c | 54 +++++++ lib/memory/hugepage.c | 86 +++++++++++ lib/{memory_ib.c => memory/ib.c} | 15 +- lib/memory/managed.c | 193 ++++++++++++++++++++++++ lib/node.c | 4 +- lib/nodes/iec61850_sv.c | 4 +- lib/nodes/infiniband.c | 8 +- lib/nodes/loopback.c | 4 +- lib/nodes/mqtt.c | 4 +- lib/nodes/websocket.c | 6 +- lib/path.c | 6 +- lib/pool.c | 2 +- lib/queue.c | 2 +- lib/queue_signalled.c | 2 +- lib/shmem.c | 10 +- src/hook.c | 2 +- src/pipe.c | 4 +- src/signal.c | 2 +- src/test-cmp.c | 2 +- tests/unit/io.c | 4 +- tests/unit/memory.c | 18 ++- tests/unit/pool.c | 12 +- tests/unit/queue.c | 18 +-- tests/unit/queue_signalled.c | 2 +- 35 files changed, 496 insertions(+), 349 deletions(-) create mode 100644 include/villas/memory_type.h create mode 100644 lib/memory/heap.c create mode 100644 lib/memory/hugepage.c rename lib/{memory_ib.c => memory/ib.c} (84%) create mode 100644 lib/memory/managed.c diff --git a/include/villas/memory.h b/include/villas/memory.h index 54da2bbc0..95ee21972 100644 --- a/include/villas/memory.h +++ b/include/villas/memory.h @@ -26,35 +26,14 @@ #include #include +#include + #ifdef __cplusplus extern "C" { #endif -#define HUGEPAGESIZE (1 << 21) - -struct memtype; - -typedef void *(*memzone_allocator_t)(struct memtype *mem, size_t len, size_t alignment); -typedef int (*memzone_deallocator_t)(struct memtype *mem, void *ptr, size_t len); - -enum memtype_flags { - MEMORY_MMAP = (1 << 0), - MEMORY_DMA = (1 << 1), - MEMORY_HUGEPAGE = (1 << 2), - MEMORY_HEAP = (1 << 3) -}; - -struct memtype { - const char *name; - int flags; - - size_t alignment; - - memzone_allocator_t alloc; - memzone_deallocator_t free; - - void *_vd; /**0 If allocation was successful. */ -void * memory_alloc(struct memtype *m, size_t len); +void * memory_alloc(struct memory_type *m, size_t len); -void * memory_alloc_aligned(struct memtype *m, size_t len, size_t alignment); +void * memory_alloc_aligned(struct memory_type *m, size_t len, size_t alignment); -int memory_free(struct memtype *m, void *ptr, size_t len); - -struct memtype * memtype_managed_init(void *ptr, size_t len); - -extern struct memtype memtype_heap; -extern struct memtype memtype_hugepage; +int memory_free(struct memory_type *m, void *ptr, size_t len); #ifdef __cplusplus } diff --git a/include/villas/memory_type.h b/include/villas/memory_type.h new file mode 100644 index 000000000..4daae7bfd --- /dev/null +++ b/include/villas/memory_type.h @@ -0,0 +1,69 @@ +/** Memory allocators. 
+ * + * @file + * @author Steffen Vogel + * @copyright 2017, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + *********************************************************************************/ + +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct memory_type; + +typedef void *(*memzone_allocator_t)(struct memory_type *mem, size_t len, size_t alignment); +typedef int (*memzone_deallocator_t)(struct memory_type *mem, void *ptr, size_t len); + +enum memory_type_flags { + MEMORY_MMAP = (1 << 0), + MEMORY_DMA = (1 << 1), + MEMORY_HUGEPAGE = (1 << 2), + MEMORY_HEAP = (1 << 3) +}; + +struct memory_type { + const char *name; + int flags; + + size_t alignment; + + memzone_allocator_t alloc; + memzone_deallocator_t free; + + void *_vd; /**< Virtual data for internal state */ +}; + +extern struct memory_type memory_type_heap; +extern struct memory_type memory_hugepage; + +struct ibv_mr * memory_type_ib_mr(void *ptr); + +struct node; + +struct memory_type * memory_ib(struct node *n, struct memory_type *parent); +struct memory_type * memory_managed(void *ptr, size_t len); + +#ifdef __cplusplus +} +#endif diff --git a/include/villas/node.h b/include/villas/node.h index 60857489d..b63acb2cc 100644 --- a/include/villas/node.h +++ b/include/villas/node.h @@ -162,7 +162,7 @@ int node_write(struct node *n, struct sample *smps[], unsigned cnt); int node_fd(struct node *n); -struct memtype * node_memtype(struct node *n, struct memtype *parent); +struct memory_type * node_memory_type(struct node *n, struct memory_type *parent); #ifdef __cplusplus } diff --git a/include/villas/node_type.h b/include/villas/node_type.h index b34eb8d8f..b9026b1d3 100644 --- a/include/villas/node_type.h +++ b/include/villas/node_type.h @@ -164,7 +164,7 @@ struct node_type { int (*fd)(struct node *n); /** */ - struct memtype * (*memtype)(struct node *n, struct memtype *parent); + struct memory_type * (*memory_type)(struct node *n, struct memory_type *parent); }; /** Initialize all registered node type subsystems. diff --git a/include/villas/pool.h b/include/villas/pool.h index d09b6452c..7ab23a3c4 100644 --- a/include/villas/pool.h +++ b/include/villas/pool.h @@ -39,7 +39,7 @@ extern "C" { /** A thread-safe memory pool */ struct pool { off_t buffer_off; /**< Offset from the struct address to the underlying memory area */ - struct memtype *mem; + struct memory_type *mem; enum state state; @@ -62,7 +62,7 @@ struct pool { * @retval 0 The pool has been successfully initialized. * @retval <>0 There was an error during the pool initialization. */ -int pool_init(struct pool *p, size_t cnt, size_t blocksz, struct memtype *mem); +int pool_init(struct pool *p, size_t cnt, size_t blocksz, struct memory_type *mem); /** Destroy and release memory used by pool. 
*/ int pool_destroy(struct pool *p); diff --git a/include/villas/queue.h b/include/villas/queue.h index 2c7b8c669..b1630d28c 100644 --- a/include/villas/queue.h +++ b/include/villas/queue.h @@ -45,7 +45,7 @@ extern "C"{ #endif /* Forward declarations */ -struct memtype; +struct memory_type; #define CACHELINE_SIZE 64 typedef char cacheline_pad_t[CACHELINE_SIZE]; @@ -61,7 +61,7 @@ struct queue { atomic_state state; - struct memtype *mem; + struct memory_type *mem; size_t buffer_mask; off_t buffer_off; /**< Relative pointer to struct queue_cell[] */ @@ -77,7 +77,7 @@ struct queue { }; /** Initialize MPMC queue */ -int queue_init(struct queue *q, size_t size, struct memtype *mem); +int queue_init(struct queue *q, size_t size, struct memory_type *mem); /** Desroy MPMC queue and release memory */ int queue_destroy(struct queue *q); diff --git a/include/villas/queue_signalled.h b/include/villas/queue_signalled.h index 2d2a44fb3..0b73e7c5b 100644 --- a/include/villas/queue_signalled.h +++ b/include/villas/queue_signalled.h @@ -68,7 +68,7 @@ struct queue_signalled { #define queue_signalled_available(q) queue_available(&((q)->queue)) -int queue_signalled_init(struct queue_signalled *qs, size_t size, struct memtype *mem, int flags); +int queue_signalled_init(struct queue_signalled *qs, size_t size, struct memory_type *mem, int flags); int queue_signalled_destroy(struct queue_signalled *qs); diff --git a/lib/Makefile.villas.inc b/lib/Makefile.villas.inc index 120161f72..963d98511 100644 --- a/lib/Makefile.villas.inc +++ b/lib/Makefile.villas.inc @@ -26,9 +26,10 @@ LIB = $(BUILDDIR)/$(LIB_NAME).so.$(LIB_ABI_VERSION) # Object files for libvillas LIB_SRCS += $(addprefix lib/kernel/, kernel.c rt.c) \ + $(addprefix lib/memory/, heap.c hugepage.c ib.c managed.c) \ $(addprefix lib/, sample.c path.c node.c hook.c log.c log_config.c \ utils.c super_node.c hist.c timing.c pool.c list.c queue.c \ - queue_signalled.c memory.c memory_ib.c advio.c plugin.c node_type.c stats.c \ + queue_signalled.c memory.c advio.c plugin.c node_type.c stats.c \ mapping.c shmem.c config_helper.c crypt.c compat.c \ log_helper.c task.c buffer.c table.c bitset.c signal.c \ hash_table.c \ diff --git a/lib/api.c b/lib/api.c index 8ea079949..dbf31c23d 100644 --- a/lib/api.c +++ b/lib/api.c @@ -272,7 +272,7 @@ int api_init(struct api *a, struct super_node *sn) if (ret) return ret; - ret = queue_signalled_init(&a->pending, 1024, &memtype_heap, 0); + ret = queue_signalled_init(&a->pending, 1024, &memory_type_heap, 0); if (ret) return ret; diff --git a/lib/api/session.c b/lib/api/session.c index 8ad155efe..e421bff79 100644 --- a/lib/api/session.c +++ b/lib/api/session.c @@ -40,11 +40,11 @@ int api_session_init(struct api_session *s, enum api_mode m) if (ret) return ret; - ret = queue_init(&s->request.queue, 128, &memtype_heap); + ret = queue_init(&s->request.queue, 128, &memory_type_heap); if (ret) return ret; - ret = queue_init(&s->response.queue, 128, &memtype_heap); + ret = queue_init(&s->response.queue, 128, &memory_type_heap); if (ret) return ret; diff --git a/lib/memory.c b/lib/memory.c index 246be848a..34f1f299c 100644 --- a/lib/memory.c +++ b/lib/memory.c @@ -25,21 +25,13 @@ #include #include -#include #include #include -#include - -/* Required to allocate hugepages on Apple OS X */ -#ifdef __MACH__ - #include -#elif defined(__linux__) - #include -#endif #include #include #include +#include int memory_init(int hugepages) { @@ -77,7 +69,7 @@ int memory_init(int hugepages) return 0; } -void * memory_alloc(struct memtype *m, size_t len) 
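As a usage reference for the renamed interface above (struct memtype becomes struct memory_type, with memory_type_heap and memory_hugepage as the built-in instances), here is a minimal caller-side sketch. The include path, the function name example_buffer() and the sizes are assumptions for illustration only; the memory_free() signature is kept as it stands at this point of the series, i.e. still taking the type and length:

#include <villas/memory.h>

static int example_buffer(void)
{
	size_t len = 4096;

	/* Allocate a cacheline-aligned buffer from the plain heap type */
	void *buf = memory_alloc_aligned(&memory_type_heap, len, 64);
	if (!buf)
		return -1;

	/* ... fill the buffer ... */

	/* At this stage of the series the caller still passes type and length */
	return memory_free(&memory_type_heap, buf, len);
}
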
+void * memory_alloc(struct memory_type *m, size_t len) { void *ptr = m->alloc(m, len, sizeof(void *)); @@ -86,7 +78,7 @@ void * memory_alloc(struct memtype *m, size_t len) return ptr; } -void * memory_alloc_aligned(struct memtype *m, size_t len, size_t alignment) +void * memory_alloc_aligned(struct memory_type *m, size_t len, size_t alignment) { void *ptr = m->alloc(m, len, alignment); @@ -95,238 +87,9 @@ void * memory_alloc_aligned(struct memtype *m, size_t len, size_t alignment) return ptr; } -int memory_free(struct memtype *m, void *ptr, size_t len) +int memory_free(struct memory_type *m, void *ptr, size_t len) { debug(LOG_MEM | 5, "Releasing %#zx bytes of %s memory", len, m->name); return m->free(m, ptr, len); } - -static void * memory_heap_alloc(struct memtype *m, size_t len, size_t alignment) -{ - void *ptr; - int ret; - - if (alignment < sizeof(void *)) - alignment = sizeof(void *); - - ret = posix_memalign(&ptr, alignment, len); - - return ret ? NULL : ptr; -} - -int memory_heap_free(struct memtype *m, void *ptr, size_t len) -{ - free(ptr); - - return 0; -} - -/** Allocate memory backed by hugepages with malloc() like interface */ -static void * memory_hugepage_alloc(struct memtype *m, size_t len, size_t alignment) -{ - void *ret; - int prot = PROT_READ | PROT_WRITE; - int flags = MAP_PRIVATE | MAP_ANONYMOUS; - -#ifdef __MACH__ - flags |= VM_FLAGS_SUPERPAGE_SIZE_2MB; -#elif defined(__linux__) - flags |= MAP_HUGETLB; - - if (getuid() == 0) - flags |= MAP_LOCKED; -#endif - - ret = mmap(NULL, len, prot, flags, -1, 0); - if (ret == MAP_FAILED) - return NULL; - - return ret; -} - -static int memory_hugepage_free(struct memtype *m, void *ptr, size_t len) -{ - /** We must make sure that len is a multiple of the hugepage size - * - * See: https://lkml.org/lkml/2014/10/22/925 - */ - len = ALIGN(len, HUGEPAGESIZE); - - return munmap(ptr, len); -} - -void* memory_managed_alloc(struct memtype *m, size_t len, size_t alignment) -{ - /* Simple first-fit allocation */ - struct memblock *first = (struct memblock *) m->_vd; - struct memblock *block; - - for (block = first; block != NULL; block = block->next) { - if (block->flags & MEMBLOCK_USED) - continue; - - char* cptr = (char *) block + sizeof(struct memblock); - size_t avail = block->len; - uintptr_t uptr = (uintptr_t) cptr; - - /* Check alignment first; leave a gap at start of block to assure - * alignment if necessary */ - uintptr_t rem = uptr % alignment; - uintptr_t gap = 0; - if (rem != 0) { - gap = alignment - rem; - if (gap > avail) - continue; /* Next aligned address isn't in this block anymore */ - - cptr += gap; - avail -= gap; - } - - if (avail >= len) { - if (gap > sizeof(struct memblock)) { - /* The alignment gap is big enough to fit another block. - * The original block descriptor is already at the correct - * position, so we just change its len and create a new block - * descriptor for the actual block we're handling. */ - block->len = gap - sizeof(struct memblock); - struct memblock *newblock = (struct memblock *) (cptr - sizeof(struct memblock)); - newblock->prev = block; - newblock->next = block->next; - block->next = newblock; - newblock->flags = 0; - newblock->len = len; - block = newblock; - } - else { - /* The gap is too small to fit another block descriptor, so we - * must account for the gap length in the block length. 
*/ - block->len = len + gap; - } - - if (avail > len + sizeof(struct memblock)) { - /* Imperfect fit, so create another block for the remaining part */ - struct memblock *newblock = (struct memblock *) (cptr + len); - newblock->prev = block; - newblock->next = block->next; - block->next = newblock; - if (newblock->next) - newblock->next->prev = newblock; - newblock->flags = 0; - newblock->len = avail - len - sizeof(struct memblock); - } - else { - /* If this block was larger than the requested length, but only - * by less than sizeof(struct memblock), we may have wasted - * memory by previous assignments to block->len. */ - block->len = avail; - } - - block->flags |= MEMBLOCK_USED; - - return (void *) cptr; - } - } - - /* No suitable block found */ - return NULL; -} - -int memory_managed_free(struct memtype *m, void *ptr, size_t len) -{ - struct memblock *first = (struct memblock *) m->_vd; - struct memblock *block; - char *cptr = ptr; - - for (block = first; block != NULL; block = block->next) { - if (!(block->flags & MEMBLOCK_USED)) - continue; - - /* Since we may waste some memory at the start of a block to ensure - * alignment, ptr may not actually be the start of the block */ - if ((char *) block + sizeof(struct memblock) <= cptr && - cptr < (char *) block + sizeof(struct memblock) + block->len) { - /* Try to merge it with neighbouring free blocks */ - if (block->prev && !(block->prev->flags & MEMBLOCK_USED) && - block->next && !(block->next->flags & MEMBLOCK_USED)) { - /* Special case first: both previous and next block are unused */ - block->prev->len += block->len + block->next->len + 2 * sizeof(struct memblock); - block->prev->next = block->next->next; - if (block->next->next) - block->next->next->prev = block->prev; - } - else if (block->prev && !(block->prev->flags & MEMBLOCK_USED)) { - block->prev->len += block->len + sizeof(struct memblock); - block->prev->next = block->next; - if (block->next) - block->next->prev = block->prev; - } - else if (block->next && !(block->next->flags & MEMBLOCK_USED)) { - block->len += block->next->len + sizeof(struct memblock); - block->next = block->next->next; - if (block->next) - block->next->prev = block; - } - else { - /* no neighbouring free block, so just mark it as free */ - block->flags &= ~MEMBLOCK_USED; - } - - return 0; - } - } - - return -1; -} - -struct memtype * memtype_managed_init(void *ptr, size_t len) -{ - struct memtype *mt = ptr; - struct memblock *mb; - char *cptr = ptr; - - if (len < sizeof(struct memtype) + sizeof(struct memblock)) { - info("memtype_managed_init: passed region too small"); - return NULL; - } - - /* Initialize memtype */ - mt->name = "managed"; - mt->flags = 0; - mt->alloc = memory_managed_alloc; - mt->free = memory_managed_free; - mt->alignment = 1; - - cptr += ALIGN(sizeof(struct memtype), sizeof(void *)); - - /* Initialize first free memblock */ - mb = (struct memblock *) cptr; - mb->prev = NULL; - mb->next = NULL; - mb->flags = 0; - - cptr += ALIGN(sizeof(struct memblock), sizeof(void *)); - - mb->len = len - (cptr - (char *) ptr); - - mt->_vd = (void *) mb; - - return mt; -} - -/* List of available memory types */ -struct memtype memtype_heap = { - .name = "heap", - .flags = MEMORY_HEAP, - .alloc = memory_heap_alloc, - .free = memory_heap_free, - .alignment = 1 -}; - -struct memtype memtype_hugepage = { - .name = "mmap_hugepages", - .flags = MEMORY_MMAP | MEMORY_HUGEPAGE, - .alloc = memory_hugepage_alloc, - .free = memory_hugepage_free, - .alignment = 21 /* 2 MiB hugepage */ -}; diff --git 
a/lib/memory/heap.c b/lib/memory/heap.c new file mode 100644 index 000000000..7a70abf63 --- /dev/null +++ b/lib/memory/heap.c @@ -0,0 +1,54 @@ +/** Memory allocators. + * + * @author Steffen Vogel + * @copyright 2017, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + *********************************************************************************/ + +#include + +#include + +static void * memory_heap_alloc(struct memory_type *m, size_t len, size_t alignment) +{ + void *ptr; + int ret; + + if (alignment < sizeof(void *)) + alignment = sizeof(void *); + + ret = posix_memalign(&ptr, alignment, len); + + return ret ? NULL : ptr; +} + +int memory_heap_free(struct memory_type *m, void *ptr, size_t len) +{ + free(ptr); + + return 0; +} + +/* List of available memory types */ +struct memory_type memory_type_heap = { + .name = "heap", + .flags = MEMORY_HEAP, + .alloc = memory_heap_alloc, + .free = memory_heap_free, + .alignment = 1 +}; diff --git a/lib/memory/hugepage.c b/lib/memory/hugepage.c new file mode 100644 index 000000000..83abbcf15 --- /dev/null +++ b/lib/memory/hugepage.c @@ -0,0 +1,86 @@ +/** Hugepage memory allocator. + * + * @author Steffen Vogel + * @copyright 2017, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ *********************************************************************************/ + +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Required to allocate hugepages on Apple OS X */ +#ifdef __MACH__ + #include +#elif defined(__linux__) + #include +#endif + +#include +#include +#include + +#define HUGEPAGESIZE (1 << 21) /* 2 MiB */ + +/** Allocate memory backed by hugepages with malloc() like interface */ +static void * memory_hugepage_alloc(struct memory_type *m, size_t len, size_t alignment) +{ + void *ret; + int prot = PROT_READ | PROT_WRITE; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + +#ifdef __MACH__ + flags |= VM_FLAGS_SUPERPAGE_SIZE_2MB; +#elif defined(__linux__) + flags |= MAP_HUGETLB; + + if (getuid() == 0) + flags |= MAP_LOCKED; +#endif + + ret = mmap(NULL, len, prot, flags, -1, 0); + if (ret == MAP_FAILED) + return NULL; + + return ret; +} + +static int memory_hugepage_free(struct memory_type *m, void *ptr, size_t len) +{ + /** We must make sure that len is a multiple of the hugepage size + * + * See: https://lkml.org/lkml/2014/10/22/925 + */ + len = ALIGN(len, HUGEPAGESIZE); + + return munmap(ptr, len); +} + +struct memory_type memory_hugepage = { + .name = "mmap_hugepages", + .flags = MEMORY_MMAP | MEMORY_HUGEPAGE, + .alloc = memory_hugepage_alloc, + .free = memory_hugepage_free, + .alignment = 21 /* 2 MiB hugepage */ +}; diff --git a/lib/memory_ib.c b/lib/memory/ib.c similarity index 84% rename from lib/memory_ib.c rename to lib/memory/ib.c index 9a1432aaf..38955e4d6 100644 --- a/lib/memory_ib.c +++ b/lib/memory/ib.c @@ -21,9 +21,14 @@ *********************************************************************************/ #include -#include +#include #include +struct memory_ib { + struct ibv_pd *pd; + struct memory_type *parent; +}; + struct ibv_mr * memory_ib_mr(void *ptr) { struct ibv_mr *mr = (struct ibv_mr *) ptr; @@ -31,7 +36,7 @@ struct ibv_mr * memory_ib_mr(void *ptr) return (mr - 1); } -void * memory_ib_alloc(struct memtype *m, size_t len, size_t alignment) +void * memory_ib_alloc(struct memory_type *m, size_t len, size_t alignment) { struct memory_ib *mi = (struct memory_ib *) m->_vd; @@ -47,7 +52,7 @@ void * memory_ib_alloc(struct memtype *m, size_t len, size_t alignment) return ptr; } -int memory_ib_free(struct memtype *m, void *ptr, size_t len) +int memory_ib_free(struct memory_type *m, void *ptr, size_t len) { struct memory_ib *mi = (struct memory_ib *) m->_vd; struct ibv_mr *mr = memory_ib_mr(ptr); @@ -62,10 +67,10 @@ int memory_ib_free(struct memtype *m, void *ptr, size_t len) return 0; } -struct memtype * ib_memtype(struct node *n, struct memtype *parent) +struct memory_type * memory_ib(struct node *n, struct memory_type *parent) { struct infiniband *i = (struct infiniband *) n->_vd; - struct memtype *mt = malloc(sizeof(struct memtype)); + struct memory_type *mt = malloc(sizeof(struct memory_type)); mt->name = "ib"; mt->flags = 0; diff --git a/lib/memory/managed.c b/lib/memory/managed.c new file mode 100644 index 000000000..05ea6f439 --- /dev/null +++ b/lib/memory/managed.c @@ -0,0 +1,193 @@ +/** Memory allocators. 
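The hugepage allocator above has to release mappings with the same length the kernel actually reserved: as the comment in memory_hugepage_free() notes, a MAP_HUGETLB mapping is unmapped in whole 2 MiB pages, so the length is rounded up before munmap(). A small self-contained sketch of that rounding follows; the local round_up() helper and EXAMPLE_HUGEPAGESIZE constant merely stand in for the ALIGN() macro and HUGEPAGESIZE define used in the patch and are not part of the tree:

#include <stddef.h>
#include <stdio.h>

#define EXAMPLE_HUGEPAGESIZE (1 << 21) /* 2 MiB, as defined in this patch */

/* Round len up to the next multiple of align (align must be a power of two) */
static size_t round_up(size_t len, size_t align)
{
	return (len + align - 1) & ~(align - 1);
}

int main(void)
{
	/* 100 bytes still occupy one full hugepage */
	printf("%zu\n", round_up(100, EXAMPLE_HUGEPAGESIZE));     /* 2097152 */

	/* 3 MiB spills over into a second hugepage */
	printf("%zu\n", round_up(3 << 20, EXAMPLE_HUGEPAGESIZE)); /* 4194304 */

	return 0;
}
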
+ * + * @author Steffen Vogel + * @copyright 2017, Institute for Automation of Complex Power Systems, EONERC + * @license GNU General Public License (version 3) + * + * VILLASnode + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + *********************************************************************************/ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +void* memory_managed_alloc(struct memory_type *m, size_t len, size_t alignment) +{ + /* Simple first-fit allocation */ + struct memblock *first = (struct memblock *) m->_vd; + struct memblock *block; + + for (block = first; block != NULL; block = block->next) { + if (block->flags & MEMBLOCK_USED) + continue; + + char* cptr = (char *) block + sizeof(struct memblock); + size_t avail = block->len; + uintptr_t uptr = (uintptr_t) cptr; + + /* Check alignment first; leave a gap at start of block to assure + * alignment if necessary */ + uintptr_t rem = uptr % alignment; + uintptr_t gap = 0; + if (rem != 0) { + gap = alignment - rem; + if (gap > avail) + continue; /* Next aligned address isn't in this block anymore */ + + cptr += gap; + avail -= gap; + } + + if (avail >= len) { + if (gap > sizeof(struct memblock)) { + /* The alignment gap is big enough to fit another block. + * The original block descriptor is already at the correct + * position, so we just change its len and create a new block + * descriptor for the actual block we're handling. */ + block->len = gap - sizeof(struct memblock); + struct memblock *newblock = (struct memblock *) (cptr - sizeof(struct memblock)); + newblock->prev = block; + newblock->next = block->next; + block->next = newblock; + newblock->flags = 0; + newblock->len = len; + block = newblock; + } + else { + /* The gap is too small to fit another block descriptor, so we + * must account for the gap length in the block length. */ + block->len = len + gap; + } + + if (avail > len + sizeof(struct memblock)) { + /* Imperfect fit, so create another block for the remaining part */ + struct memblock *newblock = (struct memblock *) (cptr + len); + newblock->prev = block; + newblock->next = block->next; + block->next = newblock; + if (newblock->next) + newblock->next->prev = newblock; + newblock->flags = 0; + newblock->len = avail - len - sizeof(struct memblock); + } + else { + /* If this block was larger than the requested length, but only + * by less than sizeof(struct memblock), we may have wasted + * memory by previous assignments to block->len. 
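To make the role of this first-fit allocator concrete, a hedged usage sketch: a region obtained from the heap type is handed to memory_managed() (defined further down in this file), and sub-allocations are then carved out of it. This mirrors the manager test in tests/unit/memory.c later in this patch; the function name, include path and sizes are illustrative assumptions, and the memory_free() signature is the one used at this stage of the series:

#include <villas/memory.h>

static int example_managed_region(void)
{
	size_t region_len = 1 << 12; /* 4 KiB region, arbitrary */

	/* Back the managed region with a plain heap allocation */
	void *region = memory_alloc(&memory_type_heap, region_len);
	if (!region)
		return -1;

	/* The region now hosts its own memory_type descriptor and block list */
	struct memory_type *m = memory_managed(region, region_len);
	if (!m)
		return -1;

	/* Sub-allocations come out of the managed region, not the heap */
	void *a = memory_alloc(m, 64);
	void *b = memory_alloc_aligned(m, 128, 64);
	if (!a || !b)
		return -1;

	memory_free(m, b, 128);
	memory_free(m, a, 64);

	return memory_free(&memory_type_heap, region, region_len);
}
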
*/ + block->len = avail; + } + + block->flags |= MEMBLOCK_USED; + + return (void *) cptr; + } + } + + /* No suitable block found */ + return NULL; +} + +int memory_managed_free(struct memory_type *m, void *ptr, size_t len) +{ + struct memblock *first = (struct memblock *) m->_vd; + struct memblock *block; + char *cptr = ptr; + + for (block = first; block != NULL; block = block->next) { + if (!(block->flags & MEMBLOCK_USED)) + continue; + + /* Since we may waste some memory at the start of a block to ensure + * alignment, ptr may not actually be the start of the block */ + if ((char *) block + sizeof(struct memblock) <= cptr && + cptr < (char *) block + sizeof(struct memblock) + block->len) { + /* Try to merge it with neighbouring free blocks */ + if (block->prev && !(block->prev->flags & MEMBLOCK_USED) && + block->next && !(block->next->flags & MEMBLOCK_USED)) { + /* Special case first: both previous and next block are unused */ + block->prev->len += block->len + block->next->len + 2 * sizeof(struct memblock); + block->prev->next = block->next->next; + if (block->next->next) + block->next->next->prev = block->prev; + } + else if (block->prev && !(block->prev->flags & MEMBLOCK_USED)) { + block->prev->len += block->len + sizeof(struct memblock); + block->prev->next = block->next; + if (block->next) + block->next->prev = block->prev; + } + else if (block->next && !(block->next->flags & MEMBLOCK_USED)) { + block->len += block->next->len + sizeof(struct memblock); + block->next = block->next->next; + if (block->next) + block->next->prev = block; + } + else { + /* no neighbouring free block, so just mark it as free */ + block->flags &= ~MEMBLOCK_USED; + } + + return 0; + } + } + + return -1; +} + +struct memory_type * memory_managed(void *ptr, size_t len) +{ + struct memory_type *mt = ptr; + struct memblock *mb; + char *cptr = ptr; + + if (len < sizeof(struct memory_type) + sizeof(struct memblock)) { + info("memory_managed: passed region too small"); + return NULL; + } + + /* Initialize memory_type */ + mt->name = "managed"; + mt->flags = 0; + mt->alloc = memory_managed_alloc; + mt->free = memory_managed_free; + mt->alignment = 1; + + cptr += ALIGN(sizeof(struct memory_type), sizeof(void *)); + + /* Initialize first free memblock */ + mb = (struct memblock *) cptr; + mb->prev = NULL; + mb->next = NULL; + mb->flags = 0; + + cptr += ALIGN(sizeof(struct memblock), sizeof(void *)); + + mb->len = len - (cptr - (char *) ptr); + + mt->_vd = (void *) mb; + + return mt; +} diff --git a/lib/node.c b/lib/node.c index 4b028c471..e7c9f7a7f 100644 --- a/lib/node.c +++ b/lib/node.c @@ -549,9 +549,9 @@ int node_fd(struct node *n) return n->_vt->fd ? n->_vt->fd(n) : -1; } -struct memtype * node_memtype(struct node *n, struct memtype *parent) +struct memory_type * node_memory_type(struct node *n, struct memory_type *parent) { - return n->_vt->memtype(n, parent) ? n->_vt->memtype(n, parent) : &memtype_hugepage; + return n->_vt->memory_type ? 
n->_vt->memory_type(n, parent) : &memory_hugepage; } int node_parse_list(struct list *list, json_t *cfg, struct list *all) diff --git a/lib/nodes/iec61850_sv.c b/lib/nodes/iec61850_sv.c index 70c91efca..82dadf238 100644 --- a/lib/nodes/iec61850_sv.c +++ b/lib/nodes/iec61850_sv.c @@ -294,11 +294,11 @@ int iec61850_sv_start(struct node *n) SVReceiver_addSubscriber(i->subscriber.receiver, i->subscriber.subscriber); /* Initialize pool and queue to pass samples between threads */ - ret = pool_init(&i->subscriber.pool, 1024, SAMPLE_LEN(n->samplelen), &memtype_hugepage); + ret = pool_init(&i->subscriber.pool, 1024, SAMPLE_LEN(n->samplelen), &memory_hugepage); if (ret) return ret; - ret = queue_signalled_init(&i->subscriber.queue, 1024, &memtype_hugepage, 0); + ret = queue_signalled_init(&i->subscriber.queue, 1024, &memory_hugepage, 0); if (ret) return ret; } diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index efe372816..c732d756c 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include @@ -240,7 +240,7 @@ static void ib_build_ibv(struct node *n) pool_init(&ib->mem.p_recv, ib->qp_init.cap.max_recv_wr, 64*sizeof(double), - &memtype_heap); + &memory_type_heap); if(ret) { error("Failed to init recv memory pool of node %s: %s", @@ -271,7 +271,7 @@ static void ib_build_ibv(struct node *n) pool_init(&ib->mem.p_send, ib->qp_init.cap.max_send_wr, sizeof(double), - &memtype_heap); + &memory_type_heap); if(ret) { error("Failed to init send memory of node %s: %s", @@ -839,7 +839,7 @@ static struct plugin p = { .read = ib_read, .write = ib_write, .fd = ib_fd, - .memtype = ib_memtype + .memory_type = memory_ib } }; diff --git a/lib/nodes/loopback.c b/lib/nodes/loopback.c index e25efcd67..62b11519c 100644 --- a/lib/nodes/loopback.c +++ b/lib/nodes/loopback.c @@ -50,11 +50,11 @@ int loopback_open(struct node *n) int ret; struct loopback *l = (struct loopback *) n->_vd; - ret = pool_init(&l->pool, l->queuelen, SAMPLE_LEN(n->samplelen), &memtype_hugepage); + ret = pool_init(&l->pool, l->queuelen, SAMPLE_LEN(n->samplelen), &memory_hugepage); if (ret) return ret; - return queue_signalled_init(&l->queue, l->queuelen, &memtype_hugepage, QUEUE_SIGNALLED_EVENTFD); + return queue_signalled_init(&l->queue, l->queuelen, &memory_hugepage, QUEUE_SIGNALLED_EVENTFD); } int loopback_close(struct node *n) diff --git a/lib/nodes/mqtt.c b/lib/nodes/mqtt.c index bc3692f13..893d10d8d 100644 --- a/lib/nodes/mqtt.c +++ b/lib/nodes/mqtt.c @@ -301,11 +301,11 @@ int mqtt_start(struct node *n) if (ret) return ret; - ret = pool_init(&m->pool, 1024, SAMPLE_LEN(n->samplelen), &memtype_hugepage); + ret = pool_init(&m->pool, 1024, SAMPLE_LEN(n->samplelen), &memory_hugepage); if (ret) return ret; - ret = queue_signalled_init(&m->queue, 1024, &memtype_hugepage, 0); + ret = queue_signalled_init(&m->queue, 1024, &memory_hugepage, 0); if (ret) return ret; diff --git a/lib/nodes/websocket.c b/lib/nodes/websocket.c index b30926fff..a81b8b2e8 100644 --- a/lib/nodes/websocket.c +++ b/lib/nodes/websocket.c @@ -81,7 +81,7 @@ static int websocket_connection_init(struct websocket_connection *c) c->_name = NULL; - ret = queue_init(&c->queue, DEFAULT_QUEUELEN, &memtype_hugepage); + ret = queue_init(&c->queue, DEFAULT_QUEUELEN, &memory_hugepage); if (ret) return ret; @@ -400,11 +400,11 @@ int websocket_start(struct node *n) int ret; struct websocket *w = (struct websocket *) n->_vd; - ret = pool_init(&w->pool, DEFAULT_WEBSOCKET_QUEUELEN, 
SAMPLE_LEN(DEFAULT_WEBSOCKET_SAMPLELEN), &memtype_hugepage); + ret = pool_init(&w->pool, DEFAULT_WEBSOCKET_QUEUELEN, SAMPLE_LEN(DEFAULT_WEBSOCKET_SAMPLELEN), &memory_hugepage); if (ret) return ret; - ret = queue_signalled_init(&w->queue, DEFAULT_WEBSOCKET_QUEUELEN, &memtype_hugepage, 0); + ret = queue_signalled_init(&w->queue, DEFAULT_WEBSOCKET_QUEUELEN, &memory_hugepage, 0); if (ret) return ret; diff --git a/lib/path.c b/lib/path.c index 7763e1aa5..b3bb3d9e0 100644 --- a/lib/path.c +++ b/lib/path.c @@ -46,7 +46,7 @@ static int path_source_init(struct path_source *ps) { int ret; - ret = pool_init(&ps->pool, MAX(DEFAULT_QUEUELEN, ps->node->in.vectorize), SAMPLE_LEN(ps->node->samplelen), &memtype_hugepage); + ret = pool_init(&ps->pool, MAX(DEFAULT_QUEUELEN, ps->node->in.vectorize), SAMPLE_LEN(ps->node->samplelen), &memory_hugepage); if (ret) return ret; @@ -148,7 +148,7 @@ static int path_destination_init(struct path_destination *pd, int queuelen) { int ret; - ret = queue_init(&pd->queue, queuelen, &memtype_hugepage); + ret = queue_init(&pd->queue, queuelen, &memory_hugepage); if (ret) return ret; @@ -430,7 +430,7 @@ int path_init2(struct path *p) if (!p->samplelen) p->samplelen = DEFAULT_SAMPLELEN; - ret = pool_init(&p->pool, MAX(1, list_length(&p->destinations)) * p->queuelen, SAMPLE_LEN(p->samplelen), &memtype_hugepage); + ret = pool_init(&p->pool, MAX(1, list_length(&p->destinations)) * p->queuelen, SAMPLE_LEN(p->samplelen), &memory_hugepage); if (ret) return ret; diff --git a/lib/pool.c b/lib/pool.c index 0f77df83a..78b47f2c8 100644 --- a/lib/pool.c +++ b/lib/pool.c @@ -25,7 +25,7 @@ #include #include -int pool_init(struct pool *p, size_t cnt, size_t blocksz, struct memtype *m) +int pool_init(struct pool *p, size_t cnt, size_t blocksz, struct memory_type *m) { int ret; diff --git a/lib/queue.c b/lib/queue.c index af65d0bde..438a2c523 100644 --- a/lib/queue.c +++ b/lib/queue.c @@ -36,7 +36,7 @@ #include /** Initialize MPMC queue */ -int queue_init(struct queue *q, size_t size, struct memtype *mem) +int queue_init(struct queue *q, size_t size, struct memory_type *mem) { assert(q->state == STATE_DESTROYED); diff --git a/lib/queue_signalled.c b/lib/queue_signalled.c index a37caa316..bc625ecd6 100644 --- a/lib/queue_signalled.c +++ b/lib/queue_signalled.c @@ -36,7 +36,7 @@ static void queue_signalled_cleanup(void *p) pthread_mutex_unlock(&qs->pthread.mutex); } -int queue_signalled_init(struct queue_signalled *qs, size_t size, struct memtype *mem, int flags) +int queue_signalled_init(struct queue_signalled *qs, size_t size, struct memory_type *mem, int flags) { int ret; diff --git a/lib/shmem.c b/lib/shmem.c index 56020a249..65042a155 100644 --- a/lib/shmem.c +++ b/lib/shmem.c @@ -35,8 +35,8 @@ size_t shmem_total_size(int queuelen, int samplelen) { - /* We have the constant const of the memtype header */ - return sizeof(struct memtype) + /* We have the constant const of the memory_type header */ + return sizeof(struct memory_type) /* and the shared struct itself */ + sizeof(struct shmem_shared) /* the size of the actual queue and the queue for the pool */ @@ -55,7 +55,7 @@ int shmem_int_open(const char *wname, const char* rname, struct shmem_int *shm, int fd, ret; size_t len; void *base; - struct memtype *manager; + struct memory_type *manager; struct shmem_shared *shared; struct stat stat_buf; sem_t *sem_own, *sem_other; @@ -92,7 +92,7 @@ retry: fd = shm_open(wname, O_RDWR|O_CREAT|O_EXCL, 0600); close(fd); - manager = memtype_managed_init(base, len); + manager = memory_managed(base, len); 
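Several node types touched above (iec61850_sv, loopback, mqtt, websocket) now follow the same start-up pattern with the renamed types: a sample pool plus a signalled queue, both backed by the hugepage memory type. A condensed sketch of that pattern is shown below; the function name, the queue length of 1024 and the include paths are assumptions, and the pool and queue structs are expected to start out zeroed (STATE_DESTROYED), as they do inside the node structs:

#include <villas/pool.h>
#include <villas/queue_signalled.h>
#include <villas/memory.h>
#include <villas/sample.h>

static int example_node_buffers(struct pool *pool, struct queue_signalled *queue, int samplelen)
{
	int ret;

	/* Pool of samples large enough for 'samplelen' values, hugepage-backed */
	ret = pool_init(pool, 1024, SAMPLE_LEN(samplelen), &memory_hugepage);
	if (ret)
		return ret;

	/* Signalled queue used to pass samples between threads */
	return queue_signalled_init(queue, 1024, &memory_hugepage, 0);
}
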
shared = memory_alloc(manager, sizeof(struct shmem_shared)); if (!shared) { errno = ENOMEM; @@ -144,7 +144,7 @@ retry: fd = shm_open(wname, O_RDWR|O_CREAT|O_EXCL, 0600); if (base == MAP_FAILED) return -10; - cptr = (char *) base + sizeof(struct memtype) + sizeof(struct memblock); + cptr = (char *) base + sizeof(struct memory_type) + sizeof(struct memblock); shared = (struct shmem_shared *) cptr; shm->read.base = base; shm->read.name = rname; diff --git a/src/hook.c b/src/hook.c index ecf31cf9f..1d444b458 100644 --- a/src/hook.c +++ b/src/hook.c @@ -182,7 +182,7 @@ check: if (optarg == endptr) smps = alloc(cnt * sizeof(struct sample *)); - ret = pool_init(&q, 10 * cnt, SAMPLE_LEN(DEFAULT_SAMPLELEN), &memtype_hugepage); + ret = pool_init(&q, 10 * cnt, SAMPLE_LEN(DEFAULT_SAMPLELEN), &memory_hugepage); if (ret) error("Failed to initilize memory pool"); diff --git a/src/pipe.c b/src/pipe.c index 2b0260398..67e68435e 100644 --- a/src/pipe.c +++ b/src/pipe.c @@ -132,7 +132,7 @@ static void * send_loop(void *ctx) struct sample *smps[node->out.vectorize]; /* Initialize memory */ - ret = pool_init(&sendd.pool, LOG2_CEIL(node->out.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_hugepage)); + ret = pool_init(&sendd.pool, LOG2_CEIL(node->out.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memory_type(node, &memory_hugepage)); if (ret < 0) error("Failed to allocate memory for receive pool."); @@ -196,7 +196,7 @@ static void * recv_loop(void *ctx) struct sample *smps[node->in.vectorize]; /* Initialize memory */ - ret = pool_init(&recvv.pool, LOG2_CEIL(node->in.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_hugepage)); + ret = pool_init(&recvv.pool, LOG2_CEIL(node->in.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memory_type(node, &memory_hugepage)); if (ret < 0) error("Failed to allocate memory for receive pool."); diff --git a/src/signal.c b/src/signal.c index 324f12733..31e47716c 100644 --- a/src/signal.c +++ b/src/signal.c @@ -155,7 +155,7 @@ int main(int argc, char *argv[]) if (ret) error("Failed to verify node configuration"); - ret = pool_init(&q, 16, SAMPLE_LEN(n.samplelen), &memtype_heap); + ret = pool_init(&q, 16, SAMPLE_LEN(n.samplelen), &memory_type_heap); if (ret) error("Failed to initialize pool"); diff --git a/src/test-cmp.c b/src/test-cmp.c index 0c5bad66e..52d93c5b0 100644 --- a/src/test-cmp.c +++ b/src/test-cmp.c @@ -122,7 +122,7 @@ check: if (optarg == endptr) int n = argc - optind; /* The number of files which we compare */ struct side s[n]; - ret = pool_init(&pool, n, SAMPLE_LEN(DEFAULT_SAMPLELEN), &memtype_heap); + ret = pool_init(&pool, n, SAMPLE_LEN(DEFAULT_SAMPLELEN), &memory_type_heap); if (ret) error("Failed to initialize pool"); diff --git a/tests/unit/io.c b/tests/unit/io.c index 4abdc57a2..8c2338cd9 100644 --- a/tests/unit/io.c +++ b/tests/unit/io.c @@ -185,7 +185,7 @@ ParameterizedTest(char *fmt, io, lowlevel) struct sample *smps[NUM_SAMPLES]; struct sample *smpt[NUM_SAMPLES]; - ret = pool_init(&p, 2 * NUM_SAMPLES, SAMPLE_LEN(NUM_VALUES), &memtype_hugepage); + ret = pool_init(&p, 2 * NUM_SAMPLES, SAMPLE_LEN(NUM_VALUES), &memory_hugepage); cr_assert_eq(ret, 0); info("Running test for format = %s", fmt); @@ -232,7 +232,7 @@ ParameterizedTest(char *fmt, io, highlevel) info("Running test for format = %s", fmt); - ret = pool_init(&p, 2 * NUM_SAMPLES, SAMPLE_LEN(NUM_VALUES), &memtype_hugepage); + ret = pool_init(&p, 2 * NUM_SAMPLES, SAMPLE_LEN(NUM_VALUES), &memory_hugepage); cr_assert_eq(ret, 0); generate_samples(&p, smps, smpt, 
NUM_SAMPLES, NUM_VALUES); diff --git a/tests/unit/memory.c b/tests/unit/memory.c index 55435d45c..38ccbe51a 100644 --- a/tests/unit/memory.c +++ b/tests/unit/memory.c @@ -28,13 +28,15 @@ #include #include +#define HUGEPAGESIZE (1<<22) + TheoryDataPoints(memory, aligned) = { DataPoints(size_t, 1, 32, 55, 1 << 10, 1 << 20), DataPoints(size_t, 1, 8, 1 << 12), - DataPoints(struct memtype *, &memtype_heap, &memtype_hugepage) + DataPoints(struct memory_type *, &memory_type_heap, &memory_hugepage) }; -Theory((size_t len, size_t align, struct memtype *m), memory, aligned) { +Theory((size_t len, size_t align, struct memory_type *m), memory, aligned) { int ret; void *ptr; @@ -43,7 +45,7 @@ Theory((size_t len, size_t align, struct memtype *m), memory, aligned) { cr_assert(IS_ALIGNED(ptr, align)); - if (m == &memtype_hugepage) { + if (m == &memory_hugepage) { cr_assert(IS_ALIGNED(ptr, HUGEPAGESIZE)); } @@ -57,15 +59,15 @@ Test(memory, manager) { int ret; void *p, *p1, *p2, *p3; - struct memtype *m; + struct memory_type *m; total_size = 1 << 10; - max_block = total_size - sizeof(struct memtype) - sizeof(struct memblock); + max_block = total_size - sizeof(struct memory_type) - sizeof(struct memblock); - p = memory_alloc(&memtype_heap, total_size); + p = memory_alloc(&memory_type_heap, total_size); cr_assert_not_null(p); - m = memtype_managed_init(p, total_size); + m = memory_managed(p, total_size); cr_assert_not_null(m); p1 = memory_alloc(m, 16); @@ -100,6 +102,6 @@ Test(memory, manager) { ret = memory_free(m, p1, max_block); cr_assert(ret == 0); - ret = memory_free(&memtype_heap, p, total_size); + ret = memory_free(&memory_type_heap, p, total_size); cr_assert(ret == 0); } diff --git a/tests/unit/pool.c b/tests/unit/pool.c index d2ef6160a..e2958185a 100644 --- a/tests/unit/pool.c +++ b/tests/unit/pool.c @@ -32,16 +32,16 @@ struct param { int thread_count; int pool_size; size_t block_size; - struct memtype *memtype; + struct memory_type *memory_type; }; ParameterizedTestParameters(pool, basic) { static struct param params[] = { - { 1, 4096, 150, &memtype_heap }, - { 1, 128, 8, &memtype_hugepage }, - { 1, 4, 8192, &memtype_hugepage }, - { 1, 1 << 13, 4, &memtype_heap } + { 1, 4096, 150, &memory_type_heap }, + { 1, 128, 8, &memory_hugepage }, + { 1, 4, 8192, &memory_hugepage }, + { 1, 1 << 13, 4, &memory_type_heap } }; return cr_make_param_array(struct param, params, ARRAY_LEN(params)); @@ -54,7 +54,7 @@ ParameterizedTest(struct param *p, pool, basic) void *ptr, *ptrs[p->pool_size]; - ret = pool_init(&pool, p->pool_size, p->block_size, p->memtype); + ret = pool_init(&pool, p->pool_size, p->block_size, p->memory_type); cr_assert_eq(ret, 0, "Failed to create pool"); ptr = pool_get(&pool); diff --git a/tests/unit/queue.c b/tests/unit/queue.c index fad646c11..dca93f12d 100644 --- a/tests/unit/queue.c +++ b/tests/unit/queue.c @@ -51,7 +51,7 @@ struct param { int batch_size; void * (*thread_func)(void *); struct queue queue; - const struct memtype *memtype; + const struct memory_type *memory_type; }; /** Get thread id as integer @@ -243,7 +243,7 @@ Test(queue, single_threaded) .start = 1 /* we start immeadiatly */ }; - ret = queue_init(&p.queue, p.queue_size, &memtype_heap); + ret = queue_init(&p.queue, p.queue_size, &memory_type_heap); cr_assert_eq(ret, 0, "Failed to create queue"); producer(&p); @@ -265,35 +265,35 @@ ParameterizedTestParameters(queue, multi_threaded) .thread_count = 32, .thread_func = producer_consumer_many, .batch_size = 10, - .memtype = &memtype_heap + .memory_type = &memory_type_heap }, { 
.iter_count = 1 << 8, .queue_size = 1 << 9, .thread_count = 4, .thread_func = producer_consumer_many, .batch_size = 100, - .memtype = &memtype_heap + .memory_type = &memory_type_heap }, { .iter_count = 1 << 16, .queue_size = 1 << 14, .thread_count = 16, .thread_func = producer_consumer_many, .batch_size = 100, - .memtype = &memtype_heap + .memory_type = &memory_type_heap }, { .iter_count = 1 << 8, .queue_size = 1 << 9, .thread_count = 4, .thread_func = producer_consumer_many, .batch_size = 10, - .memtype = &memtype_heap + .memory_type = &memory_type_heap }, { .iter_count = 1 << 16, .queue_size = 1 << 9, .thread_count = 16, .thread_func = producer_consumer, .batch_size = 10, - .memtype = &memtype_hugepage + .memory_type = &memory_hugepage } }; @@ -308,7 +308,7 @@ ParameterizedTest(struct param *p, queue, multi_threaded, .timeout = 20) p->start = 0; - ret = queue_init(&p->queue, p->queue_size, &memtype_heap); + ret = queue_init(&p->queue, p->queue_size, &memory_type_heap); cr_assert_eq(ret, 0, "Failed to create queue"); uint64_t start_tsc_time, end_tsc_time; @@ -350,7 +350,7 @@ Test(queue, init_destroy) int ret; struct queue q = { .state = STATE_DESTROYED }; - ret = queue_init(&q, 1024, &memtype_heap); + ret = queue_init(&q, 1024, &memory_type_heap); cr_assert_eq(ret, 0); /* Should succeed */ ret = queue_destroy(&q); diff --git a/tests/unit/queue_signalled.c b/tests/unit/queue_signalled.c index 0acf8d1ea..030ef9117 100644 --- a/tests/unit/queue_signalled.c +++ b/tests/unit/queue_signalled.c @@ -132,7 +132,7 @@ ParameterizedTest(struct param *param, queue_signalled, simple, .timeout = 5) pthread_t t1, t2; - ret = queue_signalled_init(&q, LOG2_CEIL(NUM_ELEM), &memtype_heap, param->flags); + ret = queue_signalled_init(&q, LOG2_CEIL(NUM_ELEM), &memory_type_heap, param->flags); cr_assert_eq(ret, 0, "Failed to initialize queue: flags=%#x, ret=%d", param->flags, ret); ret = pthread_create(&t1, NULL, producer, &q); From 669d75a666b85b6fa22b5cc5168f91c7ad356afe Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Mon, 2 Jul 2018 16:03:16 +0200 Subject: [PATCH 21/35] Basic implementation of zero-copying is done. The is still a problem with rread = hook_read_list. It doesn't return anything after the fourth read --- include/villas/nodes/infiniband.h | 4 +- lib/nodes/infiniband.c | 141 ++++++++++++++++-------------- src/pipe.c | 4 +- 3 files changed, 80 insertions(+), 69 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index 6f54f04f7..7fa5f2285 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -40,7 +40,7 @@ typedef void (*ib_on_completion)(struct node*, struct ibv_wc*, int*); typedef void* (*ib_poll_function)(void*); /* Enums */ -enum poll_mode_e +enum poll_mode_e { EVENT, BUSY @@ -91,7 +91,7 @@ struct infiniband { pthread_t stop_thread; int rdma_disconnect_called; - int used_recv_wrs; + int available_recv_wrs; } conn; /* Memory related variables */ diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index efe372816..f12bfa215 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -88,15 +88,11 @@ int ib_post_recv_wrs(struct node *n) return ret; } -void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size) -{ - //ToDo: No implementation yet. 
This is still handled in ib_read -} +void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size){} void ib_completion_source(struct node* n, struct ibv_wc* wc, int* size) { struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; - struct sample* smpl; for(int i=0; i<*size; i++) { @@ -115,8 +111,7 @@ void ib_completion_source(struct node* n, struct ibv_wc* wc, int* size) else { // Release sample - smpl = (struct sample*)wc[i].wr_id; - sample_put(smpl); + sample_put((struct sample*)(wc[i].wr_id)); } } } @@ -144,19 +139,19 @@ void * ib_event_thread(void *n) void * ib_busy_poll_thread(void *n) { - struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; - struct ibv_wc wc[ib->cq_size]; - int size; + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct ibv_wc wc[ib->cq_size]; + int size; - while(1) - { - // Poll as long as WCs are available - while((size = ibv_poll_cq(ib->ctx.cq, ib->cq_size, wc))) - ib->poll.on_compl(n, wc, &size); + while(1) + { + // Poll as long as WCs are available + while((size = ibv_poll_cq(ib->ctx.cq, ib->cq_size, wc))) + ib->poll.on_compl(n, wc, &size); - if(ib->poll.stopThread) - return NULL; - } + if(ib->poll.stopThread) + return NULL; + } } static void ib_init_wc_poll(struct node *n) @@ -239,7 +234,7 @@ static void ib_build_ibv(struct node *n) // Set pool size to maximum size of Receive Queue pool_init(&ib->mem.p_recv, ib->qp_init.cap.max_recv_wr, - 64*sizeof(double), + SAMPLE_DATA_LEN(DEFAULT_SAMPLELEN), &memtype_heap); if(ret) { @@ -297,16 +292,16 @@ static void ib_build_ibv(struct node *n) // Post Receive Work Requests to be able to receive data // Fill complete Receive Queue during initialization - for(int i=0; iqp_init.cap.max_recv_wr; i++) - { - ret = ib_post_recv_wrs(n); - if(ret) - { - error("Failed to post initial receive Work Requests of node %s.", - node_name(n)); - } - } - info("Filled the complete Receive Queue."); + //for(int i=0; iqp_init.cap.max_recv_wr; i++) + //{ + // ret = ib_post_recv_wrs(n); + // if(ret) + // { + // error("Failed to post initial receive Work Requests of node %s.", + // node_name(n)); + // } + //} + //info("Filled the complete Receive Queue."); } static int ib_addr_resolved(struct node *n) @@ -514,8 +509,8 @@ int ib_parse(struct node *n, json_t *cfg) ib->qp_init.cap.max_send_wr = max_send_wr; ib->qp_init.cap.max_recv_wr = max_recv_wr; - // Set used receive Work Requests to 0 - ib->conn.used_recv_wrs = 0; + // Set available receive Work Requests to 0 + ib->conn.available_recv_wrs = 0; // Set remaining QP attributes ib->qp_init.cap.max_send_sge = 1; @@ -712,49 +707,65 @@ int ib_deinit() int ib_read(struct node *n, struct sample *smps[], unsigned cnt) { - //Create separate thread for polling! 
This impelemtation is just - //for testing purposes struct infiniband *ib = (struct infiniband *) n->_vd; + struct ibv_wc wc[n->in.vectorize]; + struct ibv_recv_wr wr, *bad_wr = NULL; + struct ibv_sge sge; + struct ibv_mr ** mr; + struct pool *p; int ret; - struct ibv_wc wc[cnt]; - char *ptr; - ret = ibv_poll_cq(ib->ctx.cq, cnt, wc); + ret = ibv_poll_cq(ib->ctx.cq, n->in.vectorize, wc); if(ret) { - ib->conn.used_recv_wrs += ret; + ib->conn.available_recv_wrs -= ret; - for(int i=0; ilength = wc[i].byte_len/sizeof(double); + smps[i]->capacity = DEFAULT_SAMPLELEN; - //Release memory - pool_put(&ib->mem.p_recv, (double*)(wc[i].wr_id)); - } - smps[i]->length = wc[i].byte_len/sizeof(double); - smps[i]->capacity = 64; - memcpy(smps[i]->data, ptr, wc[i].byte_len); - } + //Release sample + sample_put(smps[i]); + + } + } } - else + else if(ib->conn.available_recv_wrs < ib->qp_init.cap.max_recv_wr) { - //No data received? Put new receive Work Requests to Receive Queue - for(int i=0; iconn.used_recv_wrs; i++) - ib_post_recv_wrs(n); + // No data received? Put new receive Work Requests to Receive Queue + // Get Memory Region + p = sample_pool(smps[0]); + mr = (struct ibv_mr **)((char *)(p)+p->buffer_off-8); - ib->conn.used_recv_wrs = 0; + // Increase refcnt of sample + sample_get(smps[0]); + + // Prepare receive Scatter/Gather element + sge.addr = (uint64_t)&smps[0]->data; + sge.length = SAMPLE_DATA_LEN(DEFAULT_SAMPLELEN); + sge.lkey = (*mr)->lkey; + + // Prepare a receive Work Request + wr.wr_id = (uintptr_t)smps[0]; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + + // Post Work Request + ret = ibv_post_recv(ib->ctx.id->qp, &wr, &bad_wr); + + ib->conn.available_recv_wrs++; } return ret; @@ -765,8 +776,8 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) struct infiniband *ib = (struct infiniband *) n->_vd; struct ibv_send_wr wr[cnt], *bad_wr = NULL; struct ibv_sge sge[cnt]; - struct pool *p; struct ibv_mr ** mr; + struct pool *p; int ret; memset(&wr, 0, sizeof(wr)); @@ -784,12 +795,12 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) sample_get(smps[i]); //Set Scatter/Gather element to data of sample - sge[i].addr = (uint64_t)&smps[i]->data->f; + sge[i].addr = (uint64_t)&smps[i]->data; sge[i].length = smps[i]->length*sizeof(double); sge[i].lkey = (*mr)->lkey; // Set Send Work Request - wr[i].wr_id = (uint64_t)&smps[i]; //This way the sample can be release in WC + wr[i].wr_id = (uintptr_t)smps[i]; //This way the sample can be release in WC wr[i].sg_list = &sge[i]; wr[i].num_sge = 1; diff --git a/src/pipe.c b/src/pipe.c index 2b0260398..f1267e876 100644 --- a/src/pipe.c +++ b/src/pipe.c @@ -204,8 +204,8 @@ static void * recv_loop(void *ctx) ready = sample_alloc_many(&recvv.pool, smps, node->in.vectorize); if (ready < 0) error("Failed to allocate %u samples from receive pool.", node->in.vectorize); - else if (ready < node->in.vectorize) - warn("Receive pool underrun"); +// else if (ready < node->in.vectorize) +// warn("Receive pool underrun"); recv = node_read(node, smps, ready); if (recv < 0) From 45b121d884dafc594b7a1c22c1b57b8a74786541 Mon Sep 17 00:00:00 2001 From: Steffen Vogel Date: Mon, 2 Jul 2018 16:05:05 +0200 Subject: [PATCH 22/35] infiniband: do not build memory allocator if node is not acticated --- lib/Makefile.villas.inc | 2 +- lib/nodes/Makefile.inc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Makefile.villas.inc b/lib/Makefile.villas.inc index 963d98511..48819cb1d 100644 --- a/lib/Makefile.villas.inc +++ 
b/lib/Makefile.villas.inc @@ -26,7 +26,7 @@ LIB = $(BUILDDIR)/$(LIB_NAME).so.$(LIB_ABI_VERSION) # Object files for libvillas LIB_SRCS += $(addprefix lib/kernel/, kernel.c rt.c) \ - $(addprefix lib/memory/, heap.c hugepage.c ib.c managed.c) \ + $(addprefix lib/memory/, heap.c hugepage.c managed.c) \ $(addprefix lib/, sample.c path.c node.c hook.c log.c log_config.c \ utils.c super_node.c hist.c timing.c pool.c list.c queue.c \ queue_signalled.c memory.c advio.c plugin.c node_type.c stats.c \ diff --git a/lib/nodes/Makefile.inc b/lib/nodes/Makefile.inc index 47a329242..f6423c129 100644 --- a/lib/nodes/Makefile.inc +++ b/lib/nodes/Makefile.inc @@ -159,6 +159,7 @@ endif # Enable Infiniband support ifeq ($(WITH_NODE_INFINIBAND),1) LIB_SRCS += lib/nodes/infiniband.c + LIB_SRCS += lib/memory/ib.c LIB_NODES += infiniband LIB_LDLIBS += -libverbs LIB_LDLIBS += -lrdmacm From da8124e472d353b05e8b383b3243bf5cfadc7cde Mon Sep 17 00:00:00 2001 From: Steffen Vogel Date: Mon, 2 Jul 2018 19:00:55 +0200 Subject: [PATCH 23/35] memory: use hash_table to store allocation metadata --- include/villas/hash_table.h | 10 +++ include/villas/memory.h | 42 +++++++---- include/villas/memory_type.h | 9 ++- include/villas/pool.h | 2 - include/villas/queue.h | 1 - lib/Makefile.villas-ext.inc | 2 +- lib/memory.c | 47 +++++++++--- lib/memory/heap.c | 31 +++++--- lib/memory/hugepage.c | 41 +++++++---- lib/memory/ib.c | 36 +++++---- lib/memory/managed.c | 138 +++++++++++++++++------------------ lib/pool.c | 3 +- lib/queue.c | 8 +- lib/shmem.c | 4 +- tests/unit/memory.c | 24 +++--- 15 files changed, 241 insertions(+), 157 deletions(-) diff --git a/include/villas/hash_table.h b/include/villas/hash_table.h index d8a79f873..87726184f 100644 --- a/include/villas/hash_table.h +++ b/include/villas/hash_table.h @@ -20,11 +20,17 @@ * along with this program. If not, see . *********************************************************************************/ +#pragma once + #include #include #include +#ifdef __cplusplus +extern "C" { +#endif + struct hash_table_entry { void *key; void *data; @@ -73,3 +79,7 @@ void * hash_table_lookup(struct hash_table *ht, void *key); /** Dump the contents of the hash table in a human readable format to stdout. */ void hash_table_dump(struct hash_table *ht); + +#ifdef __cplusplus +} +#endif diff --git a/include/villas/memory.h b/include/villas/memory.h index 95ee21972..616377d20 100644 --- a/include/villas/memory.h +++ b/include/villas/memory.h @@ -25,6 +25,7 @@ #include #include +#include #include @@ -35,26 +36,35 @@ extern "C" { /* Forward declarations */ struct node; -enum memblock_flags { - MEMBLOCK_USED = 1, -}; - /** Descriptor of a memory block. Associated block always starts at - * &m + sizeof(struct memblock). 
*/ -struct memblock { - struct memblock *prev; - struct memblock *next; - size_t len; /** #include #include +#include #include +static struct hash_table allocations = { .state = STATE_DESTROYED }; + int memory_init(int hugepages) { + int ret; + + if (allocations.state == STATE_DESTROYED) { + ret = hash_table_init(&allocations, 100); + if (ret) + return ret; + } + #ifdef __linux__ int ret, pagecnt, pagesz; struct rlimit l; @@ -71,25 +82,39 @@ int memory_init(int hugepages) void * memory_alloc(struct memory_type *m, size_t len) { - void *ptr = m->alloc(m, len, sizeof(void *)); - - debug(LOG_MEM | 5, "Allocated %#zx bytes of %s memory: %p", len, m->name, ptr); - - return ptr; + return memory_alloc_aligned(m, len, sizeof(void *)); } void * memory_alloc_aligned(struct memory_type *m, size_t len, size_t alignment) { - void *ptr = m->alloc(m, len, alignment); + struct memory_allocation *ma = m->alloc(m, len, alignment); - debug(LOG_MEM | 5, "Allocated %#zx bytes of %#zx-byte-aligned %s memory: %p", len, alignment, m->name, ptr); + hash_table_insert(&allocations, ma->address, ma); - return ptr; + debug(LOG_MEM | 5, "Allocated %#zx bytes of %#zx-byte-aligned %s memory: %p", ma->length, ma->alignment, ma->type->name, ma->address); + + return ma->address; } -int memory_free(struct memory_type *m, void *ptr, size_t len) +int memory_free(void *ptr) { - debug(LOG_MEM | 5, "Releasing %#zx bytes of %s memory", len, m->name); + int ret; - return m->free(m, ptr, len); + /* Find corresponding memory allocation entry */ + struct memory_allocation *ma = (struct memory_allocation *) hash_table_lookup(&allocations, ptr); + if (!ma) + return -1; + + debug(LOG_MEM | 5, "Releasing %#zx bytes of %s memory", ma->length, ma->type->name); + + ret = ma->type->free(ma->type, ma); + if (ret) + return ret; + + /* Remove allocation entry */ + ret = hash_table_delete(&allocations, ma->address); + if (ret) + return ret; + + return 0; } diff --git a/lib/memory/heap.c b/lib/memory/heap.c index 7a70abf63..1a2c61552 100644 --- a/lib/memory/heap.c +++ b/lib/memory/heap.c @@ -22,24 +22,37 @@ #include -#include +#include +#include -static void * memory_heap_alloc(struct memory_type *m, size_t len, size_t alignment) +static struct memory_allocation * memory_heap_alloc(struct memory_type *m, size_t len, size_t alignment) { - void *ptr; int ret; - if (alignment < sizeof(void *)) - alignment = sizeof(void *); + struct memory_allocation *ma = alloc(sizeof(struct memory_allocation)); + if (!ma) + return NULL; - ret = posix_memalign(&ptr, alignment, len); + ma->alignment = alignment; + ma->type = m; + ma->length = len; - return ret ? 
NULL : ptr; + if (ma->alignment < sizeof(void *)) + ma->alignment = sizeof(void *); + + ret = posix_memalign(&ma->address, ma->alignment, ma->length); + if (ret) { + free(ma); + return ret; + } + + return ma; } -int memory_heap_free(struct memory_type *m, void *ptr, size_t len) +static int memory_heap_free(struct memory_type *m, struct memory_allocation *ma) { - free(ptr); + free(ma->address); + free(ma); return 0; } diff --git a/lib/memory/hugepage.c b/lib/memory/hugepage.c index 83abbcf15..6bf17719c 100644 --- a/lib/memory/hugepage.c +++ b/lib/memory/hugepage.c @@ -38,15 +38,14 @@ #endif #include -#include +#include #include -#define HUGEPAGESIZE (1 << 21) /* 2 MiB */ +#define HUGEPAGESIZE (1 << 22) /* 2 MiB */ /** Allocate memory backed by hugepages with malloc() like interface */ -static void * memory_hugepage_alloc(struct memory_type *m, size_t len, size_t alignment) +static struct memory_allocation * memory_hugepage_alloc(struct memory_type *m, size_t len, size_t alignment) { - void *ret; int prot = PROT_READ | PROT_WRITE; int flags = MAP_PRIVATE | MAP_ANONYMOUS; @@ -59,22 +58,38 @@ static void * memory_hugepage_alloc(struct memory_type *m, size_t len, size_t al flags |= MAP_LOCKED; #endif - ret = mmap(NULL, len, prot, flags, -1, 0); - if (ret == MAP_FAILED) + struct memory_allocation *ma = alloc(sizeof(struct memory_allocation)); + if (!ma) return NULL; - return ret; -} - -static int memory_hugepage_free(struct memory_type *m, void *ptr, size_t len) -{ /** We must make sure that len is a multiple of the hugepage size * * See: https://lkml.org/lkml/2014/10/22/925 */ - len = ALIGN(len, HUGEPAGESIZE); + ma->length = ALIGN(len, HUGEPAGESIZE); + ma->alignment = alignment; + ma->type = m; - return munmap(ptr, len); + ma->address = mmap(NULL, len, prot, flags, -1, 0); + if (ma->address == MAP_FAILED) { + free(ma); + return NULL; + } + + return ma; +} + +static int memory_hugepage_free(struct memory_type *m, struct memory_allocation *ma) +{ + int ret; + + ret = munmap(ma->address, ma->length); + if (ret) + return ret; + + free(ma); + + return 0; } struct memory_type memory_hugepage = { diff --git a/lib/memory/ib.c b/lib/memory/ib.c index 38955e4d6..fe62e223f 100644 --- a/lib/memory/ib.c +++ b/lib/memory/ib.c @@ -36,33 +36,43 @@ struct ibv_mr * memory_ib_mr(void *ptr) return (mr - 1); } -void * memory_ib_alloc(struct memory_type *m, size_t len, size_t alignment) +static struct memory_allocation * memory_ib_alloc(struct memory_type *m, size_t len, size_t alignment) { struct memory_ib *mi = (struct memory_ib *) m->_vd; - struct ibv_mr **mr = memory_alloc_aligned(mi->parent, len + sizeof(struct ibv_mr *), alignment); - char *ptr = (char *) (mr + 1); + struct memory_allocation *ma = alloc(sizeof(struct memory_allocation)); + if (!ma) + return NULL; - *mr = ibv_reg_mr(mi->pd, ptr, len, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); - if(!*mr) { - free(ptr); + ma->type = m; + ma->length = len; + ma->alignment = alignment; + + ma->parent = mi->parent->alloc(mi->parent, len + sizeof(struct ibv_mr *), alignment); + ma->address = ma->parent->address; + + ma->ib.mr = ibv_reg_mr(mi->pd, ma->address, ma->length, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if(!ma->ib.mr) { + mi->parent->free(mi->parent, ma->parent); + free(ma); return NULL; } - return ptr; + return ma; } -int memory_ib_free(struct memory_type *m, void *ptr, size_t len) +static int memory_ib_free(struct memory_type *m, struct memory_allocation *ma) { + int ret; struct memory_ib *mi = (struct memory_ib *) m->_vd; - struct 
ibv_mr *mr = memory_ib_mr(ptr); - ibv_dereg_mr(mr); + ibv_dereg_mr(ma->ib.mr); - ptr -= sizeof(struct ibv_mr *); - len += sizeof(struct ibv_mr *); + ret = mi->parent->free(mi->parent, ma->parent); + if (ret) + return ret; - memory_free(mi->parent, ptr, len); + free(ma); return 0; } diff --git a/lib/memory/managed.c b/lib/memory/managed.c index 05ea6f439..ab8bc4b59 100644 --- a/lib/memory/managed.c +++ b/lib/memory/managed.c @@ -34,18 +34,18 @@ #include #include -void* memory_managed_alloc(struct memory_type *m, size_t len, size_t alignment) +static struct memory_allocation * memory_managed_alloc(struct memory_type *m, size_t len, size_t alignment) { /* Simple first-fit allocation */ - struct memblock *first = (struct memblock *) m->_vd; - struct memblock *block; + struct memory_block *first = (struct memory_block *) m->_vd; + struct memory_block *block; for (block = first; block != NULL; block = block->next) { - if (block->flags & MEMBLOCK_USED) + if (block->used) continue; - char* cptr = (char *) block + sizeof(struct memblock); - size_t avail = block->len; + char* cptr = (char *) block + sizeof(struct memory_block); + size_t avail = block->length; uintptr_t uptr = (uintptr_t) cptr; /* Check alignment first; leave a gap at start of block to assure @@ -62,47 +62,59 @@ void* memory_managed_alloc(struct memory_type *m, size_t len, size_t alignment) } if (avail >= len) { - if (gap > sizeof(struct memblock)) { + if (gap > sizeof(struct memory_block)) { /* The alignment gap is big enough to fit another block. * The original block descriptor is already at the correct * position, so we just change its len and create a new block * descriptor for the actual block we're handling. */ - block->len = gap - sizeof(struct memblock); - struct memblock *newblock = (struct memblock *) (cptr - sizeof(struct memblock)); + block->length = gap - sizeof(struct memory_block); + struct memory_block *newblock = (struct memory_block *) (cptr - sizeof(struct memory_block)); newblock->prev = block; newblock->next = block->next; block->next = newblock; - newblock->flags = 0; - newblock->len = len; + newblock->used = false; + newblock->length = len; block = newblock; } else { /* The gap is too small to fit another block descriptor, so we * must account for the gap length in the block length. */ - block->len = len + gap; + block->length = len + gap; } - if (avail > len + sizeof(struct memblock)) { + if (avail > len + sizeof(struct memory_block)) { /* Imperfect fit, so create another block for the remaining part */ - struct memblock *newblock = (struct memblock *) (cptr + len); + struct memory_block *newblock = (struct memory_block *) (cptr + len); newblock->prev = block; newblock->next = block->next; block->next = newblock; + if (newblock->next) newblock->next->prev = newblock; - newblock->flags = 0; - newblock->len = avail - len - sizeof(struct memblock); + + newblock->used = false; + newblock->length = avail - len - sizeof(struct memory_block); } else { /* If this block was larger than the requested length, but only - * by less than sizeof(struct memblock), we may have wasted - * memory by previous assignments to block->len. */ - block->len = avail; + * by less than sizeof(struct memory_block), we may have wasted + * memory by previous assignments to block->length. 
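With the allocation metadata now kept in the central hash table (see the lib/memory.c changes above), callers no longer pass the type and length when releasing memory; memory_free() looks the descriptor up by address. A minimal sketch of the resulting calling convention, assuming memory_init() has been run during start-up so the allocations table exists, with an illustrative function name and buffer size:

#include <villas/memory.h>

static int example_tracked_allocation(void)
{
	/* memory_alloc() records a struct memory_allocation for the returned
	 * address in the global hash table */
	void *buf = memory_alloc(&memory_type_heap, 4096);
	if (!buf)
		return -1;

	/* ... use the buffer ... */

	/* Type and length are recovered internally, so only the pointer is
	 * needed to release it */
	return memory_free(buf);
}
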
*/ + block->length = avail; } - block->flags |= MEMBLOCK_USED; + block->used = true; - return (void *) cptr; + struct memory_allocation *ma = alloc(sizeof(struct memory_allocation)); + if (!ma) + return NULL; + + ma->address = cptr; + ma->type = m; + ma->alignment = alignment; + ma->length = len; + ma->managed.block = block; + + return ma; } } @@ -110,60 +122,48 @@ void* memory_managed_alloc(struct memory_type *m, size_t len, size_t alignment) return NULL; } -int memory_managed_free(struct memory_type *m, void *ptr, size_t len) +static int memory_managed_free(struct memory_type *m, struct memory_allocation *ma) { - struct memblock *first = (struct memblock *) m->_vd; - struct memblock *block; - char *cptr = ptr; + struct memory_block *block = ma->managed.block; - for (block = first; block != NULL; block = block->next) { - if (!(block->flags & MEMBLOCK_USED)) - continue; - - /* Since we may waste some memory at the start of a block to ensure - * alignment, ptr may not actually be the start of the block */ - if ((char *) block + sizeof(struct memblock) <= cptr && - cptr < (char *) block + sizeof(struct memblock) + block->len) { - /* Try to merge it with neighbouring free blocks */ - if (block->prev && !(block->prev->flags & MEMBLOCK_USED) && - block->next && !(block->next->flags & MEMBLOCK_USED)) { - /* Special case first: both previous and next block are unused */ - block->prev->len += block->len + block->next->len + 2 * sizeof(struct memblock); - block->prev->next = block->next->next; - if (block->next->next) - block->next->next->prev = block->prev; - } - else if (block->prev && !(block->prev->flags & MEMBLOCK_USED)) { - block->prev->len += block->len + sizeof(struct memblock); - block->prev->next = block->next; - if (block->next) - block->next->prev = block->prev; - } - else if (block->next && !(block->next->flags & MEMBLOCK_USED)) { - block->len += block->next->len + sizeof(struct memblock); - block->next = block->next->next; - if (block->next) - block->next->prev = block; - } - else { - /* no neighbouring free block, so just mark it as free */ - block->flags &= ~MEMBLOCK_USED; - } - - return 0; - } + /* Try to merge it with neighbouring free blocks */ + if (block->prev && !block->prev->used && + block->next && !block->next->used) { + /* Special case first: both previous and next block are unused */ + block->prev->length += block->length + block->next->length + 2 * sizeof(struct memory_block); + block->prev->next = block->next->next; + if (block->next->next) + block->next->next->prev = block->prev; + } + else if (block->prev && !block->prev->used) { + block->prev->length += block->length + sizeof(struct memory_block); + block->prev->next = block->next; + if (block->next) + block->next->prev = block->prev; + } + else if (block->next && !block->next->used) { + block->length += block->next->length + sizeof(struct memory_block); + block->next = block->next->next; + if (block->next) + block->next->prev = block; + } + else { + /* no neighbouring free block, so just mark it as free */ + block->used = false; } - return -1; + free(ma); + + return 0; } struct memory_type * memory_managed(void *ptr, size_t len) { struct memory_type *mt = ptr; - struct memblock *mb; + struct memory_block *mb; char *cptr = ptr; - if (len < sizeof(struct memory_type) + sizeof(struct memblock)) { + if (len < sizeof(struct memory_type) + sizeof(struct memory_block)) { info("memory_managed: passed region too small"); return NULL; } @@ -177,15 +177,15 @@ struct memory_type * memory_managed(void *ptr, size_t len) cptr += 
ALIGN(sizeof(struct memory_type), sizeof(void *)); - /* Initialize first free memblock */ - mb = (struct memblock *) cptr; + /* Initialize first free memory block */ + mb = (struct memory_block *) cptr; mb->prev = NULL; mb->next = NULL; - mb->flags = 0; + mb->used = false; - cptr += ALIGN(sizeof(struct memblock), sizeof(void *)); + cptr += ALIGN(sizeof(struct memory_block), sizeof(void *)); - mb->len = len - (cptr - (char *) ptr); + mb->length = len - (cptr - (char *) ptr); mt->_vd = (void *) mb; diff --git a/lib/pool.c b/lib/pool.c index 78b47f2c8..1a7ce726f 100644 --- a/lib/pool.c +++ b/lib/pool.c @@ -35,7 +35,6 @@ int pool_init(struct pool *p, size_t cnt, size_t blocksz, struct memory_type *m) p->alignment = kernel_get_cacheline_size(); p->blocksz = p->alignment * CEIL(blocksz, p->alignment); p->len = cnt * p->blocksz; - p->mem = m; void *buffer = memory_alloc_aligned(m, p->len, p->alignment); if (!buffer) @@ -66,7 +65,7 @@ int pool_destroy(struct pool *p) queue_destroy(&p->queue); void *buffer = (char*) p + p->buffer_off; - ret = memory_free(p->mem, buffer, p->len); + ret = memory_free(buffer); if (ret == 0) p->state = STATE_DESTROYED; diff --git a/lib/queue.c b/lib/queue.c index 438a2c523..0b020b192 100644 --- a/lib/queue.c +++ b/lib/queue.c @@ -36,7 +36,7 @@ #include /** Initialize MPMC queue */ -int queue_init(struct queue *q, size_t size, struct memory_type *mem) +int queue_init(struct queue *q, size_t size, struct memory_type *m) { assert(q->state == STATE_DESTROYED); @@ -47,9 +47,8 @@ int queue_init(struct queue *q, size_t size, struct memory_type *mem) warn("A queue size was changed from %lu to %lu", old_size, size); } - q->mem = mem; q->buffer_mask = size - 1; - struct queue_cell *buffer = (struct queue_cell *) memory_alloc(q->mem, sizeof(struct queue_cell) * size); + struct queue_cell *buffer = (struct queue_cell *) memory_alloc(m, sizeof(struct queue_cell) * size); if (!buffer) return -2; @@ -74,8 +73,7 @@ int queue_destroy(struct queue *q) if (q->state == STATE_DESTROYED) return 0; - ret = memory_free(q->mem, buffer, (q->buffer_mask + 1) * sizeof(struct queue_cell)); - + ret = memory_free(buffer); if (ret == 0) q->state = STATE_DESTROYED; diff --git a/lib/shmem.c b/lib/shmem.c index 65042a155..294e7ae67 100644 --- a/lib/shmem.c +++ b/lib/shmem.c @@ -44,7 +44,7 @@ size_t shmem_total_size(int queuelen, int samplelen) /* the size of the pool */ + queuelen * kernel_get_cacheline_size() * CEIL(SAMPLE_LEN(samplelen), kernel_get_cacheline_size()) /* a memblock for each allocation (1 shmem_shared, 2 queues, 1 pool) */ - + 4 * sizeof(struct memblock) + + 4 * sizeof(struct memory_block) /* and some extra buffer for alignment */ + 1024; } @@ -144,7 +144,7 @@ retry: fd = shm_open(wname, O_RDWR|O_CREAT|O_EXCL, 0600); if (base == MAP_FAILED) return -10; - cptr = (char *) base + sizeof(struct memory_type) + sizeof(struct memblock); + cptr = (char *) base + sizeof(struct memory_type) + sizeof(struct memory_block); shared = (struct shmem_shared *) cptr; shm->read.base = base; shm->read.name = rname; diff --git a/tests/unit/memory.c b/tests/unit/memory.c index 38ccbe51a..821795afd 100644 --- a/tests/unit/memory.c +++ b/tests/unit/memory.c @@ -28,7 +28,7 @@ #include #include -#define HUGEPAGESIZE (1<<22) +#define HUGEPAGESIZE (1 << 22) TheoryDataPoints(memory, aligned) = { DataPoints(size_t, 1, 32, 55, 1 << 10, 1 << 20), @@ -40,6 +40,9 @@ Theory((size_t len, size_t align, struct memory_type *m), memory, aligned) { int ret; void *ptr; + ret = memory_init(100); + cr_assert(!ret); + ptr = 
memory_alloc_aligned(m, len, align); cr_assert_neq(ptr, NULL, "Failed to allocate memory"); @@ -49,7 +52,7 @@ Theory((size_t len, size_t align, struct memory_type *m), memory, aligned) { cr_assert(IS_ALIGNED(ptr, HUGEPAGESIZE)); } - ret = memory_free(m, ptr, len); + ret = memory_free(ptr); cr_assert_eq(ret, 0, "Failed to release memory: ret=%d, ptr=%p, len=%zu: %s", ret, ptr, len, strerror(errno)); } @@ -62,7 +65,10 @@ Test(memory, manager) { struct memory_type *m; total_size = 1 << 10; - max_block = total_size - sizeof(struct memory_type) - sizeof(struct memblock); + max_block = total_size - sizeof(struct memory_type) - sizeof(struct memory_block); + + ret = memory_init(0); + cr_assert(!ret); p = memory_alloc(&memory_type_heap, total_size); cr_assert_not_null(p); @@ -76,7 +82,7 @@ Test(memory, manager) { p2 = memory_alloc(m, 32); cr_assert_not_null(p2); - ret = memory_free(m, p1, 16); + ret = memory_free(p1); cr_assert(ret == 0); p1 = memory_alloc_aligned(m, 128, 128); @@ -87,21 +93,21 @@ Test(memory, manager) { cr_assert(p3); cr_assert(IS_ALIGNED(p3, 256)); - ret = memory_free(m, p2, 32); + ret = memory_free(p2); cr_assert(ret == 0); - ret = memory_free(m, p1, 128); + ret = memory_free(p1); cr_assert(ret == 0); - ret = memory_free(m, p3, 128); + ret = memory_free(p3); cr_assert(ret == 0); p1 = memory_alloc(m, max_block); cr_assert_not_null(p1); - ret = memory_free(m, p1, max_block); + ret = memory_free(p1); cr_assert(ret == 0); - ret = memory_free(&memory_type_heap, p, total_size); + ret = memory_free(p); cr_assert(ret == 0); } From 6c7c7b7ed422871e0f814f81d1102601356c0901 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Tue, 3 Jul 2018 11:13:59 +0200 Subject: [PATCH 24/35] Fixed indentations --- include/villas/nodes/infiniband.h | 96 +++++++++++++++---------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/include/villas/nodes/infiniband.h b/include/villas/nodes/infiniband.h index 7fa5f2285..07e11cd30 100644 --- a/include/villas/nodes/infiniband.h +++ b/include/villas/nodes/infiniband.h @@ -42,73 +42,73 @@ typedef void* (*ib_poll_function)(void*); /* Enums */ enum poll_mode_e { - EVENT, - BUSY + EVENT, + BUSY }; struct r_addr_key_s { - uint64_t remote_addr; - uint32_t rkey; + uint64_t remote_addr; + uint32_t rkey; }; struct infiniband { + /* IBV/RDMA CM structs */ + struct context_s { + struct rdma_cm_id *listen_id; + struct rdma_cm_id *id; + struct rdma_event_channel *ec; - /* IBV/RDMA CM structs */ - struct context_s { - struct rdma_cm_id *listen_id; - struct rdma_cm_id *id; - struct rdma_event_channel *ec; + struct ibv_pd *pd; + struct ibv_cq *recv_cq; + struct ibv_cq *send_cq; + struct ibv_comp_channel *comp_channel; + } ctx; + /* Work Completion related */ + struct poll_s { + enum poll_mode_e poll_mode; - struct ibv_pd *pd; - struct ibv_cq *cq; - struct ibv_comp_channel *comp_channel; - } ctx; - /* Work Completion related */ - struct poll_s { - enum poll_mode_e poll_mode; + /* On completion function */ + ib_on_completion on_compl; - /* On completion function */ - ib_on_completion on_compl; + /* Busy poll or Event function */ + ib_poll_function poll_func; - /* Busy poll or Event function */ - ib_poll_function poll_func; + /* Poll thread */ + pthread_t cq_poller_thread; - /* Poll thread */ - pthread_t cq_poller_thread; + int stopThread; + } poll; - int stopThread; - } poll; + /* Connection specific variables */ + struct connection_s { + struct addrinfo *src_addr; + struct addrinfo *dst_addr; + enum rdma_port_space port_space; + int timeout; - /* Connection 
specific variables */ - struct connection_s { - struct addrinfo *src_addr; - struct addrinfo *dst_addr; - enum rdma_port_space port_space; - int timeout; + struct r_addr_key_s *r_addr_key; - struct r_addr_key_s *r_addr_key; + pthread_t stop_thread; + int rdma_disconnect_called; - pthread_t stop_thread; - int rdma_disconnect_called; + int available_recv_wrs; + } conn; - int available_recv_wrs; - } conn; + /* Memory related variables */ + struct ib_memory { + struct pool p_recv; + struct pool p_send; - /* Memory related variables */ - struct ib_memory { - struct pool p_recv; - struct pool p_send; + struct ibv_mr *mr_recv; + struct ibv_mr *mr_send; + } mem; - struct ibv_mr *mr_recv; - struct ibv_mr *mr_send; - } mem; + /* Queue Pair init variables */ + struct ibv_qp_init_attr qp_init; - /* Queue Pair init variables */ - struct ibv_qp_init_attr qp_init; - - /* Misc settings */ - int is_source; - int cq_size; + /* Misc settings */ + int is_source; +int cq_size; }; /** @see node_type::reverse */ From 217ce45ff1d4431c7c15bda5a759f1a8ffef175f Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Tue, 3 Jul 2018 17:37:17 +0200 Subject: [PATCH 25/35] Made pools larger to be compatible with IB node --- src/pipe.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/pipe.c b/src/pipe.c index f1267e876..272f4cf1b 100644 --- a/src/pipe.c +++ b/src/pipe.c @@ -132,7 +132,7 @@ static void * send_loop(void *ctx) struct sample *smps[node->out.vectorize]; /* Initialize memory */ - ret = pool_init(&sendd.pool, LOG2_CEIL(node->out.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_hugepage)); + ret = pool_init(&sendd.pool, MAX(16384, 2*LOG2_CEIL(node->out.vectorize)), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_heap)); if (ret < 0) error("Failed to allocate memory for receive pool."); @@ -196,7 +196,8 @@ static void * recv_loop(void *ctx) struct sample *smps[node->in.vectorize]; /* Initialize memory */ - ret = pool_init(&recvv.pool, LOG2_CEIL(node->in.vectorize), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_hugepage)); + ret = pool_init(&recvv.pool, MAX(16*8192, 2*LOG2_CEIL(node->in.vectorize)), SAMPLE_LEN(DEFAULT_SAMPLELEN), node_memtype(node, &memtype_heap)); + if (ret < 0) error("Failed to allocate memory for receive pool."); @@ -204,8 +205,8 @@ static void * recv_loop(void *ctx) ready = sample_alloc_many(&recvv.pool, smps, node->in.vectorize); if (ready < 0) error("Failed to allocate %u samples from receive pool.", node->in.vectorize); -// else if (ready < node->in.vectorize) -// warn("Receive pool underrun"); + else if (ready < node->in.vectorize) + warn("Receive pool underrun"); recv = node_read(node, smps, ready); if (recv < 0) From a8250094570d75f1c6a37d9d89ad5f949376eccd Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Tue, 3 Jul 2018 17:39:06 +0200 Subject: [PATCH 26/35] Removed bugs with regard to the zero-copy implementation. This commit is able to send -r 5000 -l 50000 with villas pipe. 
With a higher -r, the source throws errors that there aren't any receive WRs, so probably pipe doesn't fill up the WRs fast enough --- lib/nodes/infiniband.c | 396 +++++++++++++++++++++-------------------- 1 file changed, 204 insertions(+), 192 deletions(-) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index f12bfa215..b24db19ac 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -35,57 +35,57 @@ int ib_cleanup(struct node *n) { - struct infiniband *ib = (struct infiniband *) n->_vd; - info("Starting to clean up"); + struct infiniband *ib = (struct infiniband *) n->_vd; + info("Starting to clean up"); - // Destroy QP - rdma_destroy_qp(ib->ctx.id); - info("Destroyed QP"); + // Destroy QP + rdma_destroy_qp(ib->ctx.id); + info("Destroyed QP"); - // Deregister memory regions - ibv_dereg_mr(ib->mem.mr_recv); - if(ib->is_source) - ibv_dereg_mr(ib->mem.mr_send); - info("Deregistered memory regions"); + // Deregister memory regions + ibv_dereg_mr(ib->mem.mr_recv); + if(ib->is_source) + ibv_dereg_mr(ib->mem.mr_send); + info("Deregistered memory regions"); - // Destroy pools - pool_destroy(&ib->mem.p_recv); - pool_destroy(&ib->mem.p_send); - info("Destroyed memory pools"); + // Destroy pools + pool_destroy(&ib->mem.p_recv); + pool_destroy(&ib->mem.p_send); + info("Destroyed memory pools"); - // Destroy RDMA CM ID - rdma_destroy_id(ib->ctx.id); - info("Destroyed rdma_cm_id"); + // Destroy RDMA CM ID + rdma_destroy_id(ib->ctx.id); + info("Destroyed rdma_cm_id"); - // Destroy event channel - rdma_destroy_event_channel(ib->ctx.ec); - info("Destroyed event channel"); + // Destroy event channel + rdma_destroy_event_channel(ib->ctx.ec); + info("Destroyed event channel"); - return 0; + return 0; } int ib_post_recv_wrs(struct node *n) { - struct infiniband *ib = (struct infiniband *) n->_vd; - struct ibv_recv_wr wr, *bad_wr = NULL; - int ret; - struct ibv_sge sge; + struct infiniband *ib = (struct infiniband *) n->_vd; + struct ibv_recv_wr wr, *bad_wr = NULL; + int ret; + struct ibv_sge sge; - // Prepare receive Scatter/Gather element - sge.addr = (uintptr_t)pool_get(&ib->mem.p_recv); - sge.length = ib->mem.p_recv.blocksz; - sge.lkey = ib->mem.mr_recv->lkey; + // Prepare receive Scatter/Gather element + sge.addr = (uintptr_t)pool_get(&ib->mem.p_recv); + sge.length = ib->mem.p_recv.blocksz; + sge.lkey = ib->mem.mr_recv->lkey; - // Prepare a receive Work Request - wr.wr_id = (uintptr_t)sge.addr; - wr.next = NULL; - wr.sg_list = &sge; - wr.num_sge = 1; + // Prepare a receive Work Request + wr.wr_id = (uintptr_t)sge.addr; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; - // Post Work Request - ret = ibv_post_recv(ib->ctx.id->qp, &wr, &bad_wr); + // Post Work Request + ret = ibv_post_recv(ib->ctx.id->qp, &wr, &bad_wr); - return ret; + return ret; } void ib_completion_target(struct node* n, struct ibv_wc* wc, int* size){} @@ -108,33 +108,30 @@ void ib_completion_source(struct node* n, struct ibv_wc* wc, int* size) warn("Work Completion status was not IBV_WC_SUCCES in node %s: %i", node_name(n), wc[i].status); } - else - { - // Release sample - sample_put((struct sample*)(wc[i].wr_id)); - } + + sample_put((struct sample*)(wc[i].wr_id)); } } void * ib_event_thread(void *n) { - struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; - struct ibv_wc wc[ib->cq_size]; - int size; + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct ibv_wc wc[ib->cq_size]; + int size; - while(1) - { - // Function blocks, until an event occurs - 
ibv_get_cq_event(ib->ctx.comp_channel, &ib->ctx.cq, NULL); + while(1) + { + // Function blocks, until an event occurs + ibv_get_cq_event(ib->ctx.comp_channel, &ib->ctx.send_cq, NULL); - // Poll as long as WCs are available - while((size = ibv_poll_cq(ib->ctx.cq, ib->cq_size, wc))) - ib->poll.on_compl(n, wc, &size); + // Poll as long as WCs are available + while((size = ibv_poll_cq(ib->ctx.send_cq, ib->cq_size, wc))) + ib->poll.on_compl(n, wc, &size); - // Request a new event in the CQ and acknowledge event - ibv_req_notify_cq(ib->ctx.cq, 0); - ibv_ack_cq_events(ib->ctx.cq, 1); - } + // Request a new event in the CQ and acknowledge event + ibv_req_notify_cq(ib->ctx.send_cq, 0); + ibv_ack_cq_events(ib->ctx.send_cq, 1); + } } void * ib_busy_poll_thread(void *n) @@ -146,7 +143,7 @@ void * ib_busy_poll_thread(void *n) while(1) { // Poll as long as WCs are available - while((size = ibv_poll_cq(ib->ctx.cq, ib->cq_size, wc))) + while((size = ibv_poll_cq(ib->ctx.send_cq, ib->cq_size, wc))) ib->poll.on_compl(n, wc, &size); if(ib->poll.stopThread) @@ -168,19 +165,27 @@ static void ib_init_wc_poll(struct node *n) error("Could not create completion channel in node %s.", node_name(n)); } - // Create completion queue and bind to channel (or NULL) - ib->ctx.cq = ibv_create_cq(ib->ctx.id->verbs, + // Create completion queues and bind to channel (or NULL) + ib->ctx.recv_cq = ibv_create_cq(ib->ctx.id->verbs, + ib->cq_size, + NULL, + NULL, + 0); + if(!ib->ctx.recv_cq) + error("Could not create receive completion queue in node %s.", node_name(n)); + + ib->ctx.send_cq = ibv_create_cq(ib->ctx.id->verbs, ib->cq_size, NULL, ib->ctx.comp_channel, 0); - if(!ib->ctx.cq) - error("Could not create completion queue in node %s.", node_name(n)); + if(!ib->ctx.send_cq) + error("Could not create send completion queue in node %s.", node_name(n)); if(ib->poll.poll_mode == EVENT) { // Request notifications from completion queue - ret = ibv_req_notify_cq(ib->ctx.cq, 0); + ret = ibv_req_notify_cq(ib->ctx.send_cq, 0); if(ret) error("Failed to request notifiy CQ in node %s: %s", node_name(n), gai_strerror(ret)); @@ -214,8 +219,8 @@ static void ib_build_ibv(struct node *n) ib_init_wc_poll(n); // Prepare remaining Queue Pair (QP) attributes - ib->qp_init.send_cq = ib->ctx.cq; - ib->qp_init.recv_cq = ib->ctx.cq; + ib->qp_init.send_cq = ib->ctx.send_cq; + ib->qp_init.recv_cq = ib->ctx.recv_cq; //ToDo: Set maximum inline data @@ -306,64 +311,64 @@ static void ib_build_ibv(struct node *n) static int ib_addr_resolved(struct node *n) { - struct infiniband *ib = (struct infiniband *) n->_vd; - int ret; + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; - info("Successfully resolved address."); + info("Successfully resolved address."); - // Build all components from IB Verbs - ib_build_ibv(n); + // Build all components from IB Verbs + ib_build_ibv(n); - // Resolve address - ret = rdma_resolve_route(ib->ctx.id, ib->conn.timeout); - if(ret) - error("Failed to resolve route in node %s.", node_name(n)); + // Resolve address + ret = rdma_resolve_route(ib->ctx.id, ib->conn.timeout); + if(ret) + error("Failed to resolve route in node %s.", node_name(n)); - return 0; + return 0; } static int ib_route_resolved(struct node *n) { - struct infiniband *ib = (struct infiniband *) n->_vd; - int ret; + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; - info("Successfully resolved route."); + info("Successfully resolved route."); - //ToDo: Post receive WRs + //ToDo: Post receive WRs - struct rdma_conn_param cm_params; - 
memset(&cm_params, 0, sizeof(cm_params)); + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); - // Send connection request - ret = rdma_connect(ib->ctx.id, &cm_params); - if(ret) - error("Failed to connect in node %s.", node_name(n)); + // Send connection request + ret = rdma_connect(ib->ctx.id, &cm_params); + if(ret) + error("Failed to connect in node %s.", node_name(n)); - info("Called rdma_connect."); + info("Called rdma_connect."); - return 0; + return 0; } static int ib_connect_request(struct node *n, struct rdma_cm_id *id) { - struct infiniband *ib = (struct infiniband *) n->_vd; - int ret; - info("Received a connection request!"); + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; + info("Received a connection request!"); - ib->ctx.id = id; - ib_build_ibv(n); + ib->ctx.id = id; + ib_build_ibv(n); - struct rdma_conn_param cm_params; - memset(&cm_params, 0, sizeof(cm_params)); + struct rdma_conn_param cm_params; + memset(&cm_params, 0, sizeof(cm_params)); - // Accept connection request - ret = rdma_accept(ib->ctx.id, &cm_params); - if(ret) - error("Failed to connect in node %s.", node_name(n)); + // Accept connection request + ret = rdma_accept(ib->ctx.id, &cm_params); + if(ret) + error("Failed to connect in node %s.", node_name(n)); - info("Successfully accepted connection request."); + info("Successfully accepted connection request."); - return 0; + return 0; } static int ib_event(struct node *n, struct rdma_cm_event *event) @@ -463,6 +468,8 @@ int ib_parse(struct node *n, json_t *cfg) // Set timeout ib->conn.timeout = timeout; + n->in.vectorize = 256; + // Translate poll mode if(strcmp(poll_mode, "EVENT") == 0) { @@ -660,62 +667,97 @@ int ib_start(struct node *n) int ib_stop(struct node *n) { - struct infiniband *ib = (struct infiniband *) n->_vd; - struct rdma_cm_event *event = NULL; - int ret; + struct infiniband *ib = (struct infiniband *) n->_vd; + struct rdma_cm_event *event = NULL; + int ret; - // Call RDMA disconnect function - // Will flush all outstanding WRs to the Completion Queue and - // will call RDMA_CM_EVENT_DISCONNECTED if that is done. - ret = rdma_disconnect(ib->ctx.id); - if(ret) - { - error("Error while calling rdma_disconnect in node %s: %s", - node_name(n), gai_strerror(ret)); - } - info("Called rdma_disconnect."); + // Call RDMA disconnect function + // Will flush all outstanding WRs to the Completion Queue and + // will call RDMA_CM_EVENT_DISCONNECTED if that is done. 
+ ret = rdma_disconnect(ib->ctx.id); + if(ret) + { + error("Error while calling rdma_disconnect in node %s: %s", + node_name(n), gai_strerror(ret)); + } + info("Called rdma_disconnect."); - // If disconnected event already occured, directly call cleanup function - if(ib->conn.rdma_disconnect_called) - { - ib_cleanup(n); - } - // Else, wait for event to occur - else - { - ib->conn.rdma_disconnect_called = 1; - rdma_get_cm_event(ib->ctx.ec, &event); + // If disconnected event already occured, directly call cleanup function + if(ib->conn.rdma_disconnect_called) + { + ib_cleanup(n); + } + // Else, wait for event to occur + else + { + ib->conn.rdma_disconnect_called = 1; + rdma_get_cm_event(ib->ctx.ec, &event); - rdma_ack_cm_event(event); + rdma_ack_cm_event(event); - ib_event(n, event); - } + ib_event(n, event); + } - return 0; + return 0; } int ib_init(struct super_node *n) { - - return 0; + return 0; } int ib_deinit() { - return 0; + return 0; } int ib_read(struct node *n, struct sample *smps[], unsigned cnt) { struct infiniband *ib = (struct infiniband *) n->_vd; struct ibv_wc wc[n->in.vectorize]; - struct ibv_recv_wr wr, *bad_wr = NULL; - struct ibv_sge sge; + struct ibv_recv_wr wr[cnt], *bad_wr = NULL; + struct ibv_sge sge[cnt]; struct ibv_mr ** mr; struct pool *p; int ret; - ret = ibv_poll_cq(ib->ctx.cq, n->in.vectorize, wc); + if(ib->conn.available_recv_wrs <= ib->qp_init.cap.max_recv_wr && cnt==n->in.vectorize) + { + // Get Memory Region + p = sample_pool(smps[0]); + mr = (struct ibv_mr **)((char *)(p)+p->buffer_off-8); + + for(int i=0; idata; + sge[i].length = SAMPLE_DATA_LEN(DEFAULT_SAMPLELEN); + sge[i].lkey = (*mr)->lkey; + + // Prepare a receive Work Request + wr[i].wr_id = (uintptr_t)smps[i]; + wr[i].next = &wr[i+1]; + wr[i].sg_list = &sge[i]; + wr[i].num_sge = 1; + + ib->conn.available_recv_wrs++; + + if(ib->conn.available_recv_wrs == ib->qp_init.cap.max_recv_wr || i==(cnt-1)) + { + wr[i].next = NULL; + break; + } + } + // Post list of Work Requests + ret = ibv_post_recv(ib->ctx.id->qp, &wr[0], &bad_wr); + + } + + // Poll Completion Queue + ret = ibv_poll_cq(ib->ctx.recv_cq, n->in.vectorize, wc); if(ret) { @@ -723,50 +765,20 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) for(int i=0; ilength = wc[i].byte_len/sizeof(double); - smps[i]->capacity = DEFAULT_SAMPLELEN; - - //Release sample - sample_put(smps[i]); - } + else + ret = 0; + + //Release sample + sample_put((struct sample*)(wc[i].wr_id)); } } - else if(ib->conn.available_recv_wrs < ib->qp_init.cap.max_recv_wr) - { - // No data received? 
Put new receive Work Requests to Receive Queue - // Get Memory Region - p = sample_pool(smps[0]); - mr = (struct ibv_mr **)((char *)(p)+p->buffer_off-8); - - // Increase refcnt of sample - sample_get(smps[0]); - - // Prepare receive Scatter/Gather element - sge.addr = (uint64_t)&smps[0]->data; - sge.length = SAMPLE_DATA_LEN(DEFAULT_SAMPLELEN); - sge.lkey = (*mr)->lkey; - - // Prepare a receive Work Request - wr.wr_id = (uintptr_t)smps[0]; - wr.next = NULL; - wr.sg_list = &sge; - wr.num_sge = 1; - - // Post Work Request - ret = ibv_post_recv(ib->ctx.id->qp, &wr, &bad_wr); - - ib->conn.available_recv_wrs++; - } return ret; } @@ -783,7 +795,7 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) memset(&wr, 0, sizeof(wr)); //ToDo: Place this into configuration and create checks if settings are valid - int send_inline = 0; + int send_inline = 1; // Get Memory Region p = sample_pool(smps[0]); @@ -818,10 +830,10 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) ret = ibv_post_send(ib->ctx.id->qp, wr, &bad_wr); if(ret) { - error("Failed to send message in node %s: %s", - node_name(n), gai_strerror(ret)); + error("Failed to send message in node %s: %s", + node_name(n), gai_strerror(ret)); - return -ret; + return -ret; } return cnt; @@ -829,29 +841,29 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) int ib_fd(struct node *n) { - return 0; + return 0; } static struct plugin p = { - .name = "infiniband", - .description = "Infiniband", - .type = PLUGIN_TYPE_NODE, - .node = { - .vectorize = 0, - .size = sizeof(struct infiniband), - .reverse = ib_reverse, - .parse = ib_parse, - .print = ib_print, - .start = ib_start, - .destroy = ib_destroy, - .stop = ib_stop, - .init = ib_init, - .deinit = ib_deinit, - .read = ib_read, - .write = ib_write, - .fd = ib_fd, - .memtype = ib_memtype - } + .name = "infiniband", + .description = "Infiniband", + .type = PLUGIN_TYPE_NODE, + .node = { + .vectorize = 0, + .size = sizeof(struct infiniband), + .reverse = ib_reverse, + .parse = ib_parse, + .print = ib_print, + .start = ib_start, + .destroy = ib_destroy, + .stop = ib_stop, + .init = ib_init, + .deinit = ib_deinit, + .read = ib_read, + .write = ib_write, + .fd = ib_fd, + .memtype = ib_memtype + } }; REGISTER_PLUGIN(&p) From 0470ebda333fbeb77b218f1ec5974a5031674ea9 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Tue, 3 Jul 2018 18:01:49 +0200 Subject: [PATCH 27/35] Fixed indentations --- lib/nodes/infiniband.c | 706 ++++++++++++++++++++--------------------- 1 file changed, 342 insertions(+), 364 deletions(-) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index b24db19ac..b1a6fe528 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -153,160 +153,145 @@ void * ib_busy_poll_thread(void *n) static void ib_init_wc_poll(struct node *n) { - int ret; - struct infiniband *ib = (struct infiniband *) n->_vd; - ib->ctx.comp_channel = NULL; + int ret; + struct infiniband *ib = (struct infiniband *) n->_vd; + ib->ctx.comp_channel = NULL; - if(ib->poll.poll_mode == EVENT) - { - // Create completion channel - ib->ctx.comp_channel = ibv_create_comp_channel(ib->ctx.id->verbs); - if(!ib->ctx.comp_channel) - error("Could not create completion channel in node %s.", node_name(n)); - } + if(ib->poll.poll_mode == EVENT) + { + // Create completion channel + ib->ctx.comp_channel = ibv_create_comp_channel(ib->ctx.id->verbs); + if(!ib->ctx.comp_channel) + error("Could not create completion channel in node %s.", node_name(n)); + } - // Create completion queues 
and bind to channel (or NULL) - ib->ctx.recv_cq = ibv_create_cq(ib->ctx.id->verbs, - ib->cq_size, - NULL, - NULL, - 0); - if(!ib->ctx.recv_cq) - error("Could not create receive completion queue in node %s.", node_name(n)); + // Create completion queues and bind to channel (or NULL) + ib->ctx.recv_cq = ibv_create_cq(ib->ctx.id->verbs, + ib->cq_size, + NULL, + NULL, + 0); + if(!ib->ctx.recv_cq) + error("Could not create receive completion queue in node %s.", node_name(n)); - ib->ctx.send_cq = ibv_create_cq(ib->ctx.id->verbs, - ib->cq_size, - NULL, - ib->ctx.comp_channel, - 0); - if(!ib->ctx.send_cq) - error("Could not create send completion queue in node %s.", node_name(n)); + ib->ctx.send_cq = ibv_create_cq(ib->ctx.id->verbs, + ib->cq_size, + NULL, + ib->ctx.comp_channel, + 0); + if(!ib->ctx.send_cq) + error("Could not create send completion queue in node %s.", node_name(n)); - if(ib->poll.poll_mode == EVENT) - { - // Request notifications from completion queue - ret = ibv_req_notify_cq(ib->ctx.send_cq, 0); - if(ret) - error("Failed to request notifiy CQ in node %s: %s", - node_name(n), gai_strerror(ret)); - } + if(ib->poll.poll_mode == EVENT) + { + // Request notifications from completion queue + ret = ibv_req_notify_cq(ib->ctx.send_cq, 0); + if(ret) + error("Failed to request notifiy CQ in node %s: %s", + node_name(n), gai_strerror(ret)); + } - // Initialize polling pthread - //ToDo: Remove if(is_source) - if(ib->is_source) - { - ret = pthread_create(&ib->poll.cq_poller_thread, NULL, ib->poll.poll_func, n); - if(ret) - { - error("Failed to create poll thread of node %s: %s", - node_name(n), gai_strerror(ret)); - } - } + // Initialize polling pthread for source + if(ib->is_source) + { + ret = pthread_create(&ib->poll.cq_poller_thread, NULL, ib->poll.poll_func, n); + if(ret) + { + error("Failed to create poll thread of node %s: %s", + node_name(n), gai_strerror(ret)); + } + } } static void ib_build_ibv(struct node *n) { - struct infiniband *ib = (struct infiniband *) n->_vd; - int ret; + struct infiniband *ib = (struct infiniband *) n->_vd; + int ret; - //Allocate protection domain - ib->ctx.pd = ibv_alloc_pd(ib->ctx.id->verbs); - if(!ib->ctx.pd) - error("Could not allocate protection domain in node %s.", node_name(n)); - info("Allocated Protection Domain"); + //Allocate protection domain + ib->ctx.pd = ibv_alloc_pd(ib->ctx.id->verbs); + if(!ib->ctx.pd) + error("Could not allocate protection domain in node %s.", node_name(n)); + info("Allocated Protection Domain"); - // Initiate poll mode - ib_init_wc_poll(n); + // Initiate poll mode + ib_init_wc_poll(n); - // Prepare remaining Queue Pair (QP) attributes - ib->qp_init.send_cq = ib->ctx.send_cq; - ib->qp_init.recv_cq = ib->ctx.recv_cq; + // Prepare remaining Queue Pair (QP) attributes + ib->qp_init.send_cq = ib->ctx.send_cq; + ib->qp_init.recv_cq = ib->ctx.recv_cq; - //ToDo: Set maximum inline data + //ToDo: Set maximum inline data - // Create the actual QP - ret = rdma_create_qp(ib->ctx.id, ib->ctx.pd, &ib->qp_init); - if(ret) - error("Failed to create Queue Pair in node %s.", node_name(n)); + // Create the actual QP + ret = rdma_create_qp(ib->ctx.id, ib->ctx.pd, &ib->qp_init); + if(ret) + error("Failed to create Queue Pair in node %s.", node_name(n)); - info("Created Queue Pair with %i receive and %i send elements.", - ib->qp_init.cap.max_recv_wr, ib->qp_init.cap.max_send_wr); + info("Created Queue Pair with %i receive and %i send elements.", + ib->qp_init.cap.max_recv_wr, ib->qp_init.cap.max_send_wr); - // Allocate memory - 
ib->mem.p_recv.state = STATE_DESTROYED; - ib->mem.p_recv.queue.state = STATE_DESTROYED; + // Allocate memory + ib->mem.p_recv.state = STATE_DESTROYED; + ib->mem.p_recv.queue.state = STATE_DESTROYED; - // Set pool size to maximum size of Receive Queue - pool_init(&ib->mem.p_recv, - ib->qp_init.cap.max_recv_wr, - SAMPLE_DATA_LEN(DEFAULT_SAMPLELEN), - &memtype_heap); - if(ret) - { - error("Failed to init recv memory pool of node %s: %s", - node_name(n), gai_strerror(ret)); - } + // Set pool size to maximum size of Receive Queue + pool_init(&ib->mem.p_recv, + ib->qp_init.cap.max_recv_wr, + SAMPLE_DATA_LEN(DEFAULT_SAMPLELEN), + &memtype_heap); + if(ret) + { + error("Failed to init recv memory pool of node %s: %s", + node_name(n), gai_strerror(ret)); + } - //ToDo: initialize r_addr_key struct if mode is RDMA + //ToDo: initialize r_addr_key struct if mode is RDMA - // Register memory for IB Device. Not necessary if data is send - // exclusively inline - ib->mem.mr_recv = ibv_reg_mr( - ib->ctx.pd, - (char*)&ib->mem.p_recv+ib->mem.p_recv.buffer_off, - ib->mem.p_recv.len, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); - if(!ib->mem.mr_recv) { - error("Failed to register mr_recv with ibv_reg_mr of node %s.", - node_name(n)); - } - info("Allocated receive memory."); + // Register memory for IB Device. Not necessary if data is send + // exclusively inline + ib->mem.mr_recv = ibv_reg_mr( + ib->ctx.pd, + (char*)&ib->mem.p_recv+ib->mem.p_recv.buffer_off, + ib->mem.p_recv.len, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if(!ib->mem.mr_recv) { + error("Failed to register mr_recv with ibv_reg_mr of node %s.", + node_name(n)); + } + info("Allocated receive memory."); - if(ib->is_source) - { - ib->mem.p_send.state = STATE_DESTROYED; - ib->mem.p_send.queue.state = STATE_DESTROYED; + if(ib->is_source) + { + ib->mem.p_send.state = STATE_DESTROYED; + ib->mem.p_send.queue.state = STATE_DESTROYED; - // Set pool size to maximum size of Receive Queue - pool_init(&ib->mem.p_send, - ib->qp_init.cap.max_send_wr, - sizeof(double), - &memtype_heap); - if(ret) - { - error("Failed to init send memory of node %s: %s", - node_name(n), gai_strerror(ret)); - } + // Set pool size to maximum size of Receive Queue + pool_init(&ib->mem.p_send, + ib->qp_init.cap.max_send_wr, + sizeof(double), + &memtype_heap); + if(ret) + { + error("Failed to init send memory of node %s: %s", + node_name(n), gai_strerror(ret)); + } - //ToDo: initialize r_addr_key struct if mode is RDMA + //ToDo: initialize r_addr_key struct if mode is RDMA - // Register memory for IB Device. Not necessary if data is send - // exclusively inline - ib->mem.mr_send = ibv_reg_mr( - ib->ctx.pd, - (char*)&ib->mem.p_send+ib->mem.p_send.buffer_off, - ib->mem.p_send.len, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); - if(!ib->mem.mr_send) { - error("Failed to register mr_send with ibv_reg_mr of node %s.", - node_name(n)); - } - info("Allocated send memory."); - - } - - // Post Receive Work Requests to be able to receive data - // Fill complete Receive Queue during initialization - //for(int i=0; iqp_init.cap.max_recv_wr; i++) - //{ - // ret = ib_post_recv_wrs(n); - // if(ret) - // { - // error("Failed to post initial receive Work Requests of node %s.", - // node_name(n)); - // } - //} - //info("Filled the complete Receive Queue."); + // Register memory for IB Device. 
Not necessary if data is send + // exclusively inline + ib->mem.mr_send = ibv_reg_mr( + ib->ctx.pd, + (char*)&ib->mem.p_send+ib->mem.p_send.buffer_off, + ib->mem.p_send.len, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if(!ib->mem.mr_send) { + error("Failed to register mr_send with ibv_reg_mr of node %s.", + node_name(n)); + } + info("Allocated send memory."); + } } static int ib_addr_resolved(struct node *n) @@ -334,8 +319,6 @@ static int ib_route_resolved(struct node *n) info("Successfully resolved route."); - //ToDo: Post receive WRs - struct rdma_conn_param cm_params; memset(&cm_params, 0, sizeof(cm_params)); @@ -373,296 +356,291 @@ static int ib_connect_request(struct node *n, struct rdma_cm_id *id) static int ib_event(struct node *n, struct rdma_cm_event *event) { - int ret = 0; + int ret = 0; - switch(event->event) - { - case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = ib_addr_resolved(n); - break; - case RDMA_CM_EVENT_ADDR_ERROR: - error("Address resolution (rdma_resolve_addr) failed!"); - case RDMA_CM_EVENT_ROUTE_RESOLVED: - ret = ib_route_resolved(n); - break; - case RDMA_CM_EVENT_ROUTE_ERROR: - error("Route resolution (rdma_resovle_route) failed!"); - case RDMA_CM_EVENT_CONNECT_REQUEST: - ret = ib_connect_request(n, event->id); - break; - case RDMA_CM_EVENT_CONNECT_ERROR: - error("An error has occurred trying to establish a connection!"); - case RDMA_CM_EVENT_REJECTED: - error("Connection request or response was rejected by the remote end point!"); - case RDMA_CM_EVENT_ESTABLISHED: - info("Connection established!"); - ret = 1; - break; - case RDMA_CM_EVENT_DISCONNECTED: - ret = ib_cleanup(n); - break; - default: - error("Unknown event occurred: %u", - event->event); - } + switch(event->event) + { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ret = ib_addr_resolved(n); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + error("Address resolution (rdma_resolve_addr) failed!"); + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ret = ib_route_resolved(n); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + error("Route resolution (rdma_resovle_route) failed!"); + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = ib_connect_request(n, event->id); + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + error("An error has occurred trying to establish a connection!"); + case RDMA_CM_EVENT_REJECTED: + error("Connection request or response was rejected by the remote end point!"); + case RDMA_CM_EVENT_ESTABLISHED: + info("Connection established!"); + ret = 1; + break; + case RDMA_CM_EVENT_DISCONNECTED: + ret = ib_cleanup(n); + break; + default: + error("Unknown event occurred: %u", + event->event); + } - return ret; + return ret; } int ib_reverse(struct node *n) { - return 0; + return 0; } int ib_parse(struct node *n, json_t *cfg) { - struct infiniband *ib = (struct infiniband *) n->_vd; + struct infiniband *ib = (struct infiniband *) n->_vd; - int ret; - char *local = NULL; - char *remote = NULL; - const char *port_space = "RDMA_PC_TCP"; - const char *poll_mode = "BUSY"; - const char *qp_type = "IBV_QPT_RC"; - int timeout = 1000; - int cq_size = 128; - int max_send_wr = 128; - int max_recv_wr = 128; + int ret; + char *local = NULL; + char *remote = NULL; + const char *port_space = "RDMA_PC_TCP"; + const char *poll_mode = "BUSY"; + const char *qp_type = "IBV_QPT_RC"; + int timeout = 1000; + int cq_size = 128; + int max_send_wr = 128; + int max_recv_wr = 128; - json_error_t err; - ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i, \ - s?: s, s?: i, s?: s, s?: i, s?: i}", - "remote", &remote, - "local", &local, - 
"rdma_port_space", &port_space, - "resolution_timeout", &timeout, - "poll_mode", &poll_mode, - "cq_size", &cq_size, - "qp_type", &qp_type, - "max_send_wr", &max_send_wr, - "max_recv_wr", &max_recv_wr - ); - if(ret) - jerror(&err, "Failed to parse configuration of node %s", node_name(n)); + json_error_t err; + ret = json_unpack_ex(cfg, &err, 0, "{ s?: s, s?: s, s?: s, s?: i, \ + s?: s, s?: i, s?: s, s?: i, s?: i}", + "remote", &remote, + "local", &local, + "rdma_port_space", &port_space, + "resolution_timeout", &timeout, + "poll_mode", &poll_mode, + "cq_size", &cq_size, + "qp_type", &qp_type, + "max_send_wr", &max_send_wr, + "max_recv_wr", &max_recv_wr + ); + if(ret) + jerror(&err, "Failed to parse configuration of node %s", node_name(n)); - // Translate IP:PORT to a struct addrinfo - char* ip_adr = strtok(local, ":"); - char* port = strtok(NULL, ":"); - ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.src_addr); - if(ret) - { - error("Failed to resolve local address '%s' of node %s: %s", - local, node_name(n), gai_strerror(ret)); - } + // Translate IP:PORT to a struct addrinfo + char* ip_adr = strtok(local, ":"); + char* port = strtok(NULL, ":"); + ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.src_addr); + if(ret) + { + error("Failed to resolve local address '%s' of node %s: %s", + local, node_name(n), gai_strerror(ret)); + } - // Translate port space - if(strcmp(port_space, "RDMA_PS_IPOIB") == 0) ib->conn.port_space = RDMA_PS_IPOIB; - else if(strcmp(port_space, "RDMA_PS_TCP") == 0) ib->conn.port_space = RDMA_PS_TCP; - else if(strcmp(port_space, "RDMA_PS_UDP") == 0) ib->conn.port_space = RDMA_PS_UDP; - else if(strcmp(port_space, "RDMA_PS_IB") == 0) ib->conn.port_space = RDMA_PS_IB; - else { - error("Failed to translate rdma_port_space in node %s. %s is not a valid \ - port space supported by rdma_cma.h!", node_name(n), port_space); - } + // Translate port space + if(strcmp(port_space, "RDMA_PS_IPOIB") == 0) ib->conn.port_space = RDMA_PS_IPOIB; + else if(strcmp(port_space, "RDMA_PS_TCP") == 0) ib->conn.port_space = RDMA_PS_TCP; + else if(strcmp(port_space, "RDMA_PS_UDP") == 0) ib->conn.port_space = RDMA_PS_UDP; + else if(strcmp(port_space, "RDMA_PS_IB") == 0) ib->conn.port_space = RDMA_PS_IB; + else { + error("Failed to translate rdma_port_space in node %s. %s is not a valid \ + port space supported by rdma_cma.h!", node_name(n), port_space); + } - // Set timeout - ib->conn.timeout = timeout; + // Set timeout + ib->conn.timeout = timeout; - n->in.vectorize = 256; + n->in.vectorize = 256; - // Translate poll mode - if(strcmp(poll_mode, "EVENT") == 0) - { - ib->poll.poll_mode = EVENT; - ib->poll.poll_func = ib_event_thread; + // Translate poll mode + if(strcmp(poll_mode, "EVENT") == 0) + { + ib->poll.poll_mode = EVENT; + ib->poll.poll_func = ib_event_thread; + } + else if(strcmp(poll_mode, "BUSY") == 0) + { + ib->poll.poll_mode = BUSY; + ib->poll.poll_func = ib_busy_poll_thread; + } + else + { + error("Failed to translate poll_mode in node %s. %s is not a valid \ + poll mode!", node_name(n), poll_mode); + } - } - else if(strcmp(poll_mode, "BUSY") == 0) - { - ib->poll.poll_mode = BUSY; - ib->poll.poll_func = ib_busy_poll_thread; - } - else - { - error("Failed to translate poll_mode in node %s. 
%s is not a valid \ - poll mode!", node_name(n), poll_mode); - } + // Set completion queue size + ib->cq_size = cq_size; - // Set completion queue size - ib->cq_size = cq_size; + // Translate QP type + if(strcmp(qp_type, "IBV_QPT_RC") == 0) ib->qp_init.qp_type = IBV_QPT_RC; + else if(strcmp(qp_type, "IBV_QPT_UC") == 0) ib->qp_init.qp_type = IBV_QPT_UC; + else if(strcmp(qp_type, "IBV_QPT_UD") == 0) ib->qp_init.qp_type = IBV_QPT_UD; + else { + error("Failed to translate qp_type in node %s. %s is not a valid \ + qp_type!", node_name(n), qp_type); + } - // Translate QP type - if(strcmp(qp_type, "IBV_QPT_RC") == 0) ib->qp_init.qp_type = IBV_QPT_RC; - else if(strcmp(qp_type, "IBV_QPT_UC") == 0) ib->qp_init.qp_type = IBV_QPT_UC; - else if(strcmp(qp_type, "IBV_QPT_UD") == 0) ib->qp_init.qp_type = IBV_QPT_UD; - else { - error("Failed to translate qp_type in node %s. %s is not a valid \ - qp_type!", node_name(n), qp_type); - } + // Set max. send and receive Work Requests + // First check if the set value is a power of 2, and warn the user if this is not the case + int max_send_pow = (int) pow(2, ceil(log2(max_send_wr))); + int max_recv_pow = (int) pow(2, ceil(log2(max_recv_wr))); - // Set max. send and receive Work Requests - // First check if the set value is a power of 2, and warn the user if this is not the case - int max_send_pow = (int) pow(2, ceil(log2(max_send_wr))); - int max_recv_pow = (int) pow(2, ceil(log2(max_recv_wr))); + if(max_send_wr != max_send_pow) + warn("Max. number of send WRs (%i) is not a power of 2! The HCA will change this to a power of 2: %i", + max_send_wr, max_send_pow); - if(max_send_wr != max_send_pow) - warn("Max. number of send WRs (%i) is not a power of 2! The HCA will change this to a power of 2: %i", - max_send_wr, max_send_pow); + if(max_recv_wr != max_recv_pow) + warn("Max. number of recv WRs (%i) is not a power of 2! The HCA will change this to a power of 2: %i", + max_recv_wr, max_recv_pow); - if(max_recv_wr != max_recv_pow) - warn("Max. number of recv WRs (%i) is not a power of 2! 
The HCA will change this to a power of 2: %i", - max_recv_wr, max_recv_pow); + ib->qp_init.cap.max_send_wr = max_send_wr; + ib->qp_init.cap.max_recv_wr = max_recv_wr; - ib->qp_init.cap.max_send_wr = max_send_wr; - ib->qp_init.cap.max_recv_wr = max_recv_wr; + // Set available receive Work Requests to 0 + ib->conn.available_recv_wrs = 0; - // Set available receive Work Requests to 0 - ib->conn.available_recv_wrs = 0; + // Set remaining QP attributes + ib->qp_init.cap.max_send_sge = 1; + ib->qp_init.cap.max_recv_sge = 1; - // Set remaining QP attributes - ib->qp_init.cap.max_send_sge = 1; - ib->qp_init.cap.max_recv_sge = 1; + //Check if node is a source and connect to target + if(remote) + { + ib->is_source = 1; - //Check if node is a source and connect to target - if(remote) - { - ib->is_source = 1; + // Translate address info + char* ip_adr = strtok(remote, ":"); + char* port = strtok(NULL, ":"); + ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.dst_addr); + if(ret) + { + error("Failed to resolve remote address '%s' of node %s: %s", + remote, node_name(n), gai_strerror(ret)); + } - // Translate address info - char* ip_adr = strtok(remote, ":"); - char* port = strtok(NULL, ":"); - ret = getaddrinfo(ip_adr, port, NULL, &ib->conn.dst_addr); - if(ret) - { - error("Failed to resolve remote address '%s' of node %s: %s", - remote, node_name(n), gai_strerror(ret)); - } + // Set correct Work Completion function + ib->poll.on_compl = ib_completion_source; + } + else + { + ib->is_source = 0; - // Set correct Work Completion function - ib->poll.on_compl = ib_completion_source; - } - else - { - ib->is_source = 0; + // Set correct Work Completion function + ib->poll.on_compl = ib_completion_target; + } - // Set correct Work Completion function - ib->poll.on_compl = ib_completion_target; - } - - return 0; + return 0; } char * ib_print(struct node *n) { - return 0; + return 0; } int ib_destroy(struct node *n) { - return 0; + return 0; } void * ib_disconnect_thread(void *n) { - struct node *node = (struct node *)n; - struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; - struct rdma_cm_event *event; + struct node *node = (struct node *)n; + struct infiniband *ib = (struct infiniband *)((struct node *)n)->_vd; + struct rdma_cm_event *event; - while(rdma_get_cm_event(ib->ctx.ec, &event) == 0) - { - if(event->event == RDMA_CM_EVENT_DISCONNECTED) - { - rdma_ack_cm_event(event); - ib->conn.rdma_disconnect_called = 1; + while(rdma_get_cm_event(ib->ctx.ec, &event) == 0) + { + if(event->event == RDMA_CM_EVENT_DISCONNECTED) + { + rdma_ack_cm_event(event); + ib->conn.rdma_disconnect_called = 1; - node_stop(node); - return NULL; - } - } - return NULL; + node_stop(node); + return NULL; + } + } + return NULL; } int ib_start(struct node *n) { - struct infiniband *ib = (struct infiniband *) n->_vd; - struct rdma_cm_event *event = NULL; - int ret; + struct infiniband *ib = (struct infiniband *) n->_vd; + struct rdma_cm_event *event = NULL; + int ret; - // Create event channel - ib->ctx.ec = rdma_create_event_channel(); - if(!ib->ctx.ec) { - error("Failed to create event channel in node %s!", - node_name(n)); - } + // Create event channel + ib->ctx.ec = rdma_create_event_channel(); + if(!ib->ctx.ec) + error("Failed to create event channel in node %s!", node_name(n)); - ret = rdma_create_id(ib->ctx.ec, &ib->ctx.id, NULL, ib->conn.port_space); - if(ret) - { - error("Failed to create rdma_cm_id of node %s: %s", - node_name(n), gai_strerror(ret)); - } - info("Succesfully created rdma_cm_id."); + ret = 
rdma_create_id(ib->ctx.ec, &ib->ctx.id, NULL, ib->conn.port_space); + if(ret) + { + error("Failed to create rdma_cm_id of node %s: %s", + node_name(n), gai_strerror(ret)); + } + info("Succesfully created rdma_cm_id."); - // Bind rdma_cm_id to the HCA - ret = rdma_bind_addr(ib->ctx.id, ib->conn.src_addr->ai_addr); - if(ret) - { - error("Failed to bind to local device of node %s: %s", - node_name(n), gai_strerror(ret)); - } - info("Bound rdma_cm_id to Infiniband device."); + // Bind rdma_cm_id to the HCA + ret = rdma_bind_addr(ib->ctx.id, ib->conn.src_addr->ai_addr); + if(ret) + { + error("Failed to bind to local device of node %s: %s", + node_name(n), gai_strerror(ret)); + } + info("Bound rdma_cm_id to Infiniband device."); - if(ib->is_source) - { - // Resolve address - ret = rdma_resolve_addr(ib->ctx.id, - NULL, - ib->conn.dst_addr->ai_addr, - ib->conn.timeout); - if(ret) - { - error("Failed to resolve remote address after %ims of node %s: %s", - ib->conn.timeout, node_name(n), gai_strerror(ret)); - } - } - else - { - // The ID will be overwritten for the target. If the event type is - // RDMA_CM_EVENT_CONNECT_REQUEST, >then this references a new id for - // that communication. - ib->ctx.listen_id = ib->ctx.id; + if(ib->is_source) + { + // Resolve address + ret = rdma_resolve_addr(ib->ctx.id, + NULL, + ib->conn.dst_addr->ai_addr, + ib->conn.timeout); + if(ret) + { + error("Failed to resolve remote address after %ims of node %s: %s", + ib->conn.timeout, node_name(n), gai_strerror(ret)); + } + } + else + { + // The ID will be overwritten for the target. If the event type is + // RDMA_CM_EVENT_CONNECT_REQUEST, >then this references a new id for + // that communication. + ib->ctx.listen_id = ib->ctx.id; - // Listen on rdma_cm_id for events - ret = rdma_listen(ib->ctx.listen_id, 10); - if(ret) - { - error("Failed to listen to rdma_cm_id on node %s", node_name(n)); - } - } + // Listen on rdma_cm_id for events + ret = rdma_listen(ib->ctx.listen_id, 10); + if(ret) + error("Failed to listen to rdma_cm_id on node %s", node_name(n)); + } - // Several events should occur on the event channel, to make - // sure the nodes are succesfully connected. - info("Starting to monitor events on rdma_cm_id."); + // Several events should occur on the event channel, to make + // sure the nodes are succesfully connected. 
+ info("Starting to monitor events on rdma_cm_id."); - while(rdma_get_cm_event(ib->ctx.ec, &event) == 0) - { - struct rdma_cm_event event_copy; - memcpy(&event_copy, event, sizeof(*event)); + while(rdma_get_cm_event(ib->ctx.ec, &event) == 0) + { + struct rdma_cm_event event_copy; + memcpy(&event_copy, event, sizeof(*event)); - rdma_ack_cm_event(event); + rdma_ack_cm_event(event); - if(ib_event(n, &event_copy)) - break; - } + if(ib_event(n, &event_copy)) + break; + } - ret = pthread_create(&ib->conn.stop_thread, NULL, ib_disconnect_thread, n); - if(ret) - { - error("Failed to create thread to monitor disconnects in node %s: %s", - node_name(n), gai_strerror(ret)); - } + ret = pthread_create(&ib->conn.stop_thread, NULL, ib_disconnect_thread, n); + if(ret) + { + error("Failed to create thread to monitor disconnects in node %s: %s", + node_name(n), gai_strerror(ret)); + } - return 0; + return 0; } int ib_stop(struct node *n) From 1c2c210f13cfd08560548adbe8e471755b4d0289 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Tue, 3 Jul 2018 18:28:21 +0200 Subject: [PATCH 28/35] Fixed two minor compilation errors --- lib/memory.c | 2 +- lib/memory/ib.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/memory.c b/lib/memory.c index 0eac3e437..4257d14aa 100644 --- a/lib/memory.c +++ b/lib/memory.c @@ -47,7 +47,7 @@ int memory_init(int hugepages) } #ifdef __linux__ - int ret, pagecnt, pagesz; + int pagecnt, pagesz; struct rlimit l; info("Initialize memory sub-system"); diff --git a/lib/memory/ib.c b/lib/memory/ib.c index fe62e223f..5679c6e6a 100644 --- a/lib/memory/ib.c +++ b/lib/memory/ib.c @@ -22,6 +22,7 @@ #include #include +#include #include struct memory_ib { From 5a6b9120be8ced2c52a90c26af17a35501f37115 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 4 Jul 2018 10:39:13 +0200 Subject: [PATCH 29/35] Fixed incompatible integer to pointer conversion returning 'int' from a function with result type 'struct memory_allocation *' --- lib/memory/heap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/memory/heap.c b/lib/memory/heap.c index 1a2c61552..7e054fd23 100644 --- a/lib/memory/heap.c +++ b/lib/memory/heap.c @@ -43,7 +43,7 @@ static struct memory_allocation * memory_heap_alloc(struct memory_type *m, size_ ret = posix_memalign(&ma->address, ma->alignment, ma->length); if (ret) { free(ma); - return ret; + return NULL; } return ma; From 070e2c2bde99b3cdba350689fdf08d8de47a54a3 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 4 Jul 2018 11:59:52 +0200 Subject: [PATCH 30/35] Quick work around. api init function uses memory_type_heap, so the hash table needs to be initialized. 
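The work-around points at an ordering constraint that the new memory sub-system introduces: allocations are now tracked in a hash table that memory_init() sets up, so memory_init() has to run before the first allocation from memory_type_heap. A minimal sketch of the required call order, assuming the villas/memory.h interface from the earlier patches (the function name, header path and sizes are illustrative only):

	#include <villas/memory.h>	/* assumed header path */

	/* Illustrative only: memory_init() must precede the first memory_alloc(),
	 * since the allocation is recorded in the hash table that memory_init()
	 * creates and later looked up again by memory_free(). Passing 0 keeps it
	 * heap-only, as in the api_init() work-around above. */
	int example_startup(void)
	{
		int ret;

		ret = memory_init(0);
		if (ret)
			return ret;

		void *buf = memory_alloc(&memory_type_heap, 1024);
		if (!buf)
			return -1;

		return memory_free(buf);
	}
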
This happens in memory_init() --- lib/api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/api.c b/lib/api.c index dbf31c23d..b98af2785 100644 --- a/lib/api.c +++ b/lib/api.c @@ -272,6 +272,7 @@ int api_init(struct api *a, struct super_node *sn) if (ret) return ret; + memory_init(0); ret = queue_signalled_init(&a->pending, 1024, &memory_type_heap, 0); if (ret) return ret; From 51519c06df4f1802412e60510349584e9b1369b7 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 4 Jul 2018 15:15:24 +0200 Subject: [PATCH 31/35] Implemented new memory system implementation in infiniband node --- include/villas/memory_ib.h | 40 -------------------------------------- lib/memory.c | 6 ++++++ lib/memory/ib.c | 17 +++++++++------- lib/nodes/infiniband.c | 22 ++++++++++----------- 4 files changed, 27 insertions(+), 58 deletions(-) delete mode 100644 include/villas/memory_ib.h diff --git a/include/villas/memory_ib.h b/include/villas/memory_ib.h deleted file mode 100644 index 5dfdf854e..000000000 --- a/include/villas/memory_ib.h +++ /dev/null @@ -1,40 +0,0 @@ -/** Memory allocators. - * - * @file - * @author Dennis Potter - * @copyright 2018, Institute for Automation of Complex Power Systems, EONERC - * @license GNU General Public License (version 3) - * - * VILLASnode - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- *********************************************************************************/ - -#include -#include - -struct memory_ib { - struct ibv_pd *pd; - struct memtype *parent; -}; - -struct ibv_mr* memory_ib_mr(void*); -void* memory_ib_alloc(struct memtype*, size_t, size_t); -int memory_ib_free(struct memtype*, void*, size_t); -struct memtype* ib_memtype(struct node*, struct memtype*); - - - - - diff --git a/lib/memory.c b/lib/memory.c index 4257d14aa..d60c8d698 100644 --- a/lib/memory.c +++ b/lib/memory.c @@ -118,3 +118,9 @@ int memory_free(void *ptr) return 0; } + +struct memory_allocation * memory_get_allocation(void *ptr) +{ + struct memory_allocation *ma = (struct memory_allocation *) hash_table_lookup(&allocations, ptr); + return ma; +} diff --git a/lib/memory/ib.c b/lib/memory/ib.c index 5679c6e6a..573487477 100644 --- a/lib/memory/ib.c +++ b/lib/memory/ib.c @@ -24,17 +24,20 @@ #include #include #include +#include -struct memory_ib { - struct ibv_pd *pd; - struct memory_type *parent; -}; -struct ibv_mr * memory_ib_mr(void *ptr) +struct ibv_mr * memory_ib_get_mr(struct sample *smps) { - struct ibv_mr *mr = (struct ibv_mr *) ptr; + struct memory_allocation *ma; + struct pool *p; + struct ibv_mr *mr; - return (mr - 1); + p = sample_pool(smps); + + ma = memory_get_allocation((char *)(p)+p->buffer_off); + mr = ma->ib.mr; + return mr; } static struct memory_allocation * memory_ib_alloc(struct memory_type *m, size_t len, size_t alignment) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index b5483c114..0cc37af2a 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -695,15 +696,15 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) struct ibv_wc wc[n->in.vectorize]; struct ibv_recv_wr wr[cnt], *bad_wr = NULL; struct ibv_sge sge[cnt]; - struct ibv_mr ** mr; - struct pool *p; + struct ibv_mr * mr; int ret; + + if(ib->conn.available_recv_wrs <= ib->qp_init.cap.max_recv_wr && cnt==n->in.vectorize) { // Get Memory Region - p = sample_pool(smps[0]); - mr = (struct ibv_mr **)((char *)(p)+p->buffer_off-8); + mr = memory_ib_get_mr(smps[0]); for(int i=0; idata; sge[i].length = SAMPLE_DATA_LEN(DEFAULT_SAMPLELEN); - sge[i].lkey = (*mr)->lkey; + sge[i].lkey = mr->lkey; // Prepare a receive Work Request wr[i].wr_id = (uintptr_t)smps[i]; @@ -744,7 +745,8 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) for(int i=0; i_vd; struct ibv_send_wr wr[cnt], *bad_wr = NULL; struct ibv_sge sge[cnt]; - struct ibv_mr ** mr; - struct pool *p; + struct ibv_mr * mr; int ret; memset(&wr, 0, sizeof(wr)); @@ -776,8 +777,7 @@ int ib_write(struct node *n, struct sample *smps[], unsigned cnt) int send_inline = 1; // Get Memory Region - p = sample_pool(smps[0]); - mr = (struct ibv_mr **)((char *)(p)+p->buffer_off-8); + mr = memory_ib_get_mr(smps[0]); for(int i=0; idata; sge[i].length = smps[i]->length*sizeof(double); - sge[i].lkey = (*mr)->lkey; + sge[i].lkey = mr->lkey; // Set Send Work Request wr[i].wr_id = (uintptr_t)smps[i]; //This way the sample can be release in WC From 8f52d167f5a4517c595c7e2564b006f307365e4b Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 4 Jul 2018 15:26:22 +0200 Subject: [PATCH 32/35] Fixed wrong directory in include of ib.h --- lib/memory/ib.c | 2 +- lib/nodes/infiniband.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/memory/ib.c b/lib/memory/ib.c index 573487477..3b89e5637 100644 --- a/lib/memory/ib.c +++ b/lib/memory/ib.c @@ -24,7 +24,7 @@ 
#include #include #include -#include +#include struct ibv_mr * memory_ib_get_mr(struct sample *smps) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index 0cc37af2a..cb63e01ee 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include From 86363f06da5f562c8a575098e48913a65ff83953 Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 4 Jul 2018 15:37:25 +0200 Subject: [PATCH 33/35] Added better handling for FLUSH_ERRs of receive work queue --- lib/nodes/infiniband.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/nodes/infiniband.c b/lib/nodes/infiniband.c index cb63e01ee..8e35952a9 100644 --- a/lib/nodes/infiniband.c +++ b/lib/nodes/infiniband.c @@ -744,10 +744,18 @@ int ib_read(struct node *n, struct sample *smps[], unsigned cnt) for(int i=0; ilength = wc[i].byte_len/sizeof(double); From 08682bec9ba9bd74fb1c6c117d530748ef355d5d Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 4 Jul 2018 16:34:08 +0200 Subject: [PATCH 34/35] Added LOG_IB --- include/villas/log.h | 11 ++++++----- lib/log.c | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/villas/log.h b/include/villas/log.h index 7188ef31e..cac1e5e98 100644 --- a/include/villas/log.h +++ b/include/villas/log.h @@ -57,7 +57,7 @@ extern "C" { enum log_facilities { LOG_POOL = (1L << 8), LOG_QUEUE = (1L << 9), - LOG_CONFIG = (1L << 10), + LOG_CONFIG = (1L << 10), LOG_HOOK = (1L << 11), LOG_PATH = (1L << 12), LOG_NODE = (1L << 13), @@ -73,16 +73,17 @@ enum log_facilities { LOG_ADVIO = (1L << 23), /* Node-types */ - LOG_SOCKET = (1L << 24), + LOG_SOCKET = (1L << 24), LOG_FILE = (1L << 25), LOG_FPGA = (1L << 26), LOG_NGSI = (1L << 27), - LOG_WEBSOCKET = (1L << 28), + LOG_WEBSOCKET = (1L << 28), LOG_OPAL = (1L << 30), - LOG_COMEDI = (1L << 31), + LOG_COMEDI = (1L << 31), + LOG_IB = (1L << 32), /* Classes */ - LOG_NODES = LOG_NODE | LOG_SOCKET | LOG_FILE | LOG_FPGA | LOG_NGSI | LOG_WEBSOCKET | LOG_OPAL, + LOG_NODES = LOG_NODE | LOG_SOCKET | LOG_FILE | LOG_FPGA | LOG_NGSI | LOG_WEBSOCKET | LOG_OPAL | LOG_IB, LOG_KERNEL = LOG_VFIO | LOG_PCI | LOG_TC | LOG_IF, LOG_ALL = ~0xFF }; diff --git a/lib/log.c b/lib/log.c index c2c328df6..32e8925d4 100644 --- a/lib/log.c +++ b/lib/log.c @@ -85,6 +85,7 @@ static const char *facilities_strs[] = { "ngsi", /* LOG_NGSI */ "websocket", /* LOG_WEBSOCKET */ "opal", /* LOG_OPAL */ + "ib", /* LOG_IB */ }; #ifdef __GNUC__ From 12d256b64a21ed05b4e28e0ed19c3ab3080edb7d Mon Sep 17 00:00:00 2001 From: Dennis Potter Date: Wed, 4 Jul 2018 16:34:38 +0200 Subject: [PATCH 35/35] Added function declaration --- include/villas/memory.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/villas/memory.h b/include/villas/memory.h index 616377d20..d1829f71a 100644 --- a/include/villas/memory.h +++ b/include/villas/memory.h @@ -81,6 +81,8 @@ void * memory_alloc_aligned(struct memory_type *m, size_t len, size_t alignment) int memory_free(void *ptr); +struct memory_allocation * memory_get_allocation(void *ptr); + #ifdef __cplusplus } #endif
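
Taken together, the last patches replace the old pointer arithmetic for locating a pool's memory region ((struct ibv_mr **)((char *)(p) + p->buffer_off - 8)) with an explicit lookup: memory_ib_alloc() stores the registered ibv_mr in the allocation descriptor, memory_get_allocation() retrieves that descriptor by address, and memory_ib_get_mr() returns the ibv_mr for a given sample. A minimal sketch of how a send path builds a scatter/gather element after this change, assuming the helper from patch 31/35 and treating the header paths and the helper name ib_fill_send_sge as placeholders:

	#include <stdint.h>
	#include <infiniband/verbs.h>
	#include <villas/memory/ib.h>	/* assumed header for memory_ib_get_mr() */
	#include <villas/sample.h>	/* assumed header for struct sample */

	/* Illustrative helper: fill one SGE for a sample that was allocated from
	 * an IB-registered pool. The lkey comes from the ibv_mr that
	 * memory_ib_alloc() registered and stored in ma->ib.mr. */
	static void ib_fill_send_sge(struct ibv_sge *sge, struct sample *smp)
	{
		struct ibv_mr *mr = memory_ib_get_mr(smp);

		sge->addr   = (uint64_t) &smp->data;
		sge->length = smp->length * sizeof(double);
		sge->lkey   = mr->lkey;
	}

Compared with hiding an ibv_mr pointer in front of the pool buffer, the per-allocation descriptor gives every memory type a place for its own metadata (ma->ib.mr, ma->managed.block, ma->parent), which is also what allows memory_free() to work without the caller passing the type and length.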