diff --git a/arch/aarch64/kernel/entry.S b/arch/aarch64/kernel/entry.S index 520516587..95ad16c4b 100644 --- a/arch/aarch64/kernel/entry.S +++ b/arch/aarch64/kernel/entry.S @@ -209,7 +209,16 @@ mmu_on: /* Test core ID */ mrs x0, mpidr_el1 + ldr x0, =cpu_online + ldr x0, [x0] + cmp x0, 0 + b.ne smp_start + bl hermit_main + bl halt + +smp: + bl smp_main /* halt */ halt: @@ -273,6 +282,11 @@ _setup_cpu: ret _setup_pgtable: + ldr x0, =cpu_online + ldr x0, [x0] + cmp x0, 0 + b.ne 4f + ldr x0, =kernel_end /* align to a 16KByte boundary */ add x0, x0, 0x10000 @@ -320,6 +334,7 @@ _setup_pgtable: cmp x7, 511*PAGE_SIZE b.lo 3b +4: ret //_calc_offset: diff --git a/arch/aarch64/kernel/tasks.c b/arch/aarch64/kernel/tasks.c index ee18675f7..6fe4d16d4 100644 --- a/arch/aarch64/kernel/tasks.c +++ b/arch/aarch64/kernel/tasks.c @@ -37,6 +37,8 @@ #define TSL_ALIGNMASK ((~0L) << TLS_ALIGNBITS) #define TLS_FLOOR(addr) ((((size_t)addr) + TLS_ALIGNSIZE - 1) & TSL_ALIGNMASK) +extern int smp_main(void); + /* * Note that linker symbols are not variables, they have no memory allocated for * maintaining a value, rather their address is their value. 
@@ -45,6 +47,7 @@ extern const void tls_start; extern const void tls_end; extern atomic_int32_t cpu_online; +extern atomic_int32_t current_boot_id; static char tls[16][DEFAULT_STACK_SIZE]; static int id = 0; @@ -167,6 +170,19 @@ int create_default_frame(task_t* task, entry_point_t ep, void* arg, uint32_t cor return 0; } +#if MAX_CORES > 1 +int smp_start(void) +{ + int32_t core_id = atomic_int32_read(&current_boot_id); + + LOG_INFO("Try to initialize processor (local id %d)\n", core_id); + + atomic_int32_inc(&cpu_online); + + return smp_main(); +} +#endif + int is_proxy(void) { return 0; diff --git a/arch/aarch64/kernel/timer.c b/arch/aarch64/kernel/timer.c index 1c39f510a..ed86c1691 100644 --- a/arch/aarch64/kernel/timer.c +++ b/arch/aarch64/kernel/timer.c @@ -175,6 +175,9 @@ int timer_wait(unsigned int ticks) int timer_init(void) { #ifdef DYNAMIC_TICKS + if (boot_tsc) + return 0; + boot_tsc = get_cntpct(); set_per_core(last_tsc, boot_tsc); #endif diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 8b18b205f..1a2febb84 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -3,12 +3,35 @@ project(hermit_tools) include(../cmake/HermitCore-Paths.cmake) +option(ENABLE_RDMA_MIGRATION "Migration support via RDMA" OFF) + + add_compile_options(-std=c99) -add_executable(proxy proxy.c utils.c uhyve.c uhyve-net.c uhyve-x86_64.c uhyve-aarch64.c) -target_compile_options(proxy PUBLIC -pthread) +list(APPEND LIBS "-pthread") +set(SRC proxy.c + utils.c + uhyve.c + uhyve-net.c + uhyve-migration.c + uhyve-x86_64.c + uhyve-aarch64.c +) + +### Optional migration via RDMA +if(ENABLE_RDMA_MIGRATION) + add_definitions(-D__RDMA_MIGRATION__) + list(APPEND LIBS "-libverbs") + set(SRC ${SRC} uhyve-migration-rdma.c) +else() + remove_definitions(-D__RDMA_MIGRATION__) +endif() + +add_executable(proxy ${SRC}) + +target_compile_options(proxy PUBLIC ${LIBS}) target_compile_options(proxy PUBLIC -DMAX_ARGC_ENVC=${MAX_ARGC_ENVC}) -target_link_libraries(proxy -pthread) 
+target_link_libraries(proxy ${LIBS}) install(TARGETS proxy DESTINATION bin) diff --git a/tools/uhyve-aarch64.c b/tools/uhyve-aarch64.c index 439d100ce..684cd8aa3 100644 --- a/tools/uhyve-aarch64.c +++ b/tools/uhyve-aarch64.c @@ -135,6 +135,17 @@ void print_registers(void) } } + +vcpu_state_t read_cpu_state() +{ + err(1, "Migration is currently not supported!"); +} + +void migration_handler(int signum) +{ + err(1, "Migration is currently not supported!"); +} + void timer_handler(int signum) { err(1, "Checkpointing is currently not supported!"); @@ -145,7 +156,13 @@ void restore_cpu_state(void) err(1, "Checkpointing is currently not supported!"); } -void save_cpu_state(void) +vcpu_state_t save_cpu_state(void) +{ + err(1, "Checkpointing is currently not supported!"); +} + + +void write_cpu_state(void) { err(1, "Checkpointing is currently not supported!"); } @@ -155,6 +172,16 @@ int load_checkpoint(uint8_t* mem, char* path) err(1, "Checkpointing is currently not supported!"); } +int load_migration_data(uint8_t* mem) +{ + err(1, "Checkpointing is currently not supported!"); +} + +void wait_for_incomming_migration(migration_metadata_t *metadata, uint16_t listen_portno) +{ + err(1, "Checkpointing is currently not supported!"); +} + void init_cpu_state(uint64_t elf_entry) { struct kvm_vcpu_init vcpu_init = { diff --git a/tools/uhyve-migration-rdma.c b/tools/uhyve-migration-rdma.c new file mode 100644 index 000000000..6bac043b5 --- /dev/null +++ b/tools/uhyve-migration-rdma.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2018, Simon Pickartz, RWTH Aachen University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "uhyve-migration.h" +#include "uhyve.h" + + +#ifdef __RDMA_MIGRATION__ + +#define IB_USED_PORT (1) +#define IB_CQ_ENTRIES (1) +#define IB_MAX_INLINE_DATA (0) +#define IB_MAX_DEST_RD_ATOMIC (1) +#define IB_MIN_RNR_TIMER (1) +#define IB_MAX_RECV_WR (1) +#define IB_MAX_SEND_SGE (1) +#define IB_MAX_RECV_SGE (1) + +typedef enum ib_wr_ids { + IB_WR_NO_ID = 0, + IB_WR_WRITE_PAGE_ID, + IB_WR_WRITE_LAST_PAGE_ID, + IB_WR_RECV_LAST_PAGE_ID +} ib_wr_ids_t; + +typedef struct qp_info { + uint32_t qpn; + uint16_t lid; + uint16_t psn; + uint32_t key; + uint64_t addr; +} qp_info_t; + +typedef struct com_hndl { + struct ibv_context *ctx; /* device context */ + struct ibv_device_attr dev_attr; /* device attributes */ + struct ibv_pd *pd; /* protection domain */ + struct ibv_mr *mr; /* memory region */ + struct ibv_cq *cq; /* completion queue */ + struct ibv_qp *qp; /* queue pair */ + struct ibv_comp_channel *comp_chan; /* completion event channel */ + qp_info_t loc_qp_info; + qp_info_t rem_qp_info; + uint8_t *buf; /* the communication buffer */ + uint32_t size; /* size of the buffer */ +} com_hndl_t; + + +static com_hndl_t com_hndl; +static struct ibv_send_wr *send_list = NULL; +static struct ibv_send_wr *send_list_last = NULL; +static size_t send_list_length = 0; + + +/** + * \brief Initializes the IB communication structures + * + * \param com_hndl the structure containing all communication relevant infos + * \param buf the buffer that should be registrered with the QP + * + * This function sets up the IB communication channel. It registers the 'buf' + * with a new protection domain. On its termination there is a QP in the INIT + * state ready to be connected with the remote side. 
+ */ +static void +init_com_hndl(void) +{ + /* the guest physical memory is the communication buffer */ + com_hndl.size = guest_size; + com_hndl.buf = guest_mem; + + struct ibv_device **device_list; + struct ibv_device *ib_pp_device; + int num_devices; + + /* determine first available device */ + if ((device_list = ibv_get_device_list(&num_devices)) == NULL) { + fprintf(stderr, + "ERROR: Could not determine available IB devices " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + if (num_devices != 0) { + ib_pp_device = device_list[0]; + } else { + fprintf(stderr, + "ERROR: Could not find any IB device. Abort!\n"); + exit(EXIT_FAILURE); + } + + /* open the device context and create protection domain */ + if ((com_hndl.ctx = ibv_open_device(ib_pp_device)) == NULL) { + fprintf(stderr, + "ERROR: Could not open the device context " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* query device capability (e.g., to determine 'max_qp_wr') */ + if (ibv_query_device(com_hndl.ctx, &com_hndl.dev_attr) < 0) { + fprintf(stderr, + "ERROR: Could not query device attributes " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* allocate protection domain */ + if ((com_hndl.pd = ibv_alloc_pd(com_hndl.ctx)) == NULL) { + fprintf(stderr, + "ERROR: Could not allocate protection domain " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* determine LID */ + struct ibv_port_attr port_attr; + if (ibv_query_port(com_hndl.ctx, IB_USED_PORT, &port_attr) < 0){ + fprintf(stderr, + "ERROR: Could not query port %u " + "- %d (%s). 
Abort!\n", + IB_USED_PORT, + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* register guest memory with the protection domain */ + if ((com_hndl.mr = ibv_reg_mr(com_hndl.pd, + com_hndl.buf, + com_hndl.size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE)) == NULL) { + fprintf(stderr, + "ERROR: Could not register the memory region " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* create completion event channel */ + if ((com_hndl.comp_chan = + ibv_create_comp_channel(com_hndl.ctx)) == NULL) { + fprintf(stderr, + "ERROR: Could not create the completion channel " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* create the completion queue */ + if ((com_hndl.cq = ibv_create_cq(com_hndl.ctx, + IB_CQ_ENTRIES, + NULL, + com_hndl.comp_chan, + 0)) == NULL) { + fprintf(stderr, + "ERROR: Could not create the completion queue " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* create send and recv queue pair and initialize it */ + struct ibv_qp_init_attr init_attr = { + .send_cq = com_hndl.cq, + .recv_cq = com_hndl.cq, + .cap = { + .max_send_wr = com_hndl.dev_attr.max_qp_wr, + .max_recv_wr = IB_MAX_RECV_WR, + .max_send_sge = IB_MAX_SEND_SGE, + .max_recv_sge = IB_MAX_RECV_SGE, + .max_inline_data = IB_MAX_INLINE_DATA + }, + .qp_type = IBV_QPT_RC, + .sq_sig_all = 0 /* we do not want a CQE for each WR */ + }; + if ((com_hndl.qp = ibv_create_qp(com_hndl.pd, &init_attr)) == NULL) { + fprintf(stderr, + "ERROR: Could not create the queue pair " + "- %d (%s). 
Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = IB_USED_PORT, + .qp_access_flags = (IBV_ACCESS_REMOTE_WRITE) + }; + if (ibv_modify_qp(com_hndl.qp, + &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS) < 0) { + fprintf(stderr, + "ERROR: Could not set QP into init state " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* fill in local qp_info */ + com_hndl.loc_qp_info.qpn = com_hndl.qp->qp_num; + com_hndl.loc_qp_info.psn = lrand48() & 0xffffff; + com_hndl.loc_qp_info.key = com_hndl.mr->rkey; + com_hndl.loc_qp_info.addr = (uint64_t)com_hndl.buf; + com_hndl.loc_qp_info.lid = port_attr.lid; + +} + +/** + * \brief Frees IB related resources + * + * \param com_hndl the structure containing all communication relevant infos + */ +static void +destroy_com_hndl(void) +{ + if (ibv_destroy_qp(com_hndl.qp) < 0) { + fprintf(stderr, + "ERROR: Could not destroy the queue pair " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (ibv_destroy_cq(com_hndl.cq) < 0) { + fprintf(stderr, + "ERROR: Could not deallocate the protection domain " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (ibv_destroy_comp_channel(com_hndl.comp_chan) < 0) { + fprintf(stderr, + "ERROR: Could not deallocate the protection domain " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (ibv_dereg_mr(com_hndl.mr) < 0) { + fprintf(stderr, + "ERROR: Could not deallocate the protection domain " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + + + if (ibv_dealloc_pd(com_hndl.pd) < 0) { + fprintf(stderr, + "ERROR: Could not deallocate the protection domain " + "- %d (%s). 
Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (ibv_close_device(com_hndl.ctx) < 0) { + fprintf(stderr, + "ERROR: Could not close the device context " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + +} + +/** + * \brief Connects the QP created within init_com_hndl + * + * \param com_hndl the structure containing all communication relevant infos + * + * This function performs the actual connection setup between the two QPs. + */ +static void +con_com_buf(void) { + /* transistion to ready-to-receive state */ + struct ibv_qp_attr qp_attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = IBV_MTU_2048, + .dest_qp_num = com_hndl.rem_qp_info.qpn, + .rq_psn = com_hndl.rem_qp_info.psn, + .max_dest_rd_atomic = IB_MAX_DEST_RD_ATOMIC, + .min_rnr_timer = IB_MIN_RNR_TIMER, + .ah_attr = { + .is_global = 0, + .sl = 0, + .src_path_bits = 0, + .dlid = com_hndl.rem_qp_info.lid, + .port_num = IB_USED_PORT, + } + }; + if (ibv_modify_qp(com_hndl.qp, + &qp_attr, + IBV_QP_STATE | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_AV)) { + fprintf(stderr, + "ERROR: Could not put QP into RTR state" + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(errno); + } + + /* transistion to ready-to-send state */ + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.timeout = 14; + qp_attr.retry_cnt = 7; + qp_attr.rnr_retry = 7; /* infinite retrys on RNR NACK */ + qp_attr.sq_psn = com_hndl.loc_qp_info.psn; + qp_attr.max_rd_atomic = 1; + if (ibv_modify_qp(com_hndl.qp, &qp_attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + fprintf(stderr, + "ERROR: Could not put QP into RTS state" + "- %d (%s). 
Abort!\n", + errno, + strerror(errno)); + exit(errno); + } +} + +/** + * \brief Set the destination node for a migration + * + * \param ip_str a string containing the IPv4 addr of the destination + * \param port the migration port + */ +static void +exchange_qp_info(bool server) +{ + int res = 0; + if (server) { + res = recv_data(&com_hndl.rem_qp_info, sizeof(qp_info_t)); + res = send_data(&com_hndl.loc_qp_info, sizeof(qp_info_t)); + } else { + res = send_data(&com_hndl.loc_qp_info, sizeof(qp_info_t)); + res = recv_data(&com_hndl.rem_qp_info, sizeof(qp_info_t)); + } + + fprintf(stderr, "QP info sent! (QPN: %lu; LID: %lu; PSN: %lu; KEY: %lu; ADDR: 0x%x)\n", + com_hndl.loc_qp_info.qpn, + com_hndl.loc_qp_info.lid, + com_hndl.loc_qp_info.psn, + com_hndl.loc_qp_info.key, + com_hndl.loc_qp_info.addr); + fprintf(stderr, "QP info received! (QPN: %lu; LID: %lu; PSN: %lu; KEY: %lu; ADDR: 0x%x)\n", + com_hndl.rem_qp_info.qpn, + com_hndl.rem_qp_info.lid, + com_hndl.rem_qp_info.psn, + com_hndl.rem_qp_info.key, + com_hndl.rem_qp_info.addr); +} + +/** + * \brief Prepares the an 'ibv_send_wr' + * + * This function prepares an 'ibv_send_wr' structure that is prepared for the + * transmission of a single memory page using the IBV_WR_RDMA_WRITE verb. 
+ */ +static inline struct ibv_send_wr * +prepare_send_list_elem(void) +{ + /* create work request */ + struct ibv_send_wr *send_wr = (struct ibv_send_wr*)calloc(1, sizeof(struct ibv_send_wr)); + struct ibv_sge *sge = (struct ibv_sge*)calloc(1, sizeof(struct ibv_sge)); + + /* basic work request configuration */ + send_wr->next = NULL; + send_wr->sg_list = sge; + send_wr->num_sge = 1; + send_wr->wr_id = IB_WR_WRITE_PAGE_ID; + send_wr->opcode = IBV_WR_RDMA_WRITE; + + return send_wr; + +} + +/** + * \brief Creates an 'ibv_send_wr' and appends it to the send_list + * + * \param addr the page table entry of the memory page + * \param addr_size the size of the page table entry + * \param page the buffer to be send in this WR + * \param page_size the size of the buffer + * + * This function creates an 'ibv_send_wr' structure and appends this to the + * global send_list. It sets the source/destination information and sets the + * IBV_SEND_SIGNALED flag as appropriate. + */ +static void +create_send_list_entry (void *addr, size_t addr_size, void *page, size_t page_size) +{ + /* create work request */ + struct ibv_send_wr *send_wr = prepare_send_list_elem(); + + /* configure source buffer */ + send_wr->sg_list->addr = (uintptr_t)page; + send_wr->sg_list->length = page_size; + send_wr->sg_list->lkey = com_hndl.mr->lkey; + + /* configure destination buffer */ + if (addr) { + send_wr->wr.rdma.remote_addr = com_hndl.rem_qp_info.addr + determine_dest_offset(*(size_t*)addr); + } else { + send_wr->wr.rdma.remote_addr = com_hndl.rem_qp_info.addr; + } + send_wr->wr.rdma.rkey = com_hndl.rem_qp_info.key; + + /* apped work request to send list */ + if (send_list == NULL) { + send_list = send_list_last = send_wr; + } else { + send_list_last->next = send_wr; + send_list_last = send_list_last->next; + } + /* we have to request a CQE if max_send_wr is reached to avoid overflows */ + if ((++send_list_length%com_hndl.dev_attr.max_qp_wr) == 0) { + send_list_last->send_flags = 
IBV_SEND_SIGNALED; + } +} + +/** + * \brief Sends the guest memory to the destination + * + * \param mode MIG_MODE_COMPLETE_DUMP sends the complete memory and + * MIG_MODE_INCREMENTAL_DUMP only the mapped guest pages + */ +void send_guest_mem(mig_mode_t mode, bool final_dump) +{ + int res = 0; + static bool ib_initialized = false; + + /* prepare IB channel */ + if (!ib_initialized) { + init_com_hndl(); + exchange_qp_info(false); + con_com_buf(); + + ib_initialized = true; + } + + /* determine migration mode */ + switch (mode) { + case MIG_MODE_COMPLETE_DUMP: + /* one send_wr for the whole guest memory */ + create_send_list_entry(NULL, 0, (void*)com_hndl.buf, guest_size); + break; + case MIG_MODE_INCREMENTAL_DUMP: + /* iterate guest page tables */ + determine_dirty_pages(create_send_list_entry); + break; + default: + fprintf(stderr, "ERROR: Unknown migration mode. Abort!\n"); + exit(EXIT_FAILURE); + } + + /* create a dumy WR request if there is nothing to send */ + if (send_list_length == 0) + create_send_list_entry(NULL, 0, NULL, 0); + + /* we have to wait for the last WR before informing dest */ + if ((mode == MIG_MODE_COMPLETE_DUMP) || final_dump) { + send_list_last->wr_id = IB_WR_WRITE_LAST_PAGE_ID; + send_list_last->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + send_list_last->send_flags = IBV_SEND_SIGNALED | IBV_SEND_SOLICITED; + send_list_last->imm_data = htonl(0x1); + } else { + send_list_last->wr_id = IB_WR_WRITE_LAST_PAGE_ID; + send_list_last->send_flags = IBV_SEND_SIGNALED; + } + + printf("DEBUG: Send list length %d\n", send_list_length); + + /* we have to call ibv_post_send() as long as 'send_list' contains elements */ + struct ibv_wc wc; + struct ibv_send_wr *remaining_send_wr = NULL; + do { + /* send data */ + remaining_send_wr = NULL; + if (ibv_post_send(com_hndl.qp, send_list, &remaining_send_wr) && (errno != ENOMEM)) { + fprintf(stderr, + "ERROR: Could not post send" + "- %d (%s). 
Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* wait for send WRs if CQ is full */ + do { + if ((res = ibv_poll_cq(com_hndl.cq, 1, &wc)) < 0) { + fprintf(stderr, + "ERROR: Could not poll on CQ" + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + } while (res < 1); + if (wc.status != IBV_WC_SUCCESS) { + fprintf(stderr, + "ERROR: WR failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc.status), + wc.status, + (int)wc.wr_id); + } + send_list = remaining_send_wr; + } while (remaining_send_wr); + + + /* ensure that we receive the CQE for the last page */ + if (wc.wr_id != IB_WR_WRITE_LAST_PAGE_ID) { + fprintf(stderr, + "ERROR: WR failed status %s (%d) for wr_id %d\n", + ibv_wc_status_str(wc.status), + wc.status, + (int)wc.wr_id); + } + + /* cleanup send_list */ + struct ibv_send_wr *cur_send_wr = send_list; + struct ibv_send_wr *tmp_send_wr = NULL; + while (cur_send_wr != NULL) { + free(cur_send_wr->sg_list); + tmp_send_wr = cur_send_wr; + cur_send_wr = cur_send_wr->next; + free(tmp_send_wr); + } + send_list_length = 0; + + /* do not close the channel in a pre-dump */ + if (!final_dump) + return; + + /* free IB-related resources */ + destroy_com_hndl(); + ib_initialized = false; + + fprintf(stderr, "Guest memory sent!\n"); +} + + + +/** + * \brief Receives the guest memory from the source + * + * The receive participates in the IB connection setup and waits for the + * 'solicited' event sent with the last WR issued by the sender. + */ +void recv_guest_mem(void) +{ + int res = 0; + + /* prepare IB channel */ + init_com_hndl(); + exchange_qp_info(true); + con_com_buf(); + + /* request notification on the event channel */ + if (ibv_req_notify_cq(com_hndl.cq, 1) < 0) { + fprintf(stderr, + "ERROR: Could request notify for completion queue " + "- %d (%s). 
Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* post recv matching IBV_RDMA_WRITE_WITH_IMM */ + struct ibv_cq *ev_cq; + void *ev_ctx; + struct ibv_sge sg; + struct ibv_recv_wr recv_wr; + struct ibv_recv_wr *bad_wr; + uint32_t recv_buf = 0; + + memset(&sg, 0, sizeof(sg)); + sg.addr = (uintptr_t)&recv_buf; + sg.length = sizeof(recv_buf); + sg.lkey = com_hndl.mr->lkey; + + memset(&recv_wr, 0, sizeof(recv_wr)); + recv_wr.wr_id = 0; + recv_wr.sg_list = &sg; + recv_wr.num_sge = 1; + + if (ibv_post_recv(com_hndl.qp, &recv_wr, &bad_wr) < 0) { + fprintf(stderr, + "ERROR: Could post recv - %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* wait for requested event */ + if (ibv_get_cq_event(com_hndl.comp_chan, &ev_cq, &ev_ctx) < 0) { + fprintf(stderr, + "ERROR: Could get event from completion channel " + "- %d (%s). Abort!\n", + errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* acknowledge the event */ + ibv_ack_cq_events(com_hndl.cq, 1); + + /* free IB-related resources */ + destroy_com_hndl(); + + fprintf(stderr, "Guest memory received!\n"); +} +#endif /* __RDMA_MIGRATION__ */ diff --git a/tools/uhyve-migration.c b/tools/uhyve-migration.c new file mode 100644 index 000000000..560237c89 --- /dev/null +++ b/tools/uhyve-migration.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2018, Simon Pickartz, RWTH Aachen University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef __x86_64__ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include + +#include "uhyve-migration.h" +#include "uhyve.h" + +static struct sockaddr_in mig_server; +static int com_sock = 0; +static int listen_sock = 0; + +static mig_type_t mig_type = MIG_TYPE_COLD; + +/** + * \brief Returns the configured migration type + */ +mig_type_t +get_migration_type(void) +{ + return mig_type; +} + +/** + * \brief Sets the migration type + * + * \param mig_type_str A string defining the migration type + */ +void +set_migration_type(const char *mig_type_str) +{ + if (mig_type_str == NULL) + return; + + int i; + bool found_type = false; + for (i=0; i + +extern size_t guest_size; +extern uint8_t* guest_mem; + +#define MIGRATION_PORT 1337 + +typedef enum { + MIG_MODE_COMPLETE_DUMP = 1, + MIG_MODE_INCREMENTAL_DUMP, +} mig_mode_t; + +typedef enum { + MIG_TYPE_COLD = 0, + MIG_TYPE_LIVE, +} mig_type_t; + +const static struct { + mig_type_t mig_type; + const char *str; +} 
mig_type_conv [] = { + {MIG_TYPE_COLD, "cold"}, + {MIG_TYPE_LIVE, "live"}, +}; + +typedef struct _migration_metadata { + uint32_t ncores; + size_t guest_size; + uint32_t no_checkpoint; + uint64_t elf_entry; + bool full_checkpoint; +} migration_metadata_t; + +void set_migration_type(const char *mig_type_str); +mig_type_t get_migration_type(void); + +void wait_for_client(uint16_t listen_portno); +void set_migration_target(const char *ip_str, int port); +void connect_to_server(void); +void close_migration_channel(void); + +int recv_data(void *buffer, size_t length); +int send_data(void *buffer, size_t length); + +void send_guest_mem(mig_mode_t mode, bool final_dump); +void recv_guest_mem(void); +#endif /* __UHYVE_MIGRATION_H__ */ + + + diff --git a/tools/uhyve-x86_64.c b/tools/uhyve-x86_64.c index 6550e2b0a..c677e5bd3 100644 --- a/tools/uhyve-x86_64.c +++ b/tools/uhyve-x86_64.c @@ -61,14 +61,15 @@ #include "uhyve.h" #include "uhyve-x86_64.h" #include "uhyve-syscalls.h" +#include "uhyve-migration.h" #include "uhyve-net.h" #include "proxy.h" // define this macro to create checkpoints with KVM's dirty log //#define USE_DIRTY_LOG +#define MIG_ITERS 4 #define MAX_FNAME 256 -#define MAX_MSR_ENTRIES 25 #define GUEST_OFFSET 0x0 #define CPUID_FUNC_PERFMON 0x0A @@ -149,14 +150,18 @@ #define IOAPIC_DEFAULT_BASE 0xfec00000 #define APIC_DEFAULT_BASE 0xfee00000 + static bool cap_tsc_deadline = false; static bool cap_irqchip = false; static bool cap_adjust_clock_stable = false; static bool cap_irqfd = false; static bool cap_vapic = false; +FILE *chk_file = NULL; + extern size_t guest_size; extern pthread_barrier_t barrier; +extern pthread_barrier_t migration_barrier; extern pthread_t* vcpu_threads; extern uint64_t elf_entry; extern uint8_t* klog; @@ -172,6 +177,8 @@ extern __thread struct kvm_run *run; extern __thread int vcpufd; extern __thread uint32_t cpuid; +extern vcpu_state_t *vcpu_thread_states; + static inline void show_dtable(const char *name, struct kvm_dtable *dtable) { 
fprintf(stderr, " %s %016zx %08hx\n", name, (size_t) dtable->base, (uint16_t) dtable->limit); @@ -380,6 +387,17 @@ static void setup_cpuid(int kvm, int vcpufd) free(kvm_cpuid); } +size_t determine_dest_offset(size_t src_addr) +{ + size_t ret = 0; + if (src_addr & PG_PSE) { + ret = src_addr & PAGE_2M_MASK; + } else { + ret = src_addr & PAGE_MASK; + } + return ret; +} + void init_cpu_state(uint64_t elf_entry) { struct kvm_regs regs = { @@ -416,110 +434,82 @@ void init_cpu_state(uint64_t elf_entry) *((volatile uint32_t*) (mboot + 0x30)) = cpuid; } -void restore_cpu_state(void) -{ - struct kvm_regs regs; - struct kvm_mp_state mp_state = { KVM_MP_STATE_RUNNABLE }; +vcpu_state_t read_cpu_state() { + vcpu_state_t cpu_state; char fname[MAX_FNAME]; - struct kvm_sregs sregs; - struct kvm_fpu fpu; - struct { - struct kvm_msrs info; - struct kvm_msr_entry entries[MAX_MSR_ENTRIES]; - } msr_data; - struct kvm_lapic_state lapic; - struct kvm_xsave xsave; - struct kvm_xcrs xcrs; - struct kvm_vcpu_events events; - - run->apic_base = APIC_DEFAULT_BASE; - setup_cpuid(kvm, vcpufd); - snprintf(fname, MAX_FNAME, "checkpoint/chk%u_core%u.dat", no_checkpoint, cpuid); FILE* f = fopen(fname, "r"); if (f == NULL) err(1, "fopen: unable to open file"); - if (fread(&sregs, sizeof(sregs), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(®s, sizeof(regs), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(&fpu, sizeof(fpu), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(&msr_data, sizeof(msr_data), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(&lapic, sizeof(lapic), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(&xsave, sizeof(xsave), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(&xcrs, sizeof(xcrs), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(&events, sizeof(events), 1, f) != 1) - err(1, "fread failed\n"); - if (fread(&mp_state, sizeof(mp_state), 1, f) != 1) + if (fread(&cpu_state, sizeof(cpu_state), 1, f) != 1) err(1, "fread failed\n"); fclose(f); - 
kvm_ioctl(vcpufd, KVM_SET_SREGS, &sregs); - kvm_ioctl(vcpufd, KVM_SET_REGS, ®s); - kvm_ioctl(vcpufd, KVM_SET_MSRS, &msr_data); - kvm_ioctl(vcpufd, KVM_SET_XCRS, &xcrs); - kvm_ioctl(vcpufd, KVM_SET_MP_STATE, &mp_state); - kvm_ioctl(vcpufd, KVM_SET_LAPIC, &lapic); - kvm_ioctl(vcpufd, KVM_SET_FPU, &fpu); - kvm_ioctl(vcpufd, KVM_SET_XSAVE, &xsave); - kvm_ioctl(vcpufd, KVM_SET_VCPU_EVENTS, &events); - + return cpu_state; } -void save_cpu_state(void) +void restore_cpu_state(vcpu_state_t cpu_state) +{ + cpu_state.mp_state.mp_state = KVM_MP_STATE_RUNNABLE; + + run->apic_base = APIC_DEFAULT_BASE; + setup_cpuid(kvm, vcpufd); + + + kvm_ioctl(vcpufd, KVM_SET_SREGS, &cpu_state.sregs); + kvm_ioctl(vcpufd, KVM_SET_REGS, &cpu_state.regs); + kvm_ioctl(vcpufd, KVM_SET_MSRS, &cpu_state.msr_data); + kvm_ioctl(vcpufd, KVM_SET_XCRS, &cpu_state.xcrs); + kvm_ioctl(vcpufd, KVM_SET_MP_STATE, &cpu_state.mp_state); + kvm_ioctl(vcpufd, KVM_SET_LAPIC, &cpu_state.lapic); + kvm_ioctl(vcpufd, KVM_SET_FPU, &cpu_state.fpu); + kvm_ioctl(vcpufd, KVM_SET_XSAVE, &cpu_state.xsave); + kvm_ioctl(vcpufd, KVM_SET_VCPU_EVENTS, &cpu_state.events); +} + +vcpu_state_t save_cpu_state(void) { - struct { - struct kvm_msrs info; - struct kvm_msr_entry entries[MAX_MSR_ENTRIES]; - } msr_data; - struct kvm_msr_entry *msrs = msr_data.entries; - struct kvm_regs regs; - struct kvm_sregs sregs; - struct kvm_fpu fpu; - struct kvm_lapic_state lapic; - struct kvm_xsave xsave; - struct kvm_xcrs xcrs; - struct kvm_vcpu_events events; - struct kvm_mp_state mp_state; - char fname[MAX_FNAME]; int n = 0; + vcpu_state_t cpu_state; /* define the list of required MSRs */ - msrs[n++].index = MSR_IA32_APICBASE; - msrs[n++].index = MSR_IA32_SYSENTER_CS; - msrs[n++].index = MSR_IA32_SYSENTER_ESP; - msrs[n++].index = MSR_IA32_SYSENTER_EIP; - msrs[n++].index = MSR_IA32_CR_PAT; - msrs[n++].index = MSR_IA32_MISC_ENABLE; - msrs[n++].index = MSR_IA32_TSC; - msrs[n++].index = MSR_CSTAR; - msrs[n++].index = MSR_STAR; - msrs[n++].index = MSR_EFER; 
- msrs[n++].index = MSR_LSTAR; - msrs[n++].index = MSR_GS_BASE; - msrs[n++].index = MSR_FS_BASE; - msrs[n++].index = MSR_KERNEL_GS_BASE; + cpu_state.msr_data.entries[n++].index = MSR_IA32_APICBASE; + cpu_state.msr_data.entries[n++].index = MSR_IA32_SYSENTER_CS; + cpu_state.msr_data.entries[n++].index = MSR_IA32_SYSENTER_ESP; + cpu_state.msr_data.entries[n++].index = MSR_IA32_SYSENTER_EIP; + cpu_state.msr_data.entries[n++].index = MSR_IA32_CR_PAT; + cpu_state.msr_data.entries[n++].index = MSR_IA32_MISC_ENABLE; + cpu_state.msr_data.entries[n++].index = MSR_IA32_TSC; + cpu_state.msr_data.entries[n++].index = MSR_CSTAR; + cpu_state.msr_data.entries[n++].index = MSR_STAR; + cpu_state.msr_data.entries[n++].index = MSR_EFER; + cpu_state.msr_data.entries[n++].index = MSR_LSTAR; + cpu_state.msr_data.entries[n++].index = MSR_GS_BASE; + cpu_state.msr_data.entries[n++].index = MSR_FS_BASE; + cpu_state.msr_data.entries[n++].index = MSR_KERNEL_GS_BASE; //msrs[n++].index = MSR_IA32_FEATURE_CONTROL; - msr_data.info.nmsrs = n; + cpu_state.msr_data.info.nmsrs = n; - kvm_ioctl(vcpufd, KVM_GET_SREGS, &sregs); - kvm_ioctl(vcpufd, KVM_GET_REGS, ®s); - kvm_ioctl(vcpufd, KVM_GET_MSRS, &msr_data); - kvm_ioctl(vcpufd, KVM_GET_XCRS, &xcrs); - kvm_ioctl(vcpufd, KVM_GET_LAPIC, &lapic); - kvm_ioctl(vcpufd, KVM_GET_FPU, &fpu); - kvm_ioctl(vcpufd, KVM_GET_XSAVE, &xsave); - kvm_ioctl(vcpufd, KVM_GET_VCPU_EVENTS, &events); - kvm_ioctl(vcpufd, KVM_GET_MP_STATE, &mp_state); + kvm_ioctl(vcpufd, KVM_GET_SREGS, &cpu_state.sregs); + kvm_ioctl(vcpufd, KVM_GET_REGS, &cpu_state.regs); + kvm_ioctl(vcpufd, KVM_GET_MSRS, &cpu_state.msr_data); + kvm_ioctl(vcpufd, KVM_GET_XCRS, &cpu_state.xcrs); + kvm_ioctl(vcpufd, KVM_GET_LAPIC, &cpu_state.lapic); + kvm_ioctl(vcpufd, KVM_GET_FPU, &cpu_state.fpu); + kvm_ioctl(vcpufd, KVM_GET_XSAVE, &cpu_state.xsave); + kvm_ioctl(vcpufd, KVM_GET_VCPU_EVENTS, &cpu_state.events); + kvm_ioctl(vcpufd, KVM_GET_MP_STATE, &cpu_state.mp_state); + return cpu_state; +} + +void 
write_cpu_state(void) +{ + vcpu_state_t cpu_state = save_cpu_state(); + char fname[MAX_FNAME]; snprintf(fname, MAX_FNAME, "checkpoint/chk%u_core%u.dat", no_checkpoint, cpuid); FILE* f = fopen(fname, "w"); @@ -527,73 +517,14 @@ void save_cpu_state(void) err(1, "fopen: unable to open file\n"); } - if (fwrite(&sregs, sizeof(sregs), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(®s, sizeof(regs), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(&fpu, sizeof(fpu), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(&msr_data, sizeof(msr_data), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(&lapic, sizeof(lapic), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(&xsave, sizeof(xsave), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(&xcrs, sizeof(xcrs), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(&events, sizeof(events), 1, f) != 1) - err(1, "fwrite failed\n"); - if (fwrite(&mp_state, sizeof(mp_state), 1, f) != 1) + if (fwrite(&cpu_state, sizeof(cpu_state), 1, f) != 1) err(1, "fwrite failed\n"); fclose(f); } -void timer_handler(int signum) +void scan_dirty_log(void (*save_page)(void*, size_t, void*, size_t)) { - struct stat st = {0}; - const size_t flag = (!full_checkpoint && (no_checkpoint > 0)) ? 
PG_DIRTY : PG_ACCESSED; - char fname[MAX_FNAME]; - struct timeval begin, end; - - if (verbose) - gettimeofday(&begin, NULL); - - if (stat("checkpoint", &st) == -1) - mkdir("checkpoint", 0700); - - for(size_t i = 0; i < ncores; i++) - if (vcpu_threads[i] != pthread_self()) - pthread_kill(vcpu_threads[i], SIGRTMIN); - - pthread_barrier_wait(&barrier); - - save_cpu_state(); - - snprintf(fname, MAX_FNAME, "checkpoint/chk%u_mem.dat", no_checkpoint); - - FILE* f = fopen(fname, "w"); - if (f == NULL) { - err(1, "fopen: unable to open file"); - } - - /*struct kvm_irqchip irqchip = {}; - if (cap_irqchip) - kvm_ioctl(vmfd, KVM_GET_IRQCHIP, &irqchip); - else - memset(&irqchip, 0x00, sizeof(irqchip)); - if (fwrite(&irqchip, sizeof(irqchip), 1, f) != 1) - err(1, "fwrite failed");*/ - - struct kvm_clock_data clock = {}; - kvm_ioctl(vmfd, KVM_GET_CLOCK, &clock); - if (fwrite(&clock, sizeof(clock), 1, f) != 1) - err(1, "fwrite failed"); - -#if 0 - if (fwrite(guest_mem, guest_size, 1, f) != 1) - err(1, "fwrite failed"); -#elif defined(USE_DIRTY_LOG) static struct kvm_dirty_log dlog = { .slot = 0, .dirty_bitmap = NULL @@ -601,8 +532,7 @@ void timer_handler(int signum) size_t dirty_log_size = (guest_size >> PAGE_BITS) / sizeof(size_t); // do we create our first checkpoint - if (dlog.dirty_bitmap == NULL) - { + if (dlog.dirty_bitmap == NULL) { // besure that all paddings are zero memset(&dlog, 0x00, sizeof(dlog)); @@ -616,24 +546,17 @@ void timer_handler(int signum) nextslot: kvm_ioctl(vmfd, KVM_GET_DIRTY_LOG, &dlog); - for(size_t i=0; i 0)) ? 
PG_DIRTY : PG_ACCESSED; + size_t* pml4 = (size_t*) (guest_mem+elf_entry+PAGE_SIZE); for(size_t i=0; i<(1 << PAGE_MAP_BITS); i++) { if ((pml4[i] & PG_PRESENT) != PG_PRESENT) @@ -669,32 +597,101 @@ nextslot: if (!full_checkpoint) pgt[l] = pgt[l] & ~(PG_DIRTY|PG_ACCESSED); size_t pgt_entry = pgt[l] & ~PG_PSE; // because PAT use the same bit as PSE - if (fwrite(&pgt_entry, sizeof(size_t), 1, f) != 1) - err(1, "fwrite failed"); - if (fwrite((size_t*) (guest_mem + (pgt[l] & PAGE_MASK)), (1UL << PAGE_BITS), 1, f) != 1) - err(1, "fwrite failed"); + + save_page(&pgt_entry, sizeof(size_t), (void*) (guest_mem + (pgt[l] & PAGE_MASK)), (1UL << PAGE_BITS)); } } } else if ((pgd[k] & flag) == flag) { //printf("\t\t*pgd[%zd] 0x%zx, 2MB\n", k, pgd[k] & ~PG_XD); if (!full_checkpoint) pgd[k] = pgd[k] & ~(PG_DIRTY|PG_ACCESSED); - if (fwrite(pgd+k, sizeof(size_t), 1, f) != 1) - err(1, "fwrite failed"); - if (fwrite((size_t*) (guest_mem + (pgd[k] & PAGE_2M_MASK)), (1UL << PAGE_2M_BITS), 1, f) != 1) - err(1, "fwrite failed"); + + save_page(pgd+k, sizeof(size_t), (void*) (guest_mem + (pgd[k] & PAGE_2M_MASK)), (1UL << PAGE_2M_BITS)); } } } } +} +void open_chk_file(char *fname) { + chk_file = fopen(fname, "w"); + if (chk_file == NULL) { + err(1, "fopen: unable to open file"); + } +} + +void close_chk_file(void) { + fclose(chk_file); +} + +void write_chk_file(void *addr, size_t bytes) { + if (fwrite(addr, bytes, 1, chk_file) != 1) { + err(1, "fwrite failed"); + } +} + +void write_mem_page_to_chk_file(void *entry, size_t entry_size, void *page, size_t page_size) { + write_chk_file(entry, entry_size); + write_chk_file(page, page_size); +} + +void determine_dirty_pages(void (*save_page_handler)(void*, size_t, void*, size_t)) +{ +#ifdef USE_DIRTY_LOG + scan_dirty_log(save_page_handler); +#else + scan_page_tables(save_page_handler); #endif - fclose(f); +} + +void timer_handler(int signum) +{ + + struct stat st = {0}; + char fname[MAX_FNAME]; + struct timeval begin, end; + + if (verbose) + 
gettimeofday(&begin, NULL); + + if (stat("checkpoint", &st) == -1) + mkdir("checkpoint", 0700); + + for(size_t i = 0; i < ncores; i++) + if (vcpu_threads[i] != pthread_self()) + pthread_kill(vcpu_threads[i], SIGTHRCHKP); pthread_barrier_wait(&barrier); + write_cpu_state(); + + snprintf(fname, MAX_FNAME, "checkpoint/chk%u_mem.dat", no_checkpoint); + + open_chk_file(fname); + + /*struct kvm_irqchip irqchip = {}; + if (cap_irqchip) + kvm_ioctl(vmfd, KVM_GET_IRQCHIP, &irqchip); + else + memset(&irqchip, 0x00, sizeof(irqchip)); + if (fwrite(&irqchip, sizeof(irqchip), 1, f) != 1) + err(1, "fwrite failed");*/ + + struct kvm_clock_data clock = {}; + kvm_ioctl(vmfd, KVM_GET_CLOCK, &clock); + write_chk_file(&clock, sizeof(clock)); + +#if 0 + if (fwrite(guest_mem, guest_size, 1, f) != 1) + err(1, "fwrite failed"); +#else + determine_dirty_pages(write_mem_page_to_chk_file); +#endif + close_chk_file(); + pthread_barrier_wait(&barrier); + // update configuration file - f = fopen("checkpoint/chk_config.txt", "w"); + FILE *f = fopen("checkpoint/chk_config.txt", "w"); if (f == NULL) { err(1, "fopen: unable to open file"); } @@ -720,6 +717,97 @@ nextslot: no_checkpoint++; } +void *migration_handler(void *arg) +{ + sigset_t *signal_mask = (sigset_t *)arg; + int res = 0; + size_t i = 0; + + int sig_caught; /* signal caught */ + + /* Use same mask as the set of signals that we'd like to know about! */ + sigwait(signal_mask, &sig_caught); + connect_to_server(); + + /* send metadata */ + migration_metadata_t metadata = { + ncores, + guest_size, + 0, /* no_checkpoint */ + elf_entry, + full_checkpoint}; + + res = send_data(&metadata, sizeof(migration_metadata_t)); + fprintf(stderr, "Metadata sent! 
(%d bytes)\n", res); + + if (get_migration_type() == MIG_TYPE_LIVE) { + /* resend rounds */ + for (i=0; incores, metadata->guest_size, metadata->no_checkpoint, metadata->elf_entry, metadata->full_checkpoint); +} + void init_kvm_arch(void) { uint64_t identity_base = 0xfffbc000; @@ -1008,7 +1110,8 @@ int load_kernel(uint8_t* mem, char* path) *((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x30)) = 0; // apicid *((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x60)) = 1; // numa nodes *((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x94)) = 1; // announce uhyve - *((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x98)) = UHYVE_UART_PORT ; // announce uhyve + if (verbose) + *((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x98)) = UHYVE_UART_PORT ; // announce uhyve char* str = getenv("HERMIT_IP"); if (str) { @@ -1042,8 +1145,7 @@ int load_kernel(uint8_t* mem, char* path) *((uint8_t*) (mem+paddr-GUEST_OFFSET + 0xBB)) = (uint8_t) ip[3]; } - if (verbose) - *((uint64_t*) (mem+paddr-GUEST_OFFSET + 0xbc)) = (uint64_t) guest_mem; + *((uint64_t*) (mem+paddr-GUEST_OFFSET + 0xbc)) = (uint64_t)guest_mem; } *((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x38)) += memsz; // total kernel size } diff --git a/tools/uhyve.c b/tools/uhyve.c index d46cd859b..c9f664b5b 100644 --- a/tools/uhyve.c +++ b/tools/uhyve.c @@ -34,6 +34,7 @@ #define _GNU_SOURCE +#include #include #include #include @@ -64,10 +65,12 @@ #include "uhyve.h" #include "uhyve-syscalls.h" +#include "uhyve-migration.h" #include "uhyve-net.h" #include "proxy.h" static bool restart = false; +static bool migration = false; static pthread_t net_thread; static int* vcpu_fds = NULL; static pthread_mutex_t kvm_lock = PTHREAD_MUTEX_INITIALIZER; @@ -78,6 +81,7 @@ static char* guest_path = NULL; size_t guest_size = 0x20000000ULL; bool full_checkpoint = false; pthread_barrier_t barrier; +pthread_barrier_t migration_barrier; pthread_t* vcpu_threads = NULL; uint8_t* klog = NULL; uint8_t* guest_mem = NULL; @@ -97,6 +101,9 @@ char **uhyve_argv = NULL; extern char **environ; char 
**uhyve_envp = NULL; +vcpu_state_t *vcpu_thread_states = NULL; +static sigset_t signal_mask; + typedef struct { int argc; int argsz[MAX_ARGC_ENVC]; @@ -254,13 +261,23 @@ static int vcpu_loop(void) pthread_barrier_wait(&barrier); - if (restart) - restore_cpu_state(); - else + if (restart) { + vcpu_state_t cpu_state = read_cpu_state(); + restore_cpu_state(cpu_state); + } else if (vcpu_thread_states) { + restore_cpu_state(vcpu_thread_states[cpuid]); + } else { init_cpu_state(elf_entry); + } - if (restart && (cpuid == 0)) - no_checkpoint++; + if (cpuid == 0) { + if (restart) { + no_checkpoint++; + } else if (migration) { + free(vcpu_thread_states); + vcpu_thread_states = NULL; + } + } while (1) { ret = ioctl(vcpufd, KVM_RUN, NULL); @@ -497,12 +514,29 @@ static int vcpu_init(void) static void sigusr_handler(int signum) { pthread_barrier_wait(&barrier); - - save_cpu_state(); + write_cpu_state(); pthread_barrier_wait(&barrier); } +static void vcpu_thread_mig_handler(int signum) +{ + /* memory should be allocated at this point */ + assert(vcpu_thread_states != NULL); + + /* ensure consistency among VCPUs */ + pthread_barrier_wait(&barrier); + + /* save state */ + vcpu_thread_states[cpuid] = save_cpu_state(); + + /* synchronize with migration thread */ + pthread_barrier_wait(&migration_barrier); + + /* wait to be killed */ + pthread_barrier_wait(&migration_barrier); +} + static void* uhyve_thread(void* arg) { size_t ret; @@ -512,10 +546,15 @@ static void* uhyve_thread(void* arg) cpuid = (size_t) arg; - /* Install timer_handler as the signal handler for SIGVTALRM. 
*/ + /* install signal handler for checkpoint */ memset(&sa, 0x00, sizeof(sa)); sa.sa_handler = &sigusr_handler; - sigaction(SIGRTMIN, &sa, NULL); + sigaction(SIGTHRCHKP, &sa, NULL); + + /* install signal handler for migration */ + memset(&sa, 0x00, sizeof(sa)); + sa.sa_handler = &vcpu_thread_mig_handler; + sigaction(SIGTHRMIG, &sa, NULL); // create new cpu vcpu_init(); @@ -537,6 +576,7 @@ void sigterm_handler(int signum) int uhyve_init(char *path) { + FILE *f = NULL; guest_path = path; signal(SIGTERM, sigterm_handler); @@ -544,8 +584,24 @@ int uhyve_init(char *path) // register routine to close the VM atexit(uhyve_atexit); - FILE* f = fopen("checkpoint/chk_config.txt", "r"); - if (f != NULL) { + const char *start_mig_server = getenv("HERMIT_MIGRATION_SERVER"); + + /* + * Three startups + * a) incoming migration + * b) load existing checkpoint + * c) normal run + */ + if (start_mig_server) { + migration = true; + migration_metadata_t metadata; + wait_for_incomming_migration(&metadata, MIGRATION_PORT); + + ncores = metadata.ncores; + guest_size = metadata.guest_size; + elf_entry = metadata.elf_entry; + full_checkpoint = metadata.full_checkpoint; + } else if ((f = fopen("checkpoint/chk_config.txt", "r")) != NULL) { int tmp = 0; restart = true; @@ -557,7 +613,10 @@ int uhyve_init(char *path) full_checkpoint = tmp ? 
true : false; if (verbose) - fprintf(stderr, "Restart from checkpoint %u (ncores %d, mem size 0x%zx)\n", no_checkpoint, ncores, guest_size); + fprintf(stderr, + "Restart from checkpoint %u " + "(ncores %d, mem size 0x%zx)\n", + no_checkpoint, ncores, guest_size); fclose(f); } else { const char* hermit_memory = getenv("HERMIT_MEM"); @@ -598,6 +657,9 @@ int uhyve_init(char *path) if (restart) { if (load_checkpoint(guest_mem, path) != 0) exit(EXIT_FAILURE); + } else if (start_mig_server) { + load_migration_data(guest_mem); + close_migration_channel(); } else { if (load_kernel(guest_mem, path) != 0) exit(EXIT_FAILURE); @@ -605,6 +667,7 @@ int uhyve_init(char *path) #endif pthread_barrier_init(&barrier, NULL, ncores); + pthread_barrier_init(&migration_barrier, NULL, ncores+1); cpuid = 0; // create first CPU, it will be the boot processor by default @@ -626,6 +689,8 @@ int uhyve_init(char *path) int uhyve_loop(int argc, char **argv) { const char* hermit_check = getenv("HERMIT_CHECKPOINT"); + const char* hermit_mig_support = getenv("HERMIT_MIGRATION_SUPPORT"); + const char* hermit_mig_type = getenv("HERMIT_MIGRATION_TYPE"); int ts = 0, i = 0; /* argv[0] is 'proxy', do not count it */ @@ -656,6 +721,27 @@ int uhyve_loop(int argc, char **argv) if (hermit_check) ts = atoi(hermit_check); + if (hermit_mig_support) { + set_migration_target(hermit_mig_support, MIGRATION_PORT); + set_migration_type(hermit_mig_type); + + /* block SIGUSR1 in main thread */ + sigemptyset (&signal_mask); + sigaddset (&signal_mask, SIGUSR1); + pthread_sigmask (SIG_BLOCK, &signal_mask, NULL); + + /* start migration thread; handles SIGUSR1 */ + pthread_t sig_thr_id; + pthread_create (&sig_thr_id, NULL, migration_handler, (void *)&signal_mask); + + /* install signal handler for migration */ + struct sigaction sa; + memset(&sa, 0x00, sizeof(sa)); + sa.sa_handler = &vcpu_thread_mig_handler; + sigaction(SIGTHRMIG, &sa, NULL); + } + + // First CPU is special because it will boot the system. 
Other CPUs will // be booted linearily after the first one. vcpu_threads[0] = pthread_self(); diff --git a/tools/uhyve.h b/tools/uhyve.h index e52b68641..19233d30c 100644 --- a/tools/uhyve.h +++ b/tools/uhyve.h @@ -29,6 +29,7 @@ #define __UHYVE_H__ #include +#include #define UHYVE_PORT_WRITE 0x400 #define UHYVE_PORT_OPEN 0x440 @@ -52,6 +53,9 @@ #define UHYVE_IRQ 11 +#define SIGTHRCHKP (SIGRTMIN+0) +#define SIGTHRMIG (SIGRTMIN+1) + #define kvm_ioctl(fd, cmd, arg) ({ \ const int ret = ioctl(fd, cmd, arg); \ if(ret == -1) \ @@ -59,14 +63,43 @@ ret; \ }) +#define MAX_MSR_ENTRIES 25 +struct msr_data { + struct kvm_msrs info; + struct kvm_msr_entry entries[MAX_MSR_ENTRIES]; +}; + + +typedef struct _vcpu_state { + struct msr_data msr_data; + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_fpu fpu; + struct kvm_lapic_state lapic; + struct kvm_xsave xsave; + struct kvm_xcrs xcrs; + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; +} vcpu_state_t; + + +typedef struct _migration_metadata migration_metadata_t; + void print_registers(void); void timer_handler(int signum); -void restore_cpu_state(void); -void save_cpu_state(void); +void *migration_handler(void *arg); +void restore_cpu_state(vcpu_state_t cpu_state); +vcpu_state_t read_cpu_state(void); +vcpu_state_t save_cpu_state(void); +void write_cpu_state(void); void init_cpu_state(uint64_t elf_entry); int load_kernel(uint8_t* mem, char* path); int load_checkpoint(uint8_t* mem, char* path); +int load_migration_data(uint8_t* mem); +void wait_for_incomming_migration(migration_metadata_t *metadata, uint16_t listen_portno); void init_kvm_arch(void); int load_kernel(uint8_t* mem, char* path); +size_t determine_dest_offset(size_t src_addr); +void determine_dirty_pages(void (*save_page_handler)(void*, size_t, void*, size_t)); #endif diff --git a/usr/tests/CMakeLists.txt b/usr/tests/CMakeLists.txt index 716bf8376..de140d906 100644 --- a/usr/tests/CMakeLists.txt +++ b/usr/tests/CMakeLists.txt @@ -10,6 +10,10 
@@ add_executable(hello++ hello++.cpp) add_executable(hellof hellof.f90) add_executable(pi pi.go) +add_executable(endless endless.c) +target_compile_options(endless PRIVATE -fopenmp) +target_link_libraries(endless -fopenmp) + add_executable(test-malloc test-malloc.c) add_executable(test-malloc-mt test-malloc-mt.c) target_compile_options(test-malloc-mt PRIVATE -pthread) diff --git a/usr/tests/endless.c b/usr/tests/endless.c new file mode 100644 index 000000000..0368fb97a --- /dev/null +++ b/usr/tests/endless.c @@ -0,0 +1,18 @@ +#include +#include +#include + + +int main(int argc, char **argv) { + int cnt = 0; +#pragma omp parallel + while(1) { +#pragma omp critical +{ + printf("Counter %d\n", ++cnt); +} + sys_msleep(500); + } + + return 0; +}