
uhyve migration support (#89)

- incremental/complete guest memory transfer via RDMA
- cold migration via RDMA
- live migration via RDMA (currently a fixed number of pre-copy iterations)
- complete guest memory transfer via TCP/IP (for compatibility)
Simon Pickartz 2018-04-26 20:57:05 +02:00 committed by Stefan Lankes
parent b789360af2
commit 59d0bfa956
10 changed files with 1520 additions and 192 deletions
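Usage sketch, assembled from the environment variables this commit reads in uhyve_loop() and uhyve_init(); the proxy arguments are placeholders, not verified invocations:

# destination host: waits for the incoming migration on MIGRATION_PORT (1337)
HERMIT_MIGRATION_SERVER=1 proxy <kernel> <app>

# source host: 'live' performs MIG_ITERS pre-copy rounds before the final dump,
# 'cold' (the default) does a single stop-and-copy transfer
HERMIT_MIGRATION_SUPPORT=<destination-ipv4> HERMIT_MIGRATION_TYPE=live proxy <kernel> <app>

# the migration itself is triggered by signalling the source proxy
kill -USR1 <pid-of-source-proxy>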

tools/CMakeLists.txt

@@ -3,12 +3,35 @@ project(hermit_tools)
include(../cmake/HermitCore-Paths.cmake)
option(ENABLE_RDMA_MIGRATION "Migration support via RDMA" OFF)
add_compile_options(-std=c99)
add_executable(proxy proxy.c utils.c uhyve.c uhyve-net.c uhyve-x86_64.c uhyve-aarch64.c)
target_compile_options(proxy PUBLIC -pthread)
list(APPEND LIBS "-pthread")
set(SRC proxy.c
utils.c
uhyve.c
uhyve-net.c
uhyve-migration.c
uhyve-x86_64.c
uhyve-aarch64.c
)
### Optional migration via RDMA
if(ENABLE_RDMA_MIGRATION)
add_definitions(-D__RDMA_MIGRATION__)
list(APPEND LIBS "-libverbs")
set(SRC ${SRC} uhyve-migration-rdma.c)
else()
remove_definitions(-D__RDMA_MIGRATION__)
endif()
add_executable(proxy ${SRC})
target_compile_options(proxy PUBLIC ${LIBS})
target_compile_options(proxy PUBLIC -DMAX_ARGC_ENVC=${MAX_ARGC_ENVC})
target_link_libraries(proxy -pthread)
target_link_libraries(proxy ${LIBS})
install(TARGETS proxy
DESTINATION bin)
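The RDMA path is switched on at configure time via the new option (standard CMake usage; the invocation below is a sketch):

cmake -DENABLE_RDMA_MIGRATION=ON <path-to-hermit_tools>

Without it, __RDMA_MIGRATION__ stays undefined and the plain TCP/IP implementations of send_guest_mem()/recv_guest_mem() at the end of uhyve-migration.c are compiled in instead of uhyve-migration-rdma.c.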

tools/uhyve-aarch64.c

@@ -135,6 +135,17 @@ void print_registers(void)
}
}
vcpu_state_t read_cpu_state(void)
{
err(1, "Migration is currently not supported!");
}
void *migration_handler(void *arg)
{
err(1, "Migration is currently not supported!");
}
void timer_handler(int signum)
{
err(1, "Checkpointing is currently not supported!");
@@ -145,7 +156,13 @@ err(1, "Checkpointing is currently not supported!")
err(1, "Checkpointing is currently not supported!");
}
void save_cpu_state(void)
vcpu_state_t save_cpu_state(void)
{
err(1, "Checkpointing is currently not supported!");
}
void write_cpu_state(void)
{
err(1, "Checkpointing is currently not supported!");
}
@@ -155,6 +172,16 @@ int load_checkpoint(uint8_t* mem, char* path)
err(1, "Checkpointing is currently not supported!");
}
int load_migration_data(uint8_t* mem)
{
err(1, "Migration is currently not supported!");
}
void wait_for_incomming_migration(migration_metadata_t *metadata, uint16_t listen_portno)
{
err(1, "Migration is currently not supported!");
}
void init_cpu_state(uint64_t elf_entry)
{
struct kvm_vcpu_init vcpu_init = {

tools/uhyve-migration-rdma.c Normal file

@@ -0,0 +1,684 @@
/*
* Copyright (c) 2018, Simon Pickartz, RWTH Aachen University
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define _GNU_SOURCE
#include <stdlib.h>
#include <arpa/inet.h>
#include <infiniband/verbs.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "uhyve-migration.h"
#include "uhyve.h"
#ifdef __RDMA_MIGRATION__
#define IB_USED_PORT (1)
#define IB_CQ_ENTRIES (1)
#define IB_MAX_INLINE_DATA (0)
#define IB_MAX_DEST_RD_ATOMIC (1)
#define IB_MIN_RNR_TIMER (1)
#define IB_MAX_RECV_WR (1)
#define IB_MAX_SEND_SGE (1)
#define IB_MAX_RECV_SGE (1)
typedef enum ib_wr_ids {
IB_WR_NO_ID = 0,
IB_WR_WRITE_PAGE_ID,
IB_WR_WRITE_LAST_PAGE_ID,
IB_WR_RECV_LAST_PAGE_ID
} ib_wr_ids_t;
typedef struct qp_info {
uint32_t qpn;
uint16_t lid;
uint16_t psn;
uint32_t key;
uint64_t addr;
} qp_info_t;
typedef struct com_hndl {
struct ibv_context *ctx; /* device context */
struct ibv_device_attr dev_attr; /* device attributes */
struct ibv_pd *pd; /* protection domain */
struct ibv_mr *mr; /* memory region */
struct ibv_cq *cq; /* completion queue */
struct ibv_qp *qp; /* queue pair */
struct ibv_comp_channel *comp_chan; /* completion event channel */
qp_info_t loc_qp_info;
qp_info_t rem_qp_info;
uint8_t *buf; /* the communication buffer */
uint32_t size; /* size of the buffer */
} com_hndl_t;
static com_hndl_t com_hndl;
static struct ibv_send_wr *send_list = NULL;
static struct ibv_send_wr *send_list_last = NULL;
static size_t send_list_length = 0;
/**
* \brief Initializes the IB communication structures
*
* This function sets up the IB communication channel using the global
* 'com_hndl'. It registers the guest physical memory as the communication
* buffer with a new protection domain. Upon return, there is a QP in the
* INIT state, ready to be connected with the remote side.
*/
static void
init_com_hndl(void)
{
/* the guest physical memory is the communication buffer */
com_hndl.size = guest_size;
com_hndl.buf = guest_mem;
struct ibv_device **device_list;
struct ibv_device *ib_pp_device;
int num_devices;
/* determine first available device */
if ((device_list = ibv_get_device_list(&num_devices)) == NULL) {
fprintf(stderr,
"ERROR: Could not determine available IB devices "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
if (num_devices != 0) {
ib_pp_device = device_list[0];
} else {
fprintf(stderr,
"ERROR: Could not find any IB device. Abort!\n");
exit(EXIT_FAILURE);
}
/* open the device context and create protection domain */
if ((com_hndl.ctx = ibv_open_device(ib_pp_device)) == NULL) {
fprintf(stderr,
"ERROR: Could not open the device context "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* query device capability (e.g., to determine 'max_qp_wr') */
if (ibv_query_device(com_hndl.ctx, &com_hndl.dev_attr) < 0) {
fprintf(stderr,
"ERROR: Could not query device attributes "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* allocate protection domain */
if ((com_hndl.pd = ibv_alloc_pd(com_hndl.ctx)) == NULL) {
fprintf(stderr,
"ERROR: Could not allocate protection domain "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* determine LID */
struct ibv_port_attr port_attr;
if (ibv_query_port(com_hndl.ctx, IB_USED_PORT, &port_attr) < 0){
fprintf(stderr,
"ERROR: Could not query port %u "
"- %d (%s). Abort!\n",
IB_USED_PORT,
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* register guest memory with the protection domain */
if ((com_hndl.mr = ibv_reg_mr(com_hndl.pd,
com_hndl.buf,
com_hndl.size,
IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE)) == NULL) {
fprintf(stderr,
"ERROR: Could not register the memory region "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* create completion event channel */
if ((com_hndl.comp_chan =
ibv_create_comp_channel(com_hndl.ctx)) == NULL) {
fprintf(stderr,
"ERROR: Could not create the completion channel "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* create the completion queue */
if ((com_hndl.cq = ibv_create_cq(com_hndl.ctx,
IB_CQ_ENTRIES,
NULL,
com_hndl.comp_chan,
0)) == NULL) {
fprintf(stderr,
"ERROR: Could not create the completion queue "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* create send and recv queue pair and initialize it */
struct ibv_qp_init_attr init_attr = {
.send_cq = com_hndl.cq,
.recv_cq = com_hndl.cq,
.cap = {
.max_send_wr = com_hndl.dev_attr.max_qp_wr,
.max_recv_wr = IB_MAX_RECV_WR,
.max_send_sge = IB_MAX_SEND_SGE,
.max_recv_sge = IB_MAX_RECV_SGE,
.max_inline_data = IB_MAX_INLINE_DATA
},
.qp_type = IBV_QPT_RC,
.sq_sig_all = 0 /* we do not want a CQE for each WR */
};
if ((com_hndl.qp = ibv_create_qp(com_hndl.pd, &init_attr)) == NULL) {
fprintf(stderr,
"ERROR: Could not create the queue pair "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
struct ibv_qp_attr attr = {
.qp_state = IBV_QPS_INIT,
.pkey_index = 0,
.port_num = IB_USED_PORT,
.qp_access_flags = (IBV_ACCESS_REMOTE_WRITE)
};
if (ibv_modify_qp(com_hndl.qp,
&attr,
IBV_QP_STATE |
IBV_QP_PKEY_INDEX |
IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS) < 0) {
fprintf(stderr,
"ERROR: Could not set QP into init state "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* fill in local qp_info */
com_hndl.loc_qp_info.qpn = com_hndl.qp->qp_num;
com_hndl.loc_qp_info.psn = lrand48() & 0xffffff;
com_hndl.loc_qp_info.key = com_hndl.mr->rkey;
com_hndl.loc_qp_info.addr = (uint64_t)com_hndl.buf;
com_hndl.loc_qp_info.lid = port_attr.lid;
}
/**
* \brief Frees the IB-related resources referenced by the global 'com_hndl'
*/
static void
destroy_com_hndl(void)
{
if (ibv_destroy_qp(com_hndl.qp) < 0) {
fprintf(stderr,
"ERROR: Could not destroy the queue pair "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
if (ibv_destroy_cq(com_hndl.cq) < 0) {
fprintf(stderr,
"ERROR: Could not destroy the completion queue "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
if (ibv_destroy_comp_channel(com_hndl.comp_chan) < 0) {
fprintf(stderr,
"ERROR: Could not destroy the completion event channel "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
if (ibv_dereg_mr(com_hndl.mr) < 0) {
fprintf(stderr,
"ERROR: Could not deregister the memory region "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
if (ibv_dealloc_pd(com_hndl.pd) < 0) {
fprintf(stderr,
"ERROR: Could not deallocate the protection domain "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
if (ibv_close_device(com_hndl.ctx) < 0) {
fprintf(stderr,
"ERROR: Could not close the device context "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
}
/**
* \brief Connects the QP created within init_com_hndl
*
* This function performs the actual connection setup between the two QPs.
*/
static void
con_com_buf(void) {
/* transition to ready-to-receive state */
struct ibv_qp_attr qp_attr = {
.qp_state = IBV_QPS_RTR,
.path_mtu = IBV_MTU_2048,
.dest_qp_num = com_hndl.rem_qp_info.qpn,
.rq_psn = com_hndl.rem_qp_info.psn,
.max_dest_rd_atomic = IB_MAX_DEST_RD_ATOMIC,
.min_rnr_timer = IB_MIN_RNR_TIMER,
.ah_attr = {
.is_global = 0,
.sl = 0,
.src_path_bits = 0,
.dlid = com_hndl.rem_qp_info.lid,
.port_num = IB_USED_PORT,
}
};
if (ibv_modify_qp(com_hndl.qp,
&qp_attr,
IBV_QP_STATE |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER |
IBV_QP_AV)) {
fprintf(stderr,
"ERROR: Could not put QP into RTR state"
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(errno);
}
/* transition to ready-to-send state */
qp_attr.qp_state = IBV_QPS_RTS;
qp_attr.timeout = 14;
qp_attr.retry_cnt = 7;
qp_attr.rnr_retry = 7; /* infinite retries on RNR NACK */
qp_attr.sq_psn = com_hndl.loc_qp_info.psn;
qp_attr.max_rd_atomic = 1;
if (ibv_modify_qp(com_hndl.qp, &qp_attr,
IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC)) {
fprintf(stderr,
"ERROR: Could not put QP into RTS state"
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(errno);
}
}
/**
* \brief Exchanges the QP information with the remote side
*
* \param server true if this side acts as the server during the exchange
*/
static void
exchange_qp_info(bool server)
{
int res = 0;
if (server) {
res = recv_data(&com_hndl.rem_qp_info, sizeof(qp_info_t));
res = send_data(&com_hndl.loc_qp_info, sizeof(qp_info_t));
} else {
res = send_data(&com_hndl.loc_qp_info, sizeof(qp_info_t));
res = recv_data(&com_hndl.rem_qp_info, sizeof(qp_info_t));
}
fprintf(stderr, "QP info sent! (QPN: %u; LID: %u; PSN: %u; KEY: %u; ADDR: 0x%lx)\n",
com_hndl.loc_qp_info.qpn,
com_hndl.loc_qp_info.lid,
com_hndl.loc_qp_info.psn,
com_hndl.loc_qp_info.key,
com_hndl.loc_qp_info.addr);
fprintf(stderr, "QP info received! (QPN: %u; LID: %u; PSN: %u; KEY: %u; ADDR: 0x%lx)\n",
com_hndl.rem_qp_info.qpn,
com_hndl.rem_qp_info.lid,
com_hndl.rem_qp_info.psn,
com_hndl.rem_qp_info.key,
com_hndl.rem_qp_info.addr);
}
/**
* \brief Prepares an 'ibv_send_wr'
*
* This function allocates an 'ibv_send_wr' structure configured for the
* transmission of a single memory page using the IBV_WR_RDMA_WRITE verb.
*/
static inline struct ibv_send_wr *
prepare_send_list_elem(void)
{
/* create work request */
struct ibv_send_wr *send_wr = (struct ibv_send_wr*)calloc(1, sizeof(struct ibv_send_wr));
struct ibv_sge *sge = (struct ibv_sge*)calloc(1, sizeof(struct ibv_sge));
/* basic work request configuration */
send_wr->next = NULL;
send_wr->sg_list = sge;
send_wr->num_sge = 1;
send_wr->wr_id = IB_WR_WRITE_PAGE_ID;
send_wr->opcode = IBV_WR_RDMA_WRITE;
return send_wr;
}
/**
* \brief Creates an 'ibv_send_wr' and appends it to the send_list
*
* \param addr the page table entry of the memory page
* \param addr_size the size of the page table entry
* \param page the buffer to be sent in this WR
* \param page_size the size of the buffer
*
* This function creates an 'ibv_send_wr' structure and appends it to the
* global send_list. It fills in the source/destination information and sets
* the IBV_SEND_SIGNALED flag as appropriate.
*/
static void
create_send_list_entry (void *addr, size_t addr_size, void *page, size_t page_size)
{
/* create work request */
struct ibv_send_wr *send_wr = prepare_send_list_elem();
/* configure source buffer */
send_wr->sg_list->addr = (uintptr_t)page;
send_wr->sg_list->length = page_size;
send_wr->sg_list->lkey = com_hndl.mr->lkey;
/* configure destination buffer */
if (addr) {
send_wr->wr.rdma.remote_addr = com_hndl.rem_qp_info.addr + determine_dest_offset(*(size_t*)addr);
} else {
send_wr->wr.rdma.remote_addr = com_hndl.rem_qp_info.addr;
}
send_wr->wr.rdma.rkey = com_hndl.rem_qp_info.key;
/* append the work request to the send list */
if (send_list == NULL) {
send_list = send_list_last = send_wr;
} else {
send_list_last->next = send_wr;
send_list_last = send_list_last->next;
}
/* we have to request a CQE if max_send_wr is reached to avoid overflows */
if ((++send_list_length%com_hndl.dev_attr.max_qp_wr) == 0) {
send_list_last->send_flags = IBV_SEND_SIGNALED;
}
}
/**
* \brief Sends the guest memory to the destination
*
* \param mode MIG_MODE_COMPLETE_DUMP sends the complete memory and
* MIG_MODE_INCREMENTAL_DUMP only the guest pages flagged dirty/accessed
*/
void send_guest_mem(mig_mode_t mode, bool final_dump)
{
int res = 0;
static bool ib_initialized = false;
/* prepare IB channel */
if (!ib_initialized) {
init_com_hndl();
exchange_qp_info(false);
con_com_buf();
ib_initialized = true;
}
/* determine migration mode */
switch (mode) {
case MIG_MODE_COMPLETE_DUMP:
/* one send_wr for the whole guest memory */
create_send_list_entry(NULL, 0, (void*)com_hndl.buf, guest_size);
break;
case MIG_MODE_INCREMENTAL_DUMP:
/* iterate guest page tables */
determine_dirty_pages(create_send_list_entry);
break;
default:
fprintf(stderr, "ERROR: Unknown migration mode. Abort!\n");
exit(EXIT_FAILURE);
}
/* create a dummy WR if there is nothing to send */
if (send_list_length == 0)
create_send_list_entry(NULL, 0, NULL, 0);
/* we have to wait for the last WR before informing dest */
if ((mode == MIG_MODE_COMPLETE_DUMP) || final_dump) {
send_list_last->wr_id = IB_WR_WRITE_LAST_PAGE_ID;
send_list_last->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
send_list_last->send_flags = IBV_SEND_SIGNALED | IBV_SEND_SOLICITED;
send_list_last->imm_data = htonl(0x1);
} else {
send_list_last->wr_id = IB_WR_WRITE_LAST_PAGE_ID;
send_list_last->send_flags = IBV_SEND_SIGNALED;
}
printf("DEBUG: Send list length %zu\n", send_list_length);
/* we have to call ibv_post_send() as long as 'send_list' contains elements */
struct ibv_wc wc;
struct ibv_send_wr *remaining_send_wr = NULL;
do {
/* send data */
remaining_send_wr = NULL;
if (ibv_post_send(com_hndl.qp, send_list, &remaining_send_wr) && (errno != ENOMEM)) {
fprintf(stderr,
"ERROR: Could not post send"
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* wait for send WRs if CQ is full */
do {
if ((res = ibv_poll_cq(com_hndl.cq, 1, &wc)) < 0) {
fprintf(stderr,
"ERROR: Could not poll on CQ"
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
} while (res < 1);
if (wc.status != IBV_WC_SUCCESS) {
fprintf(stderr,
"ERROR: WR failed status %s (%d) for wr_id %d\n",
ibv_wc_status_str(wc.status),
wc.status,
(int)wc.wr_id);
}
send_list = remaining_send_wr;
} while (remaining_send_wr);
/* ensure that we receive the CQE for the last page */
if (wc.wr_id != IB_WR_WRITE_LAST_PAGE_ID) {
fprintf(stderr,
"ERROR: Expected the CQE of the last page, but got wr_id %d (status: %s)\n",
(int)wc.wr_id,
ibv_wc_status_str(wc.status));
}
/* cleanup send_list */
struct ibv_send_wr *cur_send_wr = send_list;
struct ibv_send_wr *tmp_send_wr = NULL;
while (cur_send_wr != NULL) {
free(cur_send_wr->sg_list);
tmp_send_wr = cur_send_wr;
cur_send_wr = cur_send_wr->next;
free(tmp_send_wr);
}
send_list_length = 0;
/* do not close the channel in a pre-dump */
if (!final_dump)
return;
/* free IB-related resources */
destroy_com_hndl();
ib_initialized = false;
fprintf(stderr, "Guest memory sent!\n");
}
/**
* \brief Receives the guest memory from the source
*
* The receiver participates in the IB connection setup and waits for the
* 'solicited' event sent with the last WR issued by the sender.
*/
void recv_guest_mem(void)
{
int res = 0;
/* prepare IB channel */
init_com_hndl();
exchange_qp_info(true);
con_com_buf();
/* request notification on the event channel */
if (ibv_req_notify_cq(com_hndl.cq, 1) < 0) {
fprintf(stderr,
"ERROR: Could not request notification for the completion queue "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* post a recv WR matching the final IBV_WR_RDMA_WRITE_WITH_IMM */
struct ibv_cq *ev_cq;
void *ev_ctx;
struct ibv_sge sg;
struct ibv_recv_wr recv_wr;
struct ibv_recv_wr *bad_wr;
uint32_t recv_buf = 0;
memset(&sg, 0, sizeof(sg));
sg.addr = (uintptr_t)&recv_buf;
sg.length = sizeof(recv_buf);
sg.lkey = com_hndl.mr->lkey;
memset(&recv_wr, 0, sizeof(recv_wr));
recv_wr.wr_id = 0;
recv_wr.sg_list = &sg;
recv_wr.num_sge = 1;
if (ibv_post_recv(com_hndl.qp, &recv_wr, &bad_wr) < 0) {
fprintf(stderr,
"ERROR: Could not post recv - %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* wait for requested event */
if (ibv_get_cq_event(com_hndl.comp_chan, &ev_cq, &ev_ctx) < 0) {
fprintf(stderr,
"ERROR: Could not get the event from the completion channel "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
/* acknowledge the event */
ibv_ack_cq_events(com_hndl.cq, 1);
/* free IB-related resources */
destroy_com_hndl();
fprintf(stderr, "Guest memory received!\n");
}
#endif /* __RDMA_MIGRATION__ */

tools/uhyve-migration.c Normal file

@@ -0,0 +1,265 @@
/*
* Copyright (c) 2018, Simon Pickartz, RWTH Aachen University
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef __x86_64__
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "uhyve-migration.h"
#include "uhyve.h"
static struct sockaddr_in mig_server;
static int com_sock = 0;
static int listen_sock = 0;
static mig_type_t mig_type = MIG_TYPE_COLD;
/**
* \brief Returns the configured migration type
*/
mig_type_t
get_migration_type(void)
{
return mig_type;
}
/**
* \brief Sets the migration type
*
* \param mig_type_str A string defining the migration type
*/
void
set_migration_type(const char *mig_type_str)
{
if (mig_type_str == NULL)
return;
int i;
bool found_type = false;
for (i=0; i<sizeof(mig_type_conv)/sizeof(mig_type_conv[0]); ++i) {
if (!strcmp (mig_type_str, mig_type_conv[i].str)) {
mig_type = mig_type_conv[i].mig_type;
found_type = true;
}
}
/* we do not know this migration type */
if (!found_type) {
fprintf(stderr, "ERROR: Migration type '%s' not supported. Fallback to 'cold'\n", mig_type_str);
}
return;
}
/**
* \brief Closes a socket
*
* \param sock the socket to be closed
*/
static inline void
close_sock(int sock)
{
if (close(sock) < 0) {
fprintf(stderr,
"ERROR: Could not close the communication socket "
"- %d (%s). Abort!\n",
errno,
strerror(errno));
exit(EXIT_FAILURE);
}
}
/**
* \brief Set the destination node for a migration
*
* \param ip_str a string containing the IPv4 addr of the destination
* \param port the migration port
*/
void set_migration_target(const char *ip_str, int port)
{
/* determine server address */
memset(&mig_server, 0, sizeof(mig_server));
mig_server.sin_family = AF_INET;
mig_server.sin_port = htons(port);
int res = inet_pton(AF_INET, ip_str, &mig_server.sin_addr);
if (res == 0) {
fprintf(stderr, "'%s' is not a valid server address\n", ip_str);
} else if (res < 0) {
fprintf(stderr, "An error occurred while retrieving the migration server address\n");
perror("inet_pton");
}
}
/**
* \brief Connects to a migration target via TCP/IP
*/
void connect_to_server(void)
{
int res = 0;
char buf[INET_ADDRSTRLEN];
if (inet_ntop(AF_INET, (const void*)&mig_server.sin_addr, buf, INET_ADDRSTRLEN) == NULL) {
perror("inet_ntop");
exit(EXIT_FAILURE);
}
if((com_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(EXIT_FAILURE);
}
fprintf(stderr, "Trying to connect to migration server: %s\n", buf);
if (connect(com_sock, (struct sockaddr *)&mig_server, sizeof(mig_server)) < 0) {
perror("connect");
exit(EXIT_FAILURE);
}
fprintf(stderr, "Successfully connected to: %s\n", buf);
}
/**
* \brief Waits for a migration source to connect via TCP/IP
*
* \param listen_portno the port of the migration socket
*/
void wait_for_client(uint16_t listen_portno)
{
socklen_t client_addr_len = 0;
struct sockaddr_in serv_addr;
struct sockaddr_in client_addr;
/* open migration socket */
fprintf(stderr, "Waiting for incoming migration request ...\n");
listen_sock = socket(AF_INET, SOCK_STREAM, 0);
memset(&serv_addr, 0, sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
serv_addr.sin_port = htons(listen_portno);
bind(listen_sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr));
listen(listen_sock, 10);
client_addr_len = sizeof(struct sockaddr_in);
if ((com_sock = accept(listen_sock, (struct sockaddr*)&client_addr, &client_addr_len)) < 0) {
perror("accept");
exit(EXIT_FAILURE);
}
char buf[INET_ADDRSTRLEN];
if (inet_ntop(AF_INET, (const void*)&client_addr.sin_addr, buf, INET_ADDRSTRLEN) == NULL) {
perror("inet_ntop");
exit(EXIT_FAILURE);
}
fprintf(stderr, "Incoming migration from: %s\n", buf);
}
/**
* \brief Receives data from the migration socket
*
* \param buffer the destination buffer
* \param length the buffer size
*/
int recv_data(void *buffer, size_t length)
{
size_t bytes_received = 0;
while(bytes_received < length) {
ssize_t res = recv(
com_sock,
(void*)((uint64_t)buffer+bytes_received),
length-bytes_received,
0);
/* abort on an error or an unexpectedly closed connection */
if (res < 1)
err(1, "recv failed");
bytes_received += res;
}
return bytes_received;
}
/**
* \brief Sends data via the migration socket
*
* \param buffer the source buffer
* \param length the buffer size
*/
int send_data(void *buffer, size_t length)
{
size_t bytes_sent = 0;
while(bytes_sent < length) {
ssize_t res = send(
com_sock,
(void*)((uint64_t)buffer+bytes_sent),
length-bytes_sent,
0);
/* abort on errors */
if (res < 0)
err(1, "send failed");
bytes_sent += res;
}
return bytes_sent;
}
/**
* \brief Closes the TCP connection
*/
void close_migration_channel(void)
{
if (listen_sock) {
close_sock(listen_sock);
}
close_sock(com_sock);
}
#ifndef __RDMA_MIGRATION__
void send_guest_mem(mig_mode_t mode, bool final_dump)
{
/* determine migration mode */
switch (mode) {
case MIG_MODE_INCREMENTAL_DUMP:
fprintf(stderr, "ERROR: Incremental dumps currently not supported via TCP/IP. Fallback to complete dump!\n");
/* fall through */
case MIG_MODE_COMPLETE_DUMP:
send_data(guest_mem, guest_size);
break;
default:
fprintf(stderr, "ERROR: Unknown migration mode. Abort!\n");
exit(EXIT_FAILURE);
}
fprintf(stderr, "Guest memory sent!\n");
}
void recv_guest_mem(void)
{
recv_data(guest_mem, guest_size);
fprintf(stderr, "Guest memory received!\n");
}
#endif /* __RDMA_MIGRATION__ */
#endif

tools/uhyve-migration.h Normal file

@@ -0,0 +1,86 @@
#ifndef __UHYVE_MIGRATION_H__
/*
* Copyright (c) 2018, Simon Pickartz, RWTH Aachen University
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @author Simon Pickartz
* @file tools/uhyve-migration.h
* @brief Migration-related functions
*/
#define __UHYVE_MIGRATION_H__
#include <stdbool.h>
extern size_t guest_size;
extern uint8_t* guest_mem;
#define MIGRATION_PORT 1337
typedef enum {
MIG_MODE_COMPLETE_DUMP = 1,
MIG_MODE_INCREMENTAL_DUMP,
} mig_mode_t;
typedef enum {
MIG_TYPE_COLD = 0,
MIG_TYPE_LIVE,
} mig_type_t;
static const struct {
mig_type_t mig_type;
const char *str;
} mig_type_conv [] = {
{MIG_TYPE_COLD, "cold"},
{MIG_TYPE_LIVE, "live"},
};
typedef struct _migration_metadata {
uint32_t ncores;
size_t guest_size;
uint32_t no_checkpoint;
uint64_t elf_entry;
bool full_checkpoint;
} migration_metadata_t;
void set_migration_type(const char *mig_type_str);
mig_type_t get_migration_type(void);
void wait_for_client(uint16_t listen_portno);
void set_migration_target(const char *ip_str, int port);
void connect_to_server(void);
void close_migration_channel(void);
int recv_data(void *buffer, size_t length);
int send_data(void *buffer, size_t length);
void send_guest_mem(mig_mode_t mode, bool final_dump);
void recv_guest_mem(void);
#endif /* __UHYVE_MIGRATION_H__ */
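For reference, the channel behind these prototypes carries the following, in this order (produced by migration_handler() on the source, consumed by uhyve_init() and load_migration_data() on the destination): one migration_metadata_t, the pre-copy memory rounds (live migration only), the final memory dump, ncores consecutive vcpu_state_t structures, and, when cap_adjust_clock_stable is set, one struct kvm_clock_data.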

tools/uhyve-x86_64.c

@@ -61,14 +61,15 @@
#include "uhyve.h"
#include "uhyve-x86_64.h"
#include "uhyve-syscalls.h"
#include "uhyve-migration.h"
#include "uhyve-net.h"
#include "proxy.h"
// define this macro to create checkpoints with KVM's dirty log
//#define USE_DIRTY_LOG
#define MIG_ITERS 4
#define MAX_FNAME 256
#define MAX_MSR_ENTRIES 25
#define GUEST_OFFSET 0x0
#define CPUID_FUNC_PERFMON 0x0A
@@ -149,14 +150,18 @@
#define IOAPIC_DEFAULT_BASE 0xfec00000
#define APIC_DEFAULT_BASE 0xfee00000
static bool cap_tsc_deadline = false;
static bool cap_irqchip = false;
static bool cap_adjust_clock_stable = false;
static bool cap_irqfd = false;
static bool cap_vapic = false;
FILE *chk_file = NULL;
extern size_t guest_size;
extern pthread_barrier_t barrier;
extern pthread_barrier_t migration_barrier;
extern pthread_t* vcpu_threads;
extern uint64_t elf_entry;
extern uint8_t* klog;
@@ -172,6 +177,8 @@ extern __thread struct kvm_run *run;
extern __thread int vcpufd;
extern __thread uint32_t cpuid;
extern vcpu_state_t *vcpu_thread_states;
static inline void show_dtable(const char *name, struct kvm_dtable *dtable)
{
fprintf(stderr, " %s %016zx %08hx\n", name, (size_t) dtable->base, (uint16_t) dtable->limit);
@@ -380,6 +387,17 @@ static void setup_cpuid(int kvm, int vcpufd)
free(kvm_cpuid);
}
size_t determine_dest_offset(size_t src_addr)
{
size_t ret = 0;
if (src_addr & PG_PSE) {
ret = src_addr & PAGE_2M_MASK;
} else {
ret = src_addr & PAGE_MASK;
}
return ret;
}
void init_cpu_state(uint64_t elf_entry)
{
struct kvm_regs regs = {
@@ -416,110 +434,82 @@ void init_cpu_state(uint64_t elf_entry)
*((volatile uint32_t*) (mboot + 0x30)) = cpuid;
}
void restore_cpu_state(void)
{
struct kvm_regs regs;
struct kvm_mp_state mp_state = { KVM_MP_STATE_RUNNABLE };
vcpu_state_t read_cpu_state(void) {
vcpu_state_t cpu_state;
char fname[MAX_FNAME];
struct kvm_sregs sregs;
struct kvm_fpu fpu;
struct {
struct kvm_msrs info;
struct kvm_msr_entry entries[MAX_MSR_ENTRIES];
} msr_data;
struct kvm_lapic_state lapic;
struct kvm_xsave xsave;
struct kvm_xcrs xcrs;
struct kvm_vcpu_events events;
run->apic_base = APIC_DEFAULT_BASE;
setup_cpuid(kvm, vcpufd);
snprintf(fname, MAX_FNAME, "checkpoint/chk%u_core%u.dat", no_checkpoint, cpuid);
FILE* f = fopen(fname, "r");
if (f == NULL)
err(1, "fopen: unable to open file");
if (fread(&sregs, sizeof(sregs), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&regs, sizeof(regs), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&fpu, sizeof(fpu), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&msr_data, sizeof(msr_data), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&lapic, sizeof(lapic), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&xsave, sizeof(xsave), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&xcrs, sizeof(xcrs), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&events, sizeof(events), 1, f) != 1)
err(1, "fread failed\n");
if (fread(&mp_state, sizeof(mp_state), 1, f) != 1)
if (fread(&cpu_state, sizeof(cpu_state), 1, f) != 1)
err(1, "fread failed\n");
fclose(f);
kvm_ioctl(vcpufd, KVM_SET_SREGS, &sregs);
kvm_ioctl(vcpufd, KVM_SET_REGS, &regs);
kvm_ioctl(vcpufd, KVM_SET_MSRS, &msr_data);
kvm_ioctl(vcpufd, KVM_SET_XCRS, &xcrs);
kvm_ioctl(vcpufd, KVM_SET_MP_STATE, &mp_state);
kvm_ioctl(vcpufd, KVM_SET_LAPIC, &lapic);
kvm_ioctl(vcpufd, KVM_SET_FPU, &fpu);
kvm_ioctl(vcpufd, KVM_SET_XSAVE, &xsave);
kvm_ioctl(vcpufd, KVM_SET_VCPU_EVENTS, &events);
return cpu_state;
}
void save_cpu_state(void)
void restore_cpu_state(vcpu_state_t cpu_state)
{
cpu_state.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
run->apic_base = APIC_DEFAULT_BASE;
setup_cpuid(kvm, vcpufd);
kvm_ioctl(vcpufd, KVM_SET_SREGS, &cpu_state.sregs);
kvm_ioctl(vcpufd, KVM_SET_REGS, &cpu_state.regs);
kvm_ioctl(vcpufd, KVM_SET_MSRS, &cpu_state.msr_data);
kvm_ioctl(vcpufd, KVM_SET_XCRS, &cpu_state.xcrs);
kvm_ioctl(vcpufd, KVM_SET_MP_STATE, &cpu_state.mp_state);
kvm_ioctl(vcpufd, KVM_SET_LAPIC, &cpu_state.lapic);
kvm_ioctl(vcpufd, KVM_SET_FPU, &cpu_state.fpu);
kvm_ioctl(vcpufd, KVM_SET_XSAVE, &cpu_state.xsave);
kvm_ioctl(vcpufd, KVM_SET_VCPU_EVENTS, &cpu_state.events);
}
vcpu_state_t save_cpu_state(void)
{
struct {
struct kvm_msrs info;
struct kvm_msr_entry entries[MAX_MSR_ENTRIES];
} msr_data;
struct kvm_msr_entry *msrs = msr_data.entries;
struct kvm_regs regs;
struct kvm_sregs sregs;
struct kvm_fpu fpu;
struct kvm_lapic_state lapic;
struct kvm_xsave xsave;
struct kvm_xcrs xcrs;
struct kvm_vcpu_events events;
struct kvm_mp_state mp_state;
char fname[MAX_FNAME];
int n = 0;
vcpu_state_t cpu_state;
/* define the list of required MSRs */
msrs[n++].index = MSR_IA32_APICBASE;
msrs[n++].index = MSR_IA32_SYSENTER_CS;
msrs[n++].index = MSR_IA32_SYSENTER_ESP;
msrs[n++].index = MSR_IA32_SYSENTER_EIP;
msrs[n++].index = MSR_IA32_CR_PAT;
msrs[n++].index = MSR_IA32_MISC_ENABLE;
msrs[n++].index = MSR_IA32_TSC;
msrs[n++].index = MSR_CSTAR;
msrs[n++].index = MSR_STAR;
msrs[n++].index = MSR_EFER;
msrs[n++].index = MSR_LSTAR;
msrs[n++].index = MSR_GS_BASE;
msrs[n++].index = MSR_FS_BASE;
msrs[n++].index = MSR_KERNEL_GS_BASE;
cpu_state.msr_data.entries[n++].index = MSR_IA32_APICBASE;
cpu_state.msr_data.entries[n++].index = MSR_IA32_SYSENTER_CS;
cpu_state.msr_data.entries[n++].index = MSR_IA32_SYSENTER_ESP;
cpu_state.msr_data.entries[n++].index = MSR_IA32_SYSENTER_EIP;
cpu_state.msr_data.entries[n++].index = MSR_IA32_CR_PAT;
cpu_state.msr_data.entries[n++].index = MSR_IA32_MISC_ENABLE;
cpu_state.msr_data.entries[n++].index = MSR_IA32_TSC;
cpu_state.msr_data.entries[n++].index = MSR_CSTAR;
cpu_state.msr_data.entries[n++].index = MSR_STAR;
cpu_state.msr_data.entries[n++].index = MSR_EFER;
cpu_state.msr_data.entries[n++].index = MSR_LSTAR;
cpu_state.msr_data.entries[n++].index = MSR_GS_BASE;
cpu_state.msr_data.entries[n++].index = MSR_FS_BASE;
cpu_state.msr_data.entries[n++].index = MSR_KERNEL_GS_BASE;
//msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
msr_data.info.nmsrs = n;
cpu_state.msr_data.info.nmsrs = n;
kvm_ioctl(vcpufd, KVM_GET_SREGS, &sregs);
kvm_ioctl(vcpufd, KVM_GET_REGS, &regs);
kvm_ioctl(vcpufd, KVM_GET_MSRS, &msr_data);
kvm_ioctl(vcpufd, KVM_GET_XCRS, &xcrs);
kvm_ioctl(vcpufd, KVM_GET_LAPIC, &lapic);
kvm_ioctl(vcpufd, KVM_GET_FPU, &fpu);
kvm_ioctl(vcpufd, KVM_GET_XSAVE, &xsave);
kvm_ioctl(vcpufd, KVM_GET_VCPU_EVENTS, &events);
kvm_ioctl(vcpufd, KVM_GET_MP_STATE, &mp_state);
kvm_ioctl(vcpufd, KVM_GET_SREGS, &cpu_state.sregs);
kvm_ioctl(vcpufd, KVM_GET_REGS, &cpu_state.regs);
kvm_ioctl(vcpufd, KVM_GET_MSRS, &cpu_state.msr_data);
kvm_ioctl(vcpufd, KVM_GET_XCRS, &cpu_state.xcrs);
kvm_ioctl(vcpufd, KVM_GET_LAPIC, &cpu_state.lapic);
kvm_ioctl(vcpufd, KVM_GET_FPU, &cpu_state.fpu);
kvm_ioctl(vcpufd, KVM_GET_XSAVE, &cpu_state.xsave);
kvm_ioctl(vcpufd, KVM_GET_VCPU_EVENTS, &cpu_state.events);
kvm_ioctl(vcpufd, KVM_GET_MP_STATE, &cpu_state.mp_state);
return cpu_state;
}
void write_cpu_state(void)
{
vcpu_state_t cpu_state = save_cpu_state();
char fname[MAX_FNAME];
snprintf(fname, MAX_FNAME, "checkpoint/chk%u_core%u.dat", no_checkpoint, cpuid);
FILE* f = fopen(fname, "w");
@@ -527,73 +517,14 @@ void save_cpu_state(void)
err(1, "fopen: unable to open file\n");
}
if (fwrite(&sregs, sizeof(sregs), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&regs, sizeof(regs), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&fpu, sizeof(fpu), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&msr_data, sizeof(msr_data), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&lapic, sizeof(lapic), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&xsave, sizeof(xsave), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&xcrs, sizeof(xcrs), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&events, sizeof(events), 1, f) != 1)
err(1, "fwrite failed\n");
if (fwrite(&mp_state, sizeof(mp_state), 1, f) != 1)
if (fwrite(&cpu_state, sizeof(cpu_state), 1, f) != 1)
err(1, "fwrite failed\n");
fclose(f);
}
void timer_handler(int signum)
void scan_dirty_log(void (*save_page)(void*, size_t, void*, size_t))
{
struct stat st = {0};
const size_t flag = (!full_checkpoint && (no_checkpoint > 0)) ? PG_DIRTY : PG_ACCESSED;
char fname[MAX_FNAME];
struct timeval begin, end;
if (verbose)
gettimeofday(&begin, NULL);
if (stat("checkpoint", &st) == -1)
mkdir("checkpoint", 0700);
for(size_t i = 0; i < ncores; i++)
if (vcpu_threads[i] != pthread_self())
pthread_kill(vcpu_threads[i], SIGRTMIN);
pthread_barrier_wait(&barrier);
save_cpu_state();
snprintf(fname, MAX_FNAME, "checkpoint/chk%u_mem.dat", no_checkpoint);
FILE* f = fopen(fname, "w");
if (f == NULL) {
err(1, "fopen: unable to open file");
}
/*struct kvm_irqchip irqchip = {};
if (cap_irqchip)
kvm_ioctl(vmfd, KVM_GET_IRQCHIP, &irqchip);
else
memset(&irqchip, 0x00, sizeof(irqchip));
if (fwrite(&irqchip, sizeof(irqchip), 1, f) != 1)
err(1, "fwrite failed");*/
struct kvm_clock_data clock = {};
kvm_ioctl(vmfd, KVM_GET_CLOCK, &clock);
if (fwrite(&clock, sizeof(clock), 1, f) != 1)
err(1, "fwrite failed");
#if 0
if (fwrite(guest_mem, guest_size, 1, f) != 1)
err(1, "fwrite failed");
#elif defined(USE_DIRTY_LOG)
static struct kvm_dirty_log dlog = {
.slot = 0,
.dirty_bitmap = NULL
@@ -601,8 +532,7 @@ void timer_handler(int signum)
size_t dirty_log_size = (guest_size >> PAGE_BITS) / sizeof(size_t);
// are we creating our first checkpoint?
if (dlog.dirty_bitmap == NULL)
{
if (dlog.dirty_bitmap == NULL) {
// be sure that all paddings are zero
memset(&dlog, 0x00, sizeof(dlog));
@@ -616,24 +546,17 @@ void timer_handler(int signum)
nextslot:
kvm_ioctl(vmfd, KVM_GET_DIRTY_LOG, &dlog);
for(size_t i=0; i<dirty_log_size; i++)
{
for(size_t i=0; i<dirty_log_size; i++) {
size_t value = ((size_t*) dlog.dirty_bitmap)[i];
if (value)
{
for(size_t j=0; j<sizeof(size_t)*8; j++)
{
if (value) {
for(size_t j=0; j<sizeof(size_t)*8; j++) {
size_t test = 1ULL << j;
if ((value & test) == test)
{
if ((value & test) == test) {
size_t addr = (i*sizeof(size_t)*8+j)*PAGE_SIZE;
if (fwrite(&addr, sizeof(size_t), 1, f) != 1)
err(1, "fwrite failed");
if (fwrite((size_t*) (guest_mem + addr), PAGE_SIZE, 1, f) != 1)
err(1, "fwrite failed");
save_page(&addr, sizeof(size_t), (void*)((uint64_t)guest_mem+(uint64_t)addr), PAGE_SIZE);
}
}
}
@@ -645,7 +568,12 @@ nextslot:
memset(dlog.dirty_bitmap, 0x00, dirty_log_size * sizeof(size_t));
goto nextslot;
}
#else
}
void scan_page_tables(void (*save_page)(void*, size_t, void*, size_t))
{
const size_t flag = (!full_checkpoint && (no_checkpoint > 0)) ? PG_DIRTY : PG_ACCESSED;
size_t* pml4 = (size_t*) (guest_mem+elf_entry+PAGE_SIZE);
for(size_t i=0; i<(1 << PAGE_MAP_BITS); i++) {
if ((pml4[i] & PG_PRESENT) != PG_PRESENT)
@@ -669,32 +597,101 @@ nextslot:
if (!full_checkpoint)
pgt[l] = pgt[l] & ~(PG_DIRTY|PG_ACCESSED);
size_t pgt_entry = pgt[l] & ~PG_PSE; // because PAT use the same bit as PSE
if (fwrite(&pgt_entry, sizeof(size_t), 1, f) != 1)
err(1, "fwrite failed");
if (fwrite((size_t*) (guest_mem + (pgt[l] & PAGE_MASK)), (1UL << PAGE_BITS), 1, f) != 1)
err(1, "fwrite failed");
save_page(&pgt_entry, sizeof(size_t), (void*) (guest_mem + (pgt[l] & PAGE_MASK)), (1UL << PAGE_BITS));
}
}
} else if ((pgd[k] & flag) == flag) {
//printf("\t\t*pgd[%zd] 0x%zx, 2MB\n", k, pgd[k] & ~PG_XD);
if (!full_checkpoint)
pgd[k] = pgd[k] & ~(PG_DIRTY|PG_ACCESSED);
if (fwrite(pgd+k, sizeof(size_t), 1, f) != 1)
err(1, "fwrite failed");
if (fwrite((size_t*) (guest_mem + (pgd[k] & PAGE_2M_MASK)), (1UL << PAGE_2M_BITS), 1, f) != 1)
err(1, "fwrite failed");
save_page(pgd+k, sizeof(size_t), (void*) (guest_mem + (pgd[k] & PAGE_2M_MASK)), (1UL << PAGE_2M_BITS));
}
}
}
}
}
void open_chk_file(char *fname) {
chk_file = fopen(fname, "w");
if (chk_file == NULL) {
err(1, "fopen: unable to open file");
}
}
void close_chk_file(void) {
fclose(chk_file);
}
void write_chk_file(void *addr, size_t bytes) {
if (fwrite(addr, bytes, 1, chk_file) != 1) {
err(1, "fwrite failed");
}
}
void write_mem_page_to_chk_file(void *entry, size_t entry_size, void *page, size_t page_size) {
write_chk_file(entry, entry_size);
write_chk_file(page, page_size);
}
void determine_dirty_pages(void (*save_page_handler)(void*, size_t, void*, size_t))
{
#ifdef USE_DIRTY_LOG
scan_dirty_log(save_page_handler);
#else
scan_page_tables(save_page_handler);
#endif
fclose(f);
}
void timer_handler(int signum)
{
struct stat st = {0};
char fname[MAX_FNAME];
struct timeval begin, end;
if (verbose)
gettimeofday(&begin, NULL);
if (stat("checkpoint", &st) == -1)
mkdir("checkpoint", 0700);
for(size_t i = 0; i < ncores; i++)
if (vcpu_threads[i] != pthread_self())
pthread_kill(vcpu_threads[i], SIGTHRCHKP);
pthread_barrier_wait(&barrier);
write_cpu_state();
snprintf(fname, MAX_FNAME, "checkpoint/chk%u_mem.dat", no_checkpoint);
open_chk_file(fname);
/*struct kvm_irqchip irqchip = {};
if (cap_irqchip)
kvm_ioctl(vmfd, KVM_GET_IRQCHIP, &irqchip);
else
memset(&irqchip, 0x00, sizeof(irqchip));
if (fwrite(&irqchip, sizeof(irqchip), 1, f) != 1)
err(1, "fwrite failed");*/
struct kvm_clock_data clock = {};
kvm_ioctl(vmfd, KVM_GET_CLOCK, &clock);
write_chk_file(&clock, sizeof(clock));
#if 0
if (fwrite(guest_mem, guest_size, 1, f) != 1)
err(1, "fwrite failed");
#else
determine_dirty_pages(write_mem_page_to_chk_file);
#endif
close_chk_file();
pthread_barrier_wait(&barrier);
// update configuration file
f = fopen("checkpoint/chk_config.txt", "w");
FILE *f = fopen("checkpoint/chk_config.txt", "w");
if (f == NULL) {
err(1, "fopen: unable to open file");
}
@@ -720,6 +717,97 @@ nextslot:
no_checkpoint++;
}
void *migration_handler(void *arg)
{
sigset_t *signal_mask = (sigset_t *)arg;
int res = 0;
size_t i = 0;
int sig_caught; /* signal caught */
/* wait for a signal from 'signal_mask' (SIGUSR1 triggers the migration) */
sigwait(signal_mask, &sig_caught);
connect_to_server();
/* send metadata */
migration_metadata_t metadata = {
ncores,
guest_size,
0, /* no_checkpoint */
elf_entry,
full_checkpoint};
res = send_data(&metadata, sizeof(migration_metadata_t));
fprintf(stderr, "Metadata sent! (%d bytes)\n", res);
if (get_migration_type() == MIG_TYPE_LIVE) {
/* resend rounds */
for (i=0; i<MIG_ITERS; ++i) {
send_guest_mem(MIG_MODE_INCREMENTAL_DUMP, 0);
}
}
/* synchronize VCPU threads */
assert(vcpu_thread_states == NULL);
vcpu_thread_states = (vcpu_state_t*)calloc(ncores, sizeof(vcpu_state_t));
for(i = 0; i < ncores; i++)
pthread_kill(vcpu_threads[i], SIGTHRMIG);
pthread_barrier_wait(&migration_barrier);
/* send the final dump */
send_guest_mem(MIG_MODE_INCREMENTAL_DUMP, 1);
/* send CPU state */
res = send_data(vcpu_thread_states, sizeof(vcpu_state_t)*ncores);
fprintf(stderr, "CPU state sent! (%d bytes)\n", res);
/* free vcpu_thread_states */
free(vcpu_thread_states);
vcpu_thread_states = NULL;
/* send clock */
if (cap_adjust_clock_stable) {
struct kvm_clock_data clock = {};
kvm_ioctl(vmfd, KVM_GET_CLOCK, &clock);
res = send_data(&clock, sizeof(clock));
fprintf(stderr, "Clock sent! (%d bytes)\n", res);
}
/* close socket */
close_migration_channel();
exit(EXIT_SUCCESS);
}
int load_migration_data(uint8_t* mem)
{
size_t paddr = elf_entry;
int res = 0;
if (!klog)
klog = mem+paddr+0x5000-GUEST_OFFSET;
if (!mboot)
mboot = mem+paddr-GUEST_OFFSET;
recv_guest_mem();
/* receive cpu state */
assert(vcpu_thread_states == NULL);
vcpu_thread_states = (vcpu_state_t*)calloc(ncores, sizeof(vcpu_state_t));
res = recv_data(vcpu_thread_states, sizeof(vcpu_state_t)*ncores);
fprintf(stderr, "CPU states received! (%d bytes)\n", res);
/* receive clock */
if (cap_adjust_clock_stable) {
struct kvm_clock_data clock = {}, data = {};
res = recv_data(&clock, sizeof(clock));
fprintf(stderr, "Clock received! (%d bytes)\n", res);
data.clock = clock.clock;
kvm_ioctl(vmfd, KVM_SET_CLOCK, &data);
}
return 0;
}
int load_checkpoint(uint8_t* mem, char* path)
{
char fname[MAX_FNAME];
@@ -783,10 +871,11 @@ int load_checkpoint(uint8_t* mem, char* path)
while (fread(&location, sizeof(location), 1, f) == 1) {
//printf("location 0x%zx\n", location);
size_t *dest_addr = (size_t*) (mem + determine_dest_offset(location));
if (location & PG_PSE)
ret = fread((size_t*) (mem + (location & PAGE_2M_MASK)), (1UL << PAGE_2M_BITS), 1, f);
ret = fread(dest_addr, (1UL << PAGE_2M_BITS), 1, f);
else
ret = fread((size_t*) (mem + (location & PAGE_MASK)), (1UL << PAGE_BITS), 1, f);
ret = fread(dest_addr, (1UL << PAGE_BITS), 1, f);
if (ret != 1) {
fprintf(stderr, "Unable to read checkpoint: ret = %d", ret);
@@ -808,6 +897,19 @@ int load_checkpoint(uint8_t* mem, char* path)
return 0;
}
void wait_for_incomming_migration(migration_metadata_t *metadata, uint16_t listen_portno)
{
int res = 0;
wait_for_client(listen_portno);
/* receive metadata state */
res = recv_data(metadata, sizeof(migration_metadata_t));
fprintf(stderr, "Metadata received! (%d bytes)\n", res);
fprintf(stderr, "NCORES = %u; GUEST_SIZE = %zu; NO_CHKPOINT = %u; ELF_ENTRY = 0x%llx; FULL_CHKPT = %d\n",
metadata->ncores, metadata->guest_size, metadata->no_checkpoint, (unsigned long long)metadata->elf_entry, metadata->full_checkpoint);
}
void init_kvm_arch(void)
{
uint64_t identity_base = 0xfffbc000;
@@ -1042,7 +1144,7 @@ int load_kernel(uint8_t* mem, char* path)
*((uint8_t*) (mem+paddr-GUEST_OFFSET + 0xBB)) = (uint8_t) ip[3];
}
*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0xbc)) = guest_mem;
*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0xbc)) = (uint64_t)guest_mem;
}
*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x38)) += memsz; // total kernel size
}

tools/uhyve.c

@@ -34,6 +34,7 @@
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
@@ -64,10 +65,12 @@
#include "uhyve.h"
#include "uhyve-syscalls.h"
#include "uhyve-migration.h"
#include "uhyve-net.h"
#include "proxy.h"
static bool restart = false;
static bool migration = false;
static pthread_t net_thread;
static int* vcpu_fds = NULL;
static pthread_mutex_t kvm_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -78,6 +81,7 @@ static char* guest_path = NULL;
size_t guest_size = 0x20000000ULL;
bool full_checkpoint = false;
pthread_barrier_t barrier;
pthread_barrier_t migration_barrier;
pthread_t* vcpu_threads = NULL;
uint8_t* klog = NULL;
uint8_t* guest_mem = NULL;
@@ -97,6 +101,9 @@ char **uhyve_argv = NULL;
extern char **environ;
char **uhyve_envp = NULL;
vcpu_state_t *vcpu_thread_states = NULL;
static sigset_t signal_mask;
typedef struct {
int argc;
int argsz[MAX_ARGC_ENVC];
@@ -266,13 +273,23 @@ static int vcpu_loop(void)
pthread_barrier_wait(&barrier);
if (restart)
restore_cpu_state();
else
if (restart) {
vcpu_state_t cpu_state = read_cpu_state();
restore_cpu_state(cpu_state);
} else if (vcpu_thread_states) {
restore_cpu_state(vcpu_thread_states[cpuid]);
} else {
init_cpu_state(elf_entry);
}
if (restart && (cpuid == 0))
no_checkpoint++;
if (cpuid == 0) {
if (restart) {
no_checkpoint++;
} else if (migration) {
free(vcpu_thread_states);
vcpu_thread_states = NULL;
}
}
while (1) {
ret = ioctl(vcpufd, KVM_RUN, NULL);
@@ -506,12 +523,29 @@ static int vcpu_init(void)
static void sigusr_handler(int signum)
{
pthread_barrier_wait(&barrier);
save_cpu_state();
write_cpu_state();
pthread_barrier_wait(&barrier);
}
static void vcpu_thread_mig_handler(int signum)
{
/* memory should be allocated at this point */
assert(vcpu_thread_states != NULL);
/* ensure consistency among VCPUs */
pthread_barrier_wait(&barrier);
/* save state */
vcpu_thread_states[cpuid] = save_cpu_state();
/* synchronize with migration thread */
pthread_barrier_wait(&migration_barrier);
/* wait to be killed */
pthread_barrier_wait(&migration_barrier);
}
static void* uhyve_thread(void* arg)
{
size_t ret;
@@ -521,10 +555,15 @@ static void* uhyve_thread(void* arg)
cpuid = (size_t) arg;
/* Install timer_handler as the signal handler for SIGVTALRM. */
/* install signal handler for checkpoint */
memset(&sa, 0x00, sizeof(sa));
sa.sa_handler = &sigusr_handler;
sigaction(SIGRTMIN, &sa, NULL);
sigaction(SIGTHRCHKP, &sa, NULL);
/* install signal handler for migration */
memset(&sa, 0x00, sizeof(sa));
sa.sa_handler = &vcpu_thread_mig_handler;
sigaction(SIGTHRMIG, &sa, NULL);
// create new cpu
vcpu_init();
@@ -546,6 +585,7 @@ void sigterm_handler(int signum)
int uhyve_init(char *path)
{
FILE *f = NULL;
guest_path = path;
signal(SIGTERM, sigterm_handler);
@@ -553,8 +593,24 @@ int uhyve_init(char *path)
// register routine to close the VM
atexit(uhyve_atexit);
FILE* f = fopen("checkpoint/chk_config.txt", "r");
if (f != NULL) {
const char *start_mig_server = getenv("HERMIT_MIGRATION_SERVER");
/*
* Three startup modes:
* a) incoming migration
* b) load existing checkpoint
* c) normal run
*/
if (start_mig_server) {
migration = true;
migration_metadata_t metadata;
wait_for_incomming_migration(&metadata, MIGRATION_PORT);
ncores = metadata.ncores;
guest_size = metadata.guest_size;
elf_entry = metadata.elf_entry;
full_checkpoint = metadata.full_checkpoint;
} else if ((f = fopen("checkpoint/chk_config.txt", "r")) != NULL) {
int tmp = 0;
restart = true;
@@ -566,7 +622,10 @@ int uhyve_init(char *path)
full_checkpoint = tmp ? true : false;
if (verbose)
fprintf(stderr, "Restart from checkpoint %u (ncores %d, mem size 0x%zx)\n", no_checkpoint, ncores, guest_size);
fprintf(stderr,
"Restart from checkpoint %u "
"(ncores %d, mem size 0x%zx)\n",
no_checkpoint, ncores, guest_size);
fclose(f);
} else {
const char* hermit_memory = getenv("HERMIT_MEM");
@@ -607,6 +666,9 @@ int uhyve_init(char *path)
if (restart) {
if (load_checkpoint(guest_mem, path) != 0)
exit(EXIT_FAILURE);
} else if (start_mig_server) {
load_migration_data(guest_mem);
close_migration_channel();
} else {
if (load_kernel(guest_mem, path) != 0)
exit(EXIT_FAILURE);
@@ -614,6 +676,7 @@ int uhyve_init(char *path)
#endif
pthread_barrier_init(&barrier, NULL, ncores);
pthread_barrier_init(&migration_barrier, NULL, ncores+1);
cpuid = 0;
// create first CPU, it will be the boot processor by default
@@ -635,6 +698,8 @@ int uhyve_init(char *path)
int uhyve_loop(int argc, char **argv)
{
const char* hermit_check = getenv("HERMIT_CHECKPOINT");
const char* hermit_mig_support = getenv("HERMIT_MIGRATION_SUPPORT");
const char* hermit_mig_type = getenv("HERMIT_MIGRATION_TYPE");
int ts = 0, i = 0;
/* argv[0] is 'proxy', do not count it */
@@ -665,6 +730,27 @@ int uhyve_loop(int argc, char **argv)
if (hermit_check)
ts = atoi(hermit_check);
if (hermit_mig_support) {
set_migration_target(hermit_mig_support, MIGRATION_PORT);
set_migration_type(hermit_mig_type);
/* block SIGUSR1 in main thread */
sigemptyset (&signal_mask);
sigaddset (&signal_mask, SIGUSR1);
pthread_sigmask (SIG_BLOCK, &signal_mask, NULL);
/* start migration thread; handles SIGUSR1 */
pthread_t sig_thr_id;
pthread_create (&sig_thr_id, NULL, migration_handler, (void *)&signal_mask);
/* install signal handler for migration */
struct sigaction sa;
memset(&sa, 0x00, sizeof(sa));
sa.sa_handler = &vcpu_thread_mig_handler;
sigaction(SIGTHRMIG, &sa, NULL);
}
// First CPU is special because it will boot the system. Other CPUs will
// be booted linearily after the first one.
vcpu_threads[0] = pthread_self();

tools/uhyve.h

@@ -29,6 +29,7 @@
#define __UHYVE_H__
#include <err.h>
#include <linux/kvm.h>
#define UHYVE_PORT_WRITE 0x400
#define UHYVE_PORT_OPEN 0x440
@@ -50,6 +51,9 @@
#define UHYVE_IRQ 11
#define SIGTHRCHKP (SIGRTMIN+0)
#define SIGTHRMIG (SIGRTMIN+1)
#define kvm_ioctl(fd, cmd, arg) ({ \
const int ret = ioctl(fd, cmd, arg); \
if(ret == -1) \
@@ -57,14 +61,43 @@
ret; \
})
#define MAX_MSR_ENTRIES 25
struct msr_data {
struct kvm_msrs info;
struct kvm_msr_entry entries[MAX_MSR_ENTRIES];
};
typedef struct _vcpu_state {
struct msr_data msr_data;
struct kvm_regs regs;
struct kvm_sregs sregs;
struct kvm_fpu fpu;
struct kvm_lapic_state lapic;
struct kvm_xsave xsave;
struct kvm_xcrs xcrs;
struct kvm_vcpu_events events;
struct kvm_mp_state mp_state;
} vcpu_state_t;
typedef struct _migration_metadata migration_metadata_t;
void print_registers(void);
void timer_handler(int signum);
void restore_cpu_state(void);
void save_cpu_state(void);
void *migration_handler(void *arg);
void restore_cpu_state(vcpu_state_t cpu_state);
vcpu_state_t read_cpu_state(void);
vcpu_state_t save_cpu_state(void);
void write_cpu_state(void);
void init_cpu_state(uint64_t elf_entry);
int load_kernel(uint8_t* mem, char* path);
int load_checkpoint(uint8_t* mem, char* path);
int load_migration_data(uint8_t* mem);
void wait_for_incomming_migration(migration_metadata_t *metadata, uint16_t listen_portno);
void init_kvm_arch(void);
int load_kernel(uint8_t* mem, char* path);
size_t determine_dest_offset(size_t src_addr);
void determine_dirty_pages(void (*save_page_handler)(void*, size_t, void*, size_t));
#endif

usr/tests/CMakeLists.txt

@@ -10,6 +10,10 @@ add_executable(hello++ hello++.cpp)
add_executable(hellof hellof.f90)
add_executable(pi pi.go)
add_executable(endless endless.c)
target_compile_options(endless PRIVATE -fopenmp)
target_link_libraries(endless -fopenmp)
add_executable(test-malloc test-malloc.c)
add_executable(test-malloc-mt test-malloc-mt.c)
target_compile_options(test-malloc-mt PRIVATE -pthread)

usr/tests/endless.c Normal file

@@ -0,0 +1,18 @@
#include <time.h>
#include <stdio.h>
#include <hermit/syscall.h>
int main(int argc, char **argv) {
int cnt = 0;
#pragma omp parallel
while(1) {
#pragma omp critical
{
printf("Counter %d\n", ++cnt);
}
sys_msleep(500);
}
return 0;
}