/* Copyright (c) 2015, IBM * Author(s): Dan Williams * Ricardo Koller * Copyright (c) 2017, RWTH Aachen University * Author(s): Stefan Lankes * * Permission to use, copy, modify, and/or distribute this software * for any purpose with or without fee is hereby granted, provided * that the above copyright notice and this permission notice appear * in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* We used several existing projects as guides * kvmtest.c: http://lwn.net/Articles/658512/ * Solo5: https://github.com/Solo5/solo5 */ /* * 15.1.2017: extend original version (https://github.com/Solo5/solo5) * for HermitCore * 25.2.2017: add SMP support to enable more than one core * 24.4.2017: add checkpoint/restore support, * remove memory limit */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "uhyve.h" #include "uhyve-syscalls.h" #include "uhyve-migration.h" #include "uhyve-net.h" #include "proxy.h" static bool restart = false; static bool migration = false; static pthread_t net_thread; static int* vcpu_fds = NULL; static pthread_mutex_t kvm_lock = PTHREAD_MUTEX_INITIALIZER; extern bool verbose; static char* guest_path = NULL; size_t guest_size = 0x20000000ULL; bool full_checkpoint = false; pthread_barrier_t barrier; pthread_barrier_t migration_barrier; pthread_t* vcpu_threads = NULL; uint8_t* klog = NULL; uint8_t* guest_mem = NULL; uint32_t no_checkpoint = 0; uint32_t ncores = 1; uint64_t elf_entry; int kvm = -1, vmfd = -1, netfd = -1, efd = -1; uint8_t* mboot = NULL; __thread struct kvm_run *run = NULL; __thread int vcpufd = -1; __thread uint32_t cpuid = 0; static sem_t net_sem; int uhyve_argc = -1; int uhyve_envc = -1; char **uhyve_argv = NULL; extern char **environ; char **uhyve_envp = NULL; vcpu_state_t *vcpu_thread_states = NULL; static sigset_t signal_mask; typedef struct { int argc; int argsz[MAX_ARGC_ENVC]; int envc; int envsz[MAX_ARGC_ENVC]; } __attribute__ ((packed)) uhyve_cmdsize_t; typedef struct { char **argv; char **envp; } __attribute__ ((packed)) uhyve_cmdval_t; static uint64_t memparse(const char *ptr) { // local pointer to end of parsed string char *endptr; // parse number uint64_t size = strtoull(ptr, &endptr, 0); // parse size extension, intentional fall-through switch (*endptr) { case 'E': case 'e': size <<= 10; case 'P': case 'p': size <<= 10; case 'T': case 't': size <<= 10; case 'G': case 'g': size <<= 10; case 'M': case 'm': size <<= 10; case 'K': case 'k': size <<= 10; endptr++; default: break; } return size; } // Just close file descriptor if not already done static void close_fd(int* fd) { if (*fd != -1) { close(*fd); *fd = -1; } } static void uhyve_exit(void* arg) { //print_registers(); if (pthread_mutex_trylock(&kvm_lock)) { close_fd(&vcpufd); return; } // only the main thread will execute this if (vcpu_threads) { for(uint32_t i=0; i 0) pthread_kill(net_thread, SIGTERM); } close_fd(&vcpufd); } static void uhyve_atexit(void) { uhyve_exit(NULL); if (vcpu_threads) { for(uint32_t i = 0; i < ncores; i++) { if (pthread_self() == vcpu_threads[i]) continue; pthread_join(vcpu_threads[i], NULL); } free(vcpu_threads); } if (vcpu_fds) free(vcpu_fds); // clean up and close KVM close_fd(&vmfd); close_fd(&kvm); } static void* wait_for_packet(void* arg) { int ret; struct pollfd fds = { .fd = netfd, .events = POLLIN, .revents = 0}; while(1) { fds.revents = 0; ret = poll(&fds, 1, -1000); if (ret < 0 && errno == EINTR) continue; if (ret < 0) perror("poll()"); else if (ret) { uint64_t event_counter = 1; write(efd, &event_counter, sizeof(event_counter)); sem_wait(&net_sem); } } return NULL; } static inline void check_network(void) { // should we start the network thread? if ((efd < 0) && (getenv("HERMIT_NETIF"))) { struct kvm_irqfd irqfd = {}; efd = eventfd(0, 0); irqfd.fd = efd; irqfd.gsi = UHYVE_IRQ; kvm_ioctl(vmfd, KVM_IRQFD, &irqfd); sem_init(&net_sem, 0, 0); if (pthread_create(&net_thread, NULL, wait_for_packet, NULL)) err(1, "unable to create thread"); } } static int vcpu_loop(void) { int ret; pthread_barrier_wait(&barrier); if (restart) { vcpu_state_t cpu_state = read_cpu_state(); restore_cpu_state(cpu_state); } else if (vcpu_thread_states) { restore_cpu_state(vcpu_thread_states[cpuid]); } else { init_cpu_state(elf_entry); } if (cpuid == 0) { if (restart) { no_checkpoint++; } else if (migration) { free(vcpu_thread_states); vcpu_thread_states = NULL; } } while (1) { ret = ioctl(vcpufd, KVM_RUN, NULL); if(ret == -1) { switch(errno) { case EINTR: continue; case EFAULT: { struct kvm_regs regs; kvm_ioctl(vcpufd, KVM_GET_REGS, ®s); #ifdef __x86_64__ err(1, "KVM: host/guest translation fault: rip=0x%llx", regs.rip); #else err(1, "KVM: host/guest translation fault: elr_el1=0x%llx", regs.elr_el1); #endif } default: err(1, "KVM: ioctl KVM_RUN in vcpu_loop for cpuid %d failed", cpuid); break; } } uint64_t port = 0; unsigned raddr = 0; /* handle requests */ switch (run->exit_reason) { case KVM_EXIT_HLT: fprintf(stderr, "Guest has halted the CPU, this is considered as a normal exit.\n"); return 0; case KVM_EXIT_MMIO: port = run->mmio.phys_addr; if (run->mmio.is_write) memcpy(&raddr, run->mmio.data, sizeof(raddr) /*run->mmio.len*/); //printf("KVM: handled KVM_EXIT_MMIO at 0x%lx (data %u)\n", port, raddr); case KVM_EXIT_IO: if (!port) { port = run->io.port; raddr = *((unsigned*)((size_t)run+run->io.data_offset)); } //printf("port 0x%x\n", run->io.port); switch (port) { case UHYVE_UART_PORT: if (verbose) putc((unsigned char) raddr, stderr); break; case UHYVE_PORT_WRITE: { uhyve_write_t* uhyve_write = (uhyve_write_t*) (guest_mem+raddr); uhyve_write->len = write(uhyve_write->fd, guest_mem+(size_t)uhyve_write->buf, uhyve_write->len); break; } case UHYVE_PORT_READ: { uhyve_read_t* uhyve_read = (uhyve_read_t*) (guest_mem+raddr); uhyve_read->ret = read(uhyve_read->fd, guest_mem+(size_t)uhyve_read->buf, uhyve_read->len); break; } case UHYVE_PORT_EXIT: { if (cpuid) pthread_exit((int*)(guest_mem+raddr)); else exit(*(int*)(guest_mem+raddr)); break; } case UHYVE_PORT_OPEN: { uhyve_open_t* uhyve_open = (uhyve_open_t*) (guest_mem+raddr); char rpath[PATH_MAX]; // forbid to open the kvm device if (realpath((const char*)guest_mem+(size_t)uhyve_open->name, rpath) < 0) uhyve_open->ret = -1; else if (strcmp(rpath, "/dev/kvm") == 0) uhyve_open->ret = -1; else uhyve_open->ret = open((const char*)guest_mem+(size_t)uhyve_open->name, uhyve_open->flags, uhyve_open->mode); break; } case UHYVE_PORT_CLOSE: { uhyve_close_t* uhyve_close = (uhyve_close_t*) (guest_mem+raddr); if (uhyve_close->fd > 2) uhyve_close->ret = close(uhyve_close->fd); else uhyve_close->ret = 0; break; } case UHYVE_PORT_NETINFO: { uhyve_netinfo_t* uhyve_netinfo = (uhyve_netinfo_t*)(guest_mem+raddr); memcpy(uhyve_netinfo->mac_str, uhyve_get_mac(), 18); // guest configure the ethernet device => start network thread check_network(); break; } case UHYVE_PORT_NETWRITE: { uhyve_netwrite_t* uhyve_netwrite = (uhyve_netwrite_t*)(guest_mem + raddr); uhyve_netwrite->ret = 0; ret = write(netfd, guest_mem + (size_t)uhyve_netwrite->data, uhyve_netwrite->len); if (ret >= 0) { uhyve_netwrite->ret = 0; uhyve_netwrite->len = ret; } else { uhyve_netwrite->ret = -1; } break; } case UHYVE_PORT_NETREAD: { uhyve_netread_t* uhyve_netread = (uhyve_netread_t*)(guest_mem + raddr); ret = read(netfd, guest_mem + (size_t)uhyve_netread->data, uhyve_netread->len); if (ret > 0) { uhyve_netread->len = ret; uhyve_netread->ret = 0; } else { uhyve_netread->ret = -1; sem_post(&net_sem); } break; } case UHYVE_PORT_NETSTAT: { uhyve_netstat_t* uhyve_netstat = (uhyve_netstat_t*)(guest_mem + raddr); char* str = getenv("HERMIT_NETIF"); if (str) uhyve_netstat->status = 1; else uhyve_netstat->status = 0; break; } case UHYVE_PORT_LSEEK: { uhyve_lseek_t* uhyve_lseek = (uhyve_lseek_t*) (guest_mem+raddr); uhyve_lseek->offset = lseek(uhyve_lseek->fd, uhyve_lseek->offset, uhyve_lseek->whence); break; } case UHYVE_PORT_CMDSIZE: { int i; uhyve_cmdsize_t *val = (uhyve_cmdsize_t *) (guest_mem+raddr); val->argc = uhyve_argc; for(i=0; iargsz[i] = strlen(uhyve_argv[i]) + 1; val->envc = uhyve_envc; for(i=0; ienvsz[i] = strlen(uhyve_envp[i]) + 1; break; } case UHYVE_PORT_CMDVAL: { int i; char **argv_ptr, **env_ptr; uhyve_cmdval_t *val = (uhyve_cmdval_t *) (guest_mem+raddr); /* argv */ argv_ptr = (char **)(guest_mem + (size_t)val->argv); for(i=0; ienvp); for(i=0; ifail_entry.hardware_entry_failure_reason); break; case KVM_EXIT_INTERNAL_ERROR: err(1, "KVM: internal error exit: suberror = 0x%x\n", run->internal.suberror); break; case KVM_EXIT_SHUTDOWN: fprintf(stderr, "KVM: receive shutdown command\n"); case KVM_EXIT_DEBUG: print_registers(); exit(EXIT_FAILURE); default: fprintf(stderr, "KVM: unhandled exit: exit_reason = 0x%x\n", run->exit_reason); exit(EXIT_FAILURE); } } close(vcpufd); vcpufd = -1; return 0; } static int vcpu_init(void) { vcpu_fds[cpuid] = vcpufd = kvm_ioctl(vmfd, KVM_CREATE_VCPU, cpuid); /* Map the shared kvm_run structure and following data. */ size_t mmap_size = (size_t) kvm_ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL); if (mmap_size < sizeof(*run)) err(1, "KVM: invalid VCPU_MMAP_SIZE: %zd", mmap_size); run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0); if (run == MAP_FAILED) err(1, "KVM: VCPU mmap failed"); return 0; } static void sigusr_handler(int signum) { pthread_barrier_wait(&barrier); write_cpu_state(); pthread_barrier_wait(&barrier); } static void vcpu_thread_mig_handler(int signum) { /* memory should be allocated at this point */ assert(vcpu_thread_states != NULL); /* ensure consistency among VCPUs */ pthread_barrier_wait(&barrier); /* save state */ vcpu_thread_states[cpuid] = save_cpu_state(); /* synchronize with migration thread */ pthread_barrier_wait(&migration_barrier); /* wait to be killed */ pthread_barrier_wait(&migration_barrier); } static void* uhyve_thread(void* arg) { size_t ret; struct sigaction sa; pthread_cleanup_push(uhyve_exit, NULL); cpuid = (size_t) arg; /* install signal handler for checkpoint */ memset(&sa, 0x00, sizeof(sa)); sa.sa_handler = &sigusr_handler; sigaction(SIGTHRCHKP, &sa, NULL); /* install signal handler for migration */ memset(&sa, 0x00, sizeof(sa)); sa.sa_handler = &vcpu_thread_mig_handler; sigaction(SIGTHRMIG, &sa, NULL); // create new cpu vcpu_init(); pthread_barrier_wait(&barrier); // run cpu loop until thread gets killed ret = vcpu_loop(); pthread_cleanup_pop(1); return (void*) ret; } void sigterm_handler(int signum) { pthread_exit(0); } int uhyve_init(char *path) { FILE *f = NULL; guest_path = path; signal(SIGTERM, sigterm_handler); // register routine to close the VM atexit(uhyve_atexit); const char *start_mig_server = getenv("HERMIT_MIGRATION_SERVER"); /* * Three startups * a) incoming migration * b) load existing checkpoint * c) normal run */ if (start_mig_server) { migration = true; migration_metadata_t metadata; wait_for_incomming_migration(&metadata, MIGRATION_PORT); ncores = metadata.ncores; guest_size = metadata.guest_size; elf_entry = metadata.elf_entry; full_checkpoint = metadata.full_checkpoint; } else if ((f = fopen("checkpoint/chk_config.txt", "r")) != NULL) { int tmp = 0; restart = true; fscanf(f, "number of cores: %u\n", &ncores); fscanf(f, "memory size: 0x%zx\n", &guest_size); fscanf(f, "checkpoint number: %u\n", &no_checkpoint); fscanf(f, "entry point: 0x%zx", &elf_entry); fscanf(f, "full checkpoint: %d", &tmp); full_checkpoint = tmp ? true : false; if (verbose) fprintf(stderr, "Restart from checkpoint %u " "(ncores %d, mem size 0x%zx)\n", no_checkpoint, ncores, guest_size); fclose(f); } else { const char* hermit_memory = getenv("HERMIT_MEM"); if (hermit_memory) guest_size = memparse(hermit_memory); const char* hermit_cpus = getenv("HERMIT_CPUS"); if (hermit_cpus) ncores = (uint32_t) atoi(hermit_cpus); const char* full_chk = getenv("HERMIT_FULLCHECKPOINT"); if (full_chk && (strcmp(full_chk, "0") != 0)) full_checkpoint = true; } vcpu_threads = (pthread_t*) calloc(ncores, sizeof(pthread_t)); if (!vcpu_threads) err(1, "Not enough memory"); vcpu_fds = (int*) calloc(ncores, sizeof(int)); if (!vcpu_fds) err(1, "Not enough memory"); kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC); if (kvm < 0) err(1, "Could not open: /dev/kvm"); /* Make sure we have the stable version of the API */ int kvm_api_version = kvm_ioctl(kvm, KVM_GET_API_VERSION, NULL); if (kvm_api_version != 12) err(1, "KVM: API version is %d, uhyve requires version 12", kvm_api_version); /* Create the virtual machine */ vmfd = kvm_ioctl(kvm, KVM_CREATE_VM, 0); #ifdef __x86_64__ init_kvm_arch(); if (restart) { if (load_checkpoint(guest_mem, path) != 0) exit(EXIT_FAILURE); } else if (start_mig_server) { load_migration_data(guest_mem); close_migration_channel(); } else { if (load_kernel(guest_mem, path) != 0) exit(EXIT_FAILURE); } #endif pthread_barrier_init(&barrier, NULL, ncores); pthread_barrier_init(&migration_barrier, NULL, ncores+1); cpuid = 0; // create first CPU, it will be the boot processor by default int ret = vcpu_init(); const char* netif_str = getenv("HERMIT_NETIF"); if (netif_str) { // TODO: strncmp for different network interfaces // for example tun/tap device or uhyvetap device netfd = uhyve_net_init(netif_str); if (netfd < 0) err(1, "unable to initialized network"); } return ret; } int uhyve_loop(int argc, char **argv) { const char* hermit_check = getenv("HERMIT_CHECKPOINT"); const char* hermit_mig_support = getenv("HERMIT_MIGRATION_SUPPORT"); const char* hermit_mig_type = getenv("HERMIT_MIGRATION_TYPE"); int ts = 0, i = 0; /* argv[0] is 'proxy', do not count it */ uhyve_argc = argc-1; uhyve_argv = &argv[1]; uhyve_envp = environ; while(uhyve_envp[i] != NULL) i++; uhyve_envc = i; if (uhyve_argc > MAX_ARGC_ENVC) { fprintf(stderr, "uhyve downsiize envc from %d to %d\n", uhyve_argc, MAX_ARGC_ENVC); uhyve_argc = MAX_ARGC_ENVC; } if (uhyve_envc > MAX_ARGC_ENVC-1) { fprintf(stderr, "uhyve downsiize envc from %d to %d\n", uhyve_envc, MAX_ARGC_ENVC-1); uhyve_envc = MAX_ARGC_ENVC-1; } if(uhyve_argc > MAX_ARGC_ENVC || uhyve_envc > MAX_ARGC_ENVC) { fprintf(stderr, "uhyve cannot forward more than %d command line " "arguments or environment variables, please consider increasing " "the MAX_ARGC_ENVP cmake argument\n", MAX_ARGC_ENVC); return -1; } if (hermit_check) ts = atoi(hermit_check); if (hermit_mig_support) { set_migration_target(hermit_mig_support, MIGRATION_PORT); set_migration_type(hermit_mig_type); /* block SIGUSR1 in main thread */ sigemptyset (&signal_mask); sigaddset (&signal_mask, SIGUSR1); pthread_sigmask (SIG_BLOCK, &signal_mask, NULL); /* start migration thread; handles SIGUSR1 */ pthread_t sig_thr_id; pthread_create (&sig_thr_id, NULL, migration_handler, (void *)&signal_mask); /* install signal handler for migration */ struct sigaction sa; memset(&sa, 0x00, sizeof(sa)); sa.sa_handler = &vcpu_thread_mig_handler; sigaction(SIGTHRMIG, &sa, NULL); } // First CPU is special because it will boot the system. Other CPUs will // be booted linearily after the first one. vcpu_threads[0] = pthread_self(); // start threads to create VCPUs for(size_t i = 1; i < ncores; i++) pthread_create(&vcpu_threads[i], NULL, uhyve_thread, (void*) i); pthread_barrier_wait(&barrier); #ifdef __aarch64__ init_kvm_arch(); if (restart) { if (load_checkpoint(guest_mem, guest_path) != 0) exit(EXIT_FAILURE); } else { if (load_kernel(guest_mem, guest_path) != 0) exit(EXIT_FAILURE); } #endif *((uint32_t*) (mboot+0x24)) = ncores; if (ts > 0) { struct sigaction sa; struct itimerval timer; /* Install timer_handler as the signal handler for SIGVTALRM. */ memset(&sa, 0x00, sizeof(sa)); sa.sa_handler = &timer_handler; sigaction(SIGALRM, &sa, NULL); /* Configure the timer to expire after "ts" sec... */ timer.it_value.tv_sec = ts; timer.it_value.tv_usec = 0; /* ... and every "ts" sec after that. */ timer.it_interval.tv_sec = ts; timer.it_interval.tv_usec = 0; /* Start a virtual timer. It counts down whenever this process is executing. */ setitimer(ITIMER_REAL, &timer, NULL); } // Run first CPU return vcpu_loop(); }