1
0
Fork 0
mirror of https://github.com/hermitcore/libhermit.git synced 2025-03-09 00:00:03 +01:00
libhermit/tools/uhyve.c

813 lines
19 KiB
C

/* Copyright (c) 2015, IBM
* Author(s): Dan Williams <djwillia@us.ibm.com>
* Ricardo Koller <kollerr@us.ibm.com>
* Copyright (c) 2017, RWTH Aachen University
* Author(s): Stefan Lankes <slankes@eonerc.rwth-aachen.de>
*
* Permission to use, copy, modify, and/or distribute this software
* for any purpose with or without fee is hereby granted, provided
* that the above copyright notice and this permission notice appear
* in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
* WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
* AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/* We used several existing projects as guides
* kvmtest.c: http://lwn.net/Articles/658512/
* Solo5: https://github.com/Solo5/solo5
*/
/*
* 15.1.2017: extend original version (https://github.com/Solo5/solo5)
* for HermitCore
* 25.2.2017: add SMP support to enable more than one core
* 24.4.2017: add checkpoint/restore support,
* remove memory limit
*/
#define _GNU_SOURCE
#include <arpa/inet.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <limits.h>
#include <pthread.h>
#include <semaphore.h>
#include <elf.h>
#include <err.h>
#include <poll.h>
#include <sys/wait.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/eventfd.h>
#include <linux/const.h>
#include <linux/kvm.h>
#include "uhyve.h"
#include "uhyve-syscalls.h"
#include "uhyve-migration.h"
#include "uhyve-net.h"
#include "uhyve-gdb.h"
#include "proxy.h"
// ---------------------------------------------------------------------------
// Hypervisor state (file scope)
// ---------------------------------------------------------------------------
static bool restart = false;          // restarting from an on-disk checkpoint?
static bool migration = false;        // booted via an incoming live migration?
static pthread_t net_thread;          // polling thread created by check_network()
static int* vcpu_fds = NULL;          // one KVM VCPU fd per core
static pthread_mutex_t kvm_lock = PTHREAD_MUTEX_INITIALIZER;  // first locker does global teardown
extern bool verbose;                  // defined in proxy; enables guest UART output
static char* guest_path = NULL;       // path to the guest kernel image
static bool uhyve_gdb_enabled = false; // set when HERMIT_DEBUG != 0
size_t guest_size = 0x20000000ULL;    // guest memory size, default 0x20000000 = 512 MiB
bool full_checkpoint = false;         // full (vs. incremental) checkpoints
pthread_barrier_t barrier;            // rendezvous of all VCPU threads
pthread_barrier_t migration_barrier;  // rendezvous of VCPUs with the migration thread
pthread_t* vcpu_threads = NULL;       // one thread per core (index 0 = main thread)
uint8_t* klog = NULL;                 // guest kernel log buffer
uint8_t* guest_mem = NULL;            // base of the guest physical memory mapping
uint32_t no_checkpoint = 0;           // number of the next checkpoint to write
uint32_t ncores = 1;                  // number of guest cores
uint64_t elf_entry;                   // guest entry point (from the ELF header)
int kvm = -1, vmfd = -1, netfd = -1, efd = -1;  // /dev/kvm, VM, network and irq-eventfd fds
uint8_t* mboot = NULL;                // boot info page inside guest memory

// per-VCPU-thread state
__thread struct kvm_run *run = NULL;  // mmap'ed kvm_run communication area
__thread int vcpufd = -1;             // this thread's VCPU fd
__thread uint32_t cpuid = 0;          // this thread's core id

static sem_t net_sem;                 // guest posts this when the netif is drained

// guest command line / environment forwarded by the proxy
int uhyve_argc = -1;
int uhyve_envc = -1;
char **uhyve_argv = NULL;
extern char **environ;
char **uhyve_envp = NULL;

// per-VCPU register states collected while a migration is in flight
vcpu_state_t *vcpu_thread_states = NULL;

static sigset_t signal_mask;          // SIGUSR1 mask for the main thread
/* Sizes of the forwarded command line and environment, exchanged with the
 * guest via the UHYVE_PORT_CMDSIZE hypercall.  The layout must match the
 * guest-side definition exactly, hence packed. */
typedef struct {
	int argc;                    // number of forwarded arguments
	int argsz[MAX_ARGC_ENVC];    // strlen+1 of each argument
	int envc;                    // number of forwarded environment variables
	int envsz[MAX_ARGC_ENVC];    // strlen+1 of each environment variable
} __attribute__ ((packed)) uhyve_cmdsize_t;

/* Target buffers for the UHYVE_PORT_CMDVAL hypercall; the handler treats
 * both pointer arrays as guest-physical addresses (offsets into guest_mem). */
typedef struct {
	char **argv;
	char **envp;
} __attribute__ ((packed)) uhyve_cmdval_t;
/* Parse a human-readable memory size such as "512M", "1g" or "0x2000000".
 *
 * The numeric part is parsed with strtoull(base 0), so decimal, octal and
 * hexadecimal notations all work.  An optional single-letter binary suffix
 * (K/M/G/T/P/E, either case) multiplies the value by the corresponding
 * power of 1024; any other trailing character is ignored.
 *
 * Rewritten from a fall-through switch that shifted in six steps and
 * contained a dead `endptr++`; the suffix now maps directly to one shift.
 *
 * @param ptr  NUL-terminated size string (e.g. the HERMIT_MEM value)
 * @return     size in bytes (no overflow checking is performed)
 */
static uint64_t memparse(const char *ptr)
{
	// local pointer to the first character after the number
	char *endptr;
	uint64_t size = strtoull(ptr, &endptr, 0);
	unsigned shift;

	// map the (optional) binary suffix to a power-of-two shift
	switch (*endptr) {
	case 'E': case 'e': shift = 60; break;
	case 'P': case 'p': shift = 50; break;
	case 'T': case 't': shift = 40; break;
	case 'G': case 'g': shift = 30; break;
	case 'M': case 'm': shift = 20; break;
	case 'K': case 'k': shift = 10; break;
	default:            shift = 0;  break;
	}

	return size << shift;
}
/* Close the file descriptor behind *fd unless it was already closed,
 * and mark it as closed by storing -1.  Safe to call repeatedly. */
static void close_fd(int *fd)
{
	if (*fd == -1)
		return;

	close(*fd);
	*fd = -1;
}
/* Thread-cancellation cleanup handler: close this thread's VCPU and, if we
 * are the first thread to reach the teardown, tell every sibling VCPU thread
 * (and the network thread) to terminate.
 *
 * @param arg  unused (required by the pthread_cleanup_push signature)
 */
static void uhyve_exit(void* arg)
{
	//print_registers();

	// whoever takes kvm_lock first performs the global teardown;
	// everyone else just closes their own VCPU fd
	if (pthread_mutex_trylock(&kvm_lock)) {
		close_fd(&vcpufd);
		return;
	}

	// only the main thread will execute this
	if (vcpu_threads) {
		for (uint32_t i = 0; i < ncores; i++) {
			// never signal ourselves
			if (pthread_self() != vcpu_threads[i])
				pthread_kill(vcpu_threads[i], SIGTERM);
		}

		// stop the network polling thread as well
		if (netfd > 0)
			pthread_kill(net_thread, SIGTERM);
	}

	close_fd(&vcpufd);
}
/* atexit() hook: stop all VCPU threads, wait for them to finish,
 * release the per-core bookkeeping and close the VM and KVM fds. */
static void uhyve_atexit(void)
{
	uhyve_exit(NULL);

	if (vcpu_threads) {
		uint32_t i;

		// reap every VCPU thread except the calling one
		for (i = 0; i < ncores; i++) {
			if (pthread_self() != vcpu_threads[i])
				pthread_join(vcpu_threads[i], NULL);
		}

		free(vcpu_threads);
	}

	free(vcpu_fds);   // free(NULL) is a no-op

	// clean up and close KVM
	close_fd(&vmfd);
	close_fd(&kvm);
}
/* Network polling thread: block until the network fd becomes readable,
 * then inject the virtual IRQ into the guest through the irqfd eventfd
 * and wait until the guest has drained the device before polling again.
 *
 * Fixes: the previous version passed a confusing timeout of -1000 to
 * poll() (any negative value blocks forever; -1 is the documented form)
 * and ignored the return value of write() on the eventfd.
 *
 * @param arg  unused
 * @return     never returns normally (terminated via SIGTERM)
 */
static void* wait_for_packet(void* arg)
{
	int ret;
	struct pollfd fds = { .fd = netfd,
	                      .events = POLLIN,
	                      .revents = 0 };

	while (1) {
		fds.revents = 0;

		// -1 == block indefinitely until the fd is readable
		ret = poll(&fds, 1, -1);

		if (ret < 0 && errno == EINTR)
			continue;

		if (ret < 0) {
			perror("poll()");
		} else if (ret) {
			uint64_t event_counter = 1;

			// kick the guest via KVM_IRQFD; on failure the IRQ is
			// merely delayed until the next packet, so just warn
			if (write(efd, &event_counter, sizeof(event_counter)) < 0)
				perror("write(efd)");

			// wait until the guest re-enables the interface
			sem_wait(&net_sem);
		}
	}

	return NULL;
}
/* Lazily start the network support: create the eventfd used as a KVM irqfd
 * to inject UHYVE_IRQ into the guest and spawn the packet polling thread.
 * Called from the UHYVE_PORT_NETINFO hypercall handler, i.e. once the guest
 * configures its ethernet device.  Idempotent: does nothing once efd is set. */
static inline void check_network(void)
{
	// should we start the network thread?
	if ((efd < 0) && (getenv("HERMIT_NETIF"))) {
		struct kvm_irqfd irqfd = {};

		// NOTE(review): eventfd() can fail and return -1 — unchecked here
		efd = eventfd(0, 0);
		irqfd.fd = efd;
		irqfd.gsi = UHYVE_IRQ;
		kvm_ioctl(vmfd, KVM_IRQFD, &irqfd);

		sem_init(&net_sem, 0, 0);

		if (pthread_create(&net_thread, NULL, wait_for_packet, NULL))
			err(1, "unable to create thread");
	}
}
/* Main execution loop of one VCPU: restore or initialize its register state,
 * then repeatedly enter the guest via KVM_RUN and service every exit
 * (hypercall ports, MMIO, shutdown, debug, internal errors) until the guest
 * halts or the process terminates.
 *
 * @return 0 when the guest halts normally (KVM_EXIT_HLT)
 */
static int vcpu_loop(void)
{
	int ret;

	// rendezvous with all other VCPU threads before touching guest state
	pthread_barrier_wait(&barrier);

	// initial register state comes from a checkpoint file, from received
	// migration data, or from a fresh boot at the ELF entry point
	if (restart) {
		vcpu_state_t cpu_state = read_cpu_state();
		restore_cpu_state(cpu_state);
	} else if (vcpu_thread_states) {
		restore_cpu_state(vcpu_thread_states[cpuid]);
	} else {
		init_cpu_state(elf_entry);
	}

	if (cpuid == 0) {
		if (restart) {
			// continue checkpoint numbering after the restored one
			no_checkpoint++;
		} else if (migration) {
			// incoming migration done: the received states are consumed
			free(vcpu_thread_states);
			vcpu_thread_states = NULL;
		}
	}

	/* init uhyve gdb support */
	if (uhyve_gdb_enabled) {
		if (cpuid == 0)
			uhyve_gdb_init(vcpufd);
		pthread_barrier_wait(&barrier);
	}

	while (1) {
		ret = ioctl(vcpufd, KVM_RUN, NULL);

		if(ret == -1) {
			switch(errno) {
			case EINTR:
				// interrupted by a signal (checkpoint/migration): retry
				continue;
			case EFAULT: {
				struct kvm_regs regs;
				kvm_ioctl(vcpufd, KVM_GET_REGS, &regs);
#ifdef __x86_64__
				err(1, "KVM: host/guest translation fault: rip=0x%llx", regs.rip);
#else
				err(1, "KVM: host/guest translation fault: elr_el1=0x%llx", regs.elr_el1);
#endif
			}
			default:
				err(1, "KVM: ioctl KVM_RUN in vcpu_loop for cpuid %d failed", cpuid);
				break;
			}
		}

		uint64_t port = 0;
		unsigned raddr = 0;

		/* handle requests */
		switch (run->exit_reason) {
		case KVM_EXIT_HLT:
			fprintf(stderr, "Guest has halted the CPU, this is considered as a normal exit.\n");
			if (uhyve_gdb_enabled)
				uhyve_gdb_handle_term();
			return 0;

		case KVM_EXIT_MMIO:
			// treat the MMIO access like a port write: the faulting
			// physical address selects the hypercall
			port = run->mmio.phys_addr;
			if (run->mmio.is_write)
				memcpy(&raddr, run->mmio.data, sizeof(raddr) /*run->mmio.len*/);
			//printf("KVM: handled KVM_EXIT_MMIO at 0x%lx (data %u)\n", port, raddr);
			/* intentional fall through to the port I/O handling */

		case KVM_EXIT_IO:
			if (!port) {
				// a real port I/O exit: read port and payload from kvm_run
				port = run->io.port;
				raddr = *((unsigned*)((size_t)run+run->io.data_offset));
			}
			//printf("port 0x%x\n", run->io.port);

			// for all ports except the UART, raddr is a guest-physical
			// pointer to the hypercall's argument structure
			switch (port) {
			case UHYVE_UART_PORT:
				// guest console output, one character per exit
				if (verbose)
					putc((unsigned char) raddr, stderr);
				break;

			case UHYVE_PORT_WRITE: {
				uhyve_write_t* uhyve_write = (uhyve_write_t*) (guest_mem+raddr);
				uhyve_write->len = write(uhyve_write->fd, guest_mem+(size_t)uhyve_write->buf, uhyve_write->len);
				break;
			}

			case UHYVE_PORT_READ: {
				uhyve_read_t* uhyve_read = (uhyve_read_t*) (guest_mem+raddr);
				uhyve_read->ret = read(uhyve_read->fd, guest_mem+(size_t)uhyve_read->buf, uhyve_read->len);
				break;
			}

			case UHYVE_PORT_EXIT: {
				// secondary cores only end their thread; core 0 ends uhyve
				if (cpuid)
					pthread_exit((int*)(guest_mem+raddr));
				else
					exit(*(int*)(guest_mem+raddr));
				break;
			}

			case UHYVE_PORT_OPEN: {
				uhyve_open_t* uhyve_open = (uhyve_open_t*) (guest_mem+raddr);
				char rpath[PATH_MAX];

				// forbid to open the kvm device
				if (realpath((const char*)guest_mem+(size_t)uhyve_open->name, rpath) < 0)
					uhyve_open->ret = -1;
				else if (strcmp(rpath, "/dev/kvm") == 0)
					uhyve_open->ret = -1;
				else
					uhyve_open->ret = open((const char*)guest_mem+(size_t)uhyve_open->name, uhyve_open->flags, uhyve_open->mode);
				break;
			}

			case UHYVE_PORT_CLOSE: {
				uhyve_close_t* uhyve_close = (uhyve_close_t*) (guest_mem+raddr);

				// never close the host's stdin/stdout/stderr
				if (uhyve_close->fd > 2)
					uhyve_close->ret = close(uhyve_close->fd);
				else
					uhyve_close->ret = 0;
				break;
			}

			case UHYVE_PORT_NETINFO: {
				uhyve_netinfo_t* uhyve_netinfo = (uhyve_netinfo_t*)(guest_mem+raddr);
				memcpy(uhyve_netinfo->mac_str, uhyve_get_mac(), 18);
				// guest configure the ethernet device => start network thread
				check_network();
				break;
			}

			case UHYVE_PORT_NETWRITE: {
				uhyve_netwrite_t* uhyve_netwrite = (uhyve_netwrite_t*)(guest_mem + raddr);
				uhyve_netwrite->ret = 0;
				ret = write(netfd, guest_mem + (size_t)uhyve_netwrite->data, uhyve_netwrite->len);
				if (ret >= 0) {
					uhyve_netwrite->ret = 0;
					uhyve_netwrite->len = ret;
				} else {
					uhyve_netwrite->ret = -1;
				}
				break;
			}

			case UHYVE_PORT_NETREAD: {
				uhyve_netread_t* uhyve_netread = (uhyve_netread_t*)(guest_mem + raddr);
				ret = read(netfd, guest_mem + (size_t)uhyve_netread->data, uhyve_netread->len);
				if (ret > 0) {
					uhyve_netread->len = ret;
					uhyve_netread->ret = 0;
				} else {
					// nothing to read: let the polling thread resume
					uhyve_netread->ret = -1;
					sem_post(&net_sem);
				}
				break;
			}

			case UHYVE_PORT_NETSTAT: {
				uhyve_netstat_t* uhyve_netstat = (uhyve_netstat_t*)(guest_mem + raddr);
				char* str = getenv("HERMIT_NETIF");
				if (str)
					uhyve_netstat->status = 1;
				else
					uhyve_netstat->status = 0;
				break;
			}

			case UHYVE_PORT_LSEEK: {
				uhyve_lseek_t* uhyve_lseek = (uhyve_lseek_t*) (guest_mem+raddr);
				uhyve_lseek->offset = lseek(uhyve_lseek->fd, uhyve_lseek->offset, uhyve_lseek->whence);
				break;
			}

			case UHYVE_PORT_CMDSIZE: {
				// report argv/env element counts and sizes so the guest
				// can allocate the buffers filled by UHYVE_PORT_CMDVAL
				int i;
				uhyve_cmdsize_t *val = (uhyve_cmdsize_t *) (guest_mem+raddr);

				val->argc = uhyve_argc;
				for(i=0; i<uhyve_argc; i++)
					val->argsz[i] = strlen(uhyve_argv[i]) + 1;

				val->envc = uhyve_envc;
				for(i=0; i<uhyve_envc; i++)
					val->envsz[i] = strlen(uhyve_envp[i]) + 1;

				break;
			}

			case UHYVE_PORT_CMDVAL: {
				// copy argv/env strings into guest-provided buffers
				int i;
				char **argv_ptr, **env_ptr;
				uhyve_cmdval_t *val = (uhyve_cmdval_t *) (guest_mem+raddr);

				/* argv */
				argv_ptr = (char **)(guest_mem + (size_t)val->argv);
				for(i=0; i<uhyve_argc; i++)
					strcpy(guest_mem + (size_t)argv_ptr[i], uhyve_argv[i]);

				/* env */
				env_ptr = (char **)(guest_mem + (size_t)val->envp);
				for(i=0; i<uhyve_envc; i++)
					strcpy(guest_mem + (size_t)env_ptr[i], uhyve_envp[i]);

				break;
			}

			default:
				err(1, "KVM: unhandled KVM_EXIT_IO / KVM_EXIT_MMIO at port 0x%lx\n", port);
				break;
			}
			break;

		case KVM_EXIT_FAIL_ENTRY:
			if (uhyve_gdb_enabled)
				uhyve_gdb_handle_exception(vcpufd, GDB_SIGNAL_SEGV);
			err(1, "KVM: entry failure: hw_entry_failure_reason=0x%llx\n",
				run->fail_entry.hardware_entry_failure_reason);
			break;

		case KVM_EXIT_INTERNAL_ERROR:
			if (uhyve_gdb_enabled)
				uhyve_gdb_handle_exception(vcpufd, GDB_SIGNAL_SEGV);
			err(1, "KVM: internal error exit: suberror = 0x%x\n", run->internal.suberror);
			break;

		case KVM_EXIT_SHUTDOWN:
			fprintf(stderr, "KVM: receive shutdown command\n");
			/* intentional fall through: treated like a debug exit */

		case KVM_EXIT_DEBUG:
			if (uhyve_gdb_enabled) {
				uhyve_gdb_handle_exception(vcpufd, GDB_SIGNAL_TRAP);
				break;
			} else print_registers();
			exit(EXIT_FAILURE);

		default:
			fprintf(stderr, "KVM: unhandled exit: exit_reason = 0x%x\n", run->exit_reason);
			exit(EXIT_FAILURE);
		}
	}

	close(vcpufd);
	vcpufd = -1;

	return 0;
}
/* Create this thread's VCPU (KVM_CREATE_VCPU) and map its shared kvm_run
 * communication structure into `run`.
 *
 * @return 0 on success; fatal problems terminate the process via err()
 */
static int vcpu_init(void)
{
	// publish the fd in vcpu_fds so other threads (checkpoint/migration)
	// can reach this VCPU, and keep the thread-local copy in vcpufd
	vcpu_fds[cpuid] = vcpufd = kvm_ioctl(vmfd, KVM_CREATE_VCPU, cpuid);

	/* Map the shared kvm_run structure and following data. */
	size_t mmap_size = (size_t) kvm_ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL);
	if (mmap_size < sizeof(*run))
		err(1, "KVM: invalid VCPU_MMAP_SIZE: %zd", mmap_size);
	run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
	if (run == MAP_FAILED)
		err(1, "KVM: VCPU mmap failed");

	return 0;
}
/* SIGTHRCHKP handler (checkpointing): all VCPU threads meet at the barrier,
 * each writes its own CPU state, and a second barrier ensures the checkpoint
 * is complete before any thread resumes. */
static void sigusr_handler(int signum)
{
	pthread_barrier_wait(&barrier);

	write_cpu_state();

	pthread_barrier_wait(&barrier);
}
/* SIGTHRMIG handler (live migration): save this VCPU's register state into
 * vcpu_thread_states and hand control to the migration thread; the thread
 * then parks on the final barrier until it is killed. */
static void vcpu_thread_mig_handler(int signum)
{
	/* memory should be allocated at this point */
	assert(vcpu_thread_states != NULL);

	/* ensure consistency among VCPUs */
	pthread_barrier_wait(&barrier);

	/* save state */
	vcpu_thread_states[cpuid] = save_cpu_state();

	/* synchronize with migration thread */
	pthread_barrier_wait(&migration_barrier);

	/* wait to be killed */
	pthread_barrier_wait(&migration_barrier);
}
/* Entry point of a secondary VCPU thread: install the checkpoint and
 * migration signal handlers, create the VCPU, and run the VCPU loop until
 * the thread is cancelled or killed (uhyve_exit runs as cleanup handler).
 *
 * @param arg  the core id, passed as an integer smuggled in a pointer
 * @return     the vcpu_loop() result, cast to void*
 */
static void* uhyve_thread(void* arg)
{
	size_t ret;
	struct sigaction sa;

	pthread_cleanup_push(uhyve_exit, NULL);

	cpuid = (size_t) arg;

	/* install signal handler for checkpoint */
	memset(&sa, 0x00, sizeof(sa));
	sa.sa_handler = &sigusr_handler;
	sigaction(SIGTHRCHKP, &sa, NULL);

	/* install signal handler for migration */
	memset(&sa, 0x00, sizeof(sa));
	sa.sa_handler = &vcpu_thread_mig_handler;
	sigaction(SIGTHRMIG, &sa, NULL);

	// create new cpu
	vcpu_init();

	// wait until uhyve_loop() and all sibling threads are ready
	pthread_barrier_wait(&barrier);

	// run cpu loop until thread gets killed
	ret = vcpu_loop();

	// (1) => pop AND execute the cleanup handler
	pthread_cleanup_pop(1);

	return (void*) ret;
}
/* SIGTERM handler: terminate only the current thread — uhyve_exit()
 * delivers SIGTERM individually to each VCPU thread during teardown. */
void sigterm_handler(int signum)
{
	(void) signum;
	pthread_exit(NULL);
}
/* Create and prepare the virtual machine.
 *
 * Determines the startup mode (incoming migration, checkpoint restore, or
 * normal boot), opens /dev/kvm, creates the VM, loads the guest image or
 * checkpoint (on x86_64; aarch64 defers this to uhyve_loop) and creates the
 * boot VCPU.
 *
 * @param path  path to the guest kernel image (also stored in guest_path)
 * @return      return value of vcpu_init(); fatal errors exit the process
 */
int uhyve_init(char *path)
{
	FILE *f = NULL;
	guest_path = path;

	signal(SIGTERM, sigterm_handler);

	// register routine to close the VM
	atexit(uhyve_atexit);

	const char *start_mig_server = getenv("HERMIT_MIGRATION_SERVER");

	/*
	 * Three startups
	 * a) incoming migration
	 * b) load existing checkpoint
	 * c) normal run
	 */
	if (start_mig_server) {
		// (a) VM parameters arrive with the migration metadata
		migration = true;
		migration_metadata_t metadata;
		wait_for_incomming_migration(&metadata, MIGRATION_PORT);

		ncores = metadata.ncores;
		guest_size = metadata.guest_size;
		elf_entry = metadata.elf_entry;
		full_checkpoint = metadata.full_checkpoint;
	} else if ((f = fopen("checkpoint/chk_config.txt", "r")) != NULL) {
		// (b) VM parameters come from the checkpoint config file
		int tmp = 0;
		restart = true;

		// NOTE(review): fscanf results are unchecked — a malformed config
		// silently keeps the previous defaults; %zx targets are size_t/
		// uint64_t, which only match on LP64 hosts — confirm intended
		fscanf(f, "number of cores: %u\n", &ncores);
		fscanf(f, "memory size: 0x%zx\n", &guest_size);
		fscanf(f, "checkpoint number: %u\n", &no_checkpoint);
		fscanf(f, "entry point: 0x%zx", &elf_entry);
		fscanf(f, "full checkpoint: %d", &tmp);
		full_checkpoint = tmp ? true : false;

		if (verbose)
			fprintf(stderr,
				"Restart from checkpoint %u "
				"(ncores %d, mem size 0x%zx)\n",
				no_checkpoint, ncores, guest_size);
		fclose(f);
	} else {
		// (c) normal run: parameters from the environment (or defaults)
		const char* hermit_memory = getenv("HERMIT_MEM");
		if (hermit_memory)
			guest_size = memparse(hermit_memory);

		const char* hermit_cpus = getenv("HERMIT_CPUS");
		if (hermit_cpus)
			ncores = (uint32_t) atoi(hermit_cpus);

		const char* full_chk = getenv("HERMIT_FULLCHECKPOINT");
		if (full_chk && (strcmp(full_chk, "0") != 0))
			full_checkpoint = true;
	}

	vcpu_threads = (pthread_t*) calloc(ncores, sizeof(pthread_t));
	if (!vcpu_threads)
		err(1, "Not enough memory");

	vcpu_fds = (int*) calloc(ncores, sizeof(int));
	if (!vcpu_fds)
		err(1, "Not enough memory");

	kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm < 0)
		err(1, "Could not open: /dev/kvm");

	/* Make sure we have the stable version of the API */
	int kvm_api_version = kvm_ioctl(kvm, KVM_GET_API_VERSION, NULL);
	if (kvm_api_version != 12)
		err(1, "KVM: API version is %d, uhyve requires version 12", kvm_api_version);

	/* Create the virtual machine */
	vmfd = kvm_ioctl(kvm, KVM_CREATE_VM, 0);

#ifdef __x86_64__
	// x86_64 sets up guest memory and loads the image here;
	// aarch64 performs the same steps later in uhyve_loop()
	init_kvm_arch();
	if (restart) {
		if (load_checkpoint(guest_mem, path) != 0)
			exit(EXIT_FAILURE);
	} else if (start_mig_server) {
		load_migration_data(guest_mem);
		close_migration_channel();
	} else {
		if (load_kernel(guest_mem, path) != 0)
			exit(EXIT_FAILURE);
	}
#endif

	pthread_barrier_init(&barrier, NULL, ncores);
	pthread_barrier_init(&migration_barrier, NULL, ncores+1);
	cpuid = 0;

	// create first CPU, it will be the boot processor by default
	int ret = vcpu_init();

	const char* netif_str = getenv("HERMIT_NETIF");
	if (netif_str)
	{
		// TODO: strncmp for different network interfaces
		// for example tun/tap device or uhyvetap device
		netfd = uhyve_net_init(netif_str);

		if (netfd < 0)
			err(1, "unable to initialized network");
	}

	return ret;
}
/* Start all VCPUs and run the boot processor's VCPU loop.
 *
 * Forwards the host command line and environment to the guest, configures
 * optional periodic checkpointing (HERMIT_CHECKPOINT) and live-migration
 * support (HERMIT_MIGRATION_SUPPORT), spawns one thread per additional
 * core and finally enters vcpu_loop() on the boot processor.
 *
 * Fixes: the argc clamp message wrongly said "envc", both clamp messages
 * misspelled "downsize", and the timer comments claimed SIGVTALRM/a
 * virtual timer although the code installs SIGALRM with ITIMER_REAL
 * (real, wall-clock time).
 *
 * @param argc  host argument count; argv[0] is the proxy itself
 * @param argv  host argument vector
 * @return      0 on a regular guest exit, -1 on invalid configuration
 */
int uhyve_loop(int argc, char **argv)
{
	const char* hermit_check = getenv("HERMIT_CHECKPOINT");
	const char* hermit_mig_support = getenv("HERMIT_MIGRATION_SUPPORT");
	const char* hermit_mig_type = getenv("HERMIT_MIGRATION_TYPE");
	const char* hermit_debug = getenv("HERMIT_DEBUG");
	int ts = 0, i = 0;

	if (hermit_debug && (atoi(hermit_debug) != 0))
		uhyve_gdb_enabled = true;

	/* argv[0] is 'proxy', do not count it */
	uhyve_argc = argc-1;
	uhyve_argv = &argv[1];
	uhyve_envp = environ;
	while(uhyve_envp[i] != NULL)
		i++;
	uhyve_envc = i;

	// clamp to what the guest-visible uhyve_cmdsize_t arrays can carry
	if (uhyve_argc > MAX_ARGC_ENVC) {
		fprintf(stderr, "uhyve downsize argc from %d to %d\n", uhyve_argc, MAX_ARGC_ENVC);
		uhyve_argc = MAX_ARGC_ENVC;
	}

	if (uhyve_envc > MAX_ARGC_ENVC-1) {
		fprintf(stderr, "uhyve downsize envc from %d to %d\n", uhyve_envc, MAX_ARGC_ENVC-1);
		uhyve_envc = MAX_ARGC_ENVC-1;
	}

	// NOTE(review): unreachable after the clamping above; kept as a
	// defensive guard in case the clamping is ever removed
	if (uhyve_argc > MAX_ARGC_ENVC || uhyve_envc > MAX_ARGC_ENVC) {
		fprintf(stderr, "uhyve cannot forward more than %d command line "
			"arguments or environment variables, please consider increasing "
			"the MAX_ARGC_ENVP cmake argument\n", MAX_ARGC_ENVC);
		return -1;
	}

	if (hermit_check)
		ts = atoi(hermit_check);

	if (hermit_mig_support) {
		set_migration_target(hermit_mig_support, MIGRATION_PORT);
		set_migration_type(hermit_mig_type);

		/* block SIGUSR1 in main thread */
		sigemptyset (&signal_mask);
		sigaddset (&signal_mask, SIGUSR1);
		pthread_sigmask (SIG_BLOCK, &signal_mask, NULL);

		/* start migration thread; handles SIGUSR1 */
		pthread_t sig_thr_id;
		pthread_create (&sig_thr_id, NULL, migration_handler, (void *)&signal_mask);

		/* install signal handler for migration */
		struct sigaction sa;
		memset(&sa, 0x00, sizeof(sa));
		sa.sa_handler = &vcpu_thread_mig_handler;
		sigaction(SIGTHRMIG, &sa, NULL);
	}

	// First CPU is special because it will boot the system. Other CPUs will
	// be booted linearily after the first one.
	vcpu_threads[0] = pthread_self();

	// start threads to create VCPUs
	for(size_t i = 1; i < ncores; i++)
		pthread_create(&vcpu_threads[i], NULL, uhyve_thread, (void*) i);

	pthread_barrier_wait(&barrier);

#ifdef __aarch64__
	// aarch64 sets up guest memory here (x86_64 did so in uhyve_init)
	init_kvm_arch();
	if (restart) {
		if (load_checkpoint(guest_mem, guest_path) != 0)
			exit(EXIT_FAILURE);
	} else {
		if (load_kernel(guest_mem, guest_path) != 0)
			exit(EXIT_FAILURE);
	}
#endif

	// publish the core count to the guest boot info page
	*((uint32_t*) (mboot+0x24)) = ncores;

	if (ts > 0)
	{
		struct sigaction sa;
		struct itimerval timer;

		/* Install timer_handler as the signal handler for SIGALRM. */
		memset(&sa, 0x00, sizeof(sa));
		sa.sa_handler = &timer_handler;
		sigaction(SIGALRM, &sa, NULL);

		/* Configure the timer to expire after "ts" sec... */
		timer.it_value.tv_sec = ts;
		timer.it_value.tv_usec = 0;
		/* ... and every "ts" sec after that. */
		timer.it_interval.tv_sec = ts;
		timer.it_interval.tv_usec = 0;

		/* Start a real-time (wall clock) interval timer. */
		setitimer(ITIMER_REAL, &timer, NULL);
	}

	// Run first CPU
	return vcpu_loop();
}