1
0
Fork 0
mirror of https://github.com/hermitcore/libhermit.git synced 2025-03-30 00:00:15 +01:00
libhermit/tools/uhyve.c
2017-03-01 18:57:32 +01:00

709 lines
17 KiB
C

/* Copyright (c) 2015, IBM
* Author(s): Dan Williams <djwillia@us.ibm.com>
* Ricardo Koller <kollerr@us.ibm.com>
* Copyright (c) 2017, RWTH Aachen University
* Author(s): Stefan Lankes <slankes@eonerc.rwth-aachen.de>
*
* Permission to use, copy, modify, and/or distribute this software
* for any purpose with or without fee is hereby granted, provided
* that the above copyright notice and this permission notice appear
* in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
* WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
* AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/* We used several existing projects as guides
* kvmtest.c: http://lwn.net/Articles/658512/
* lkvm: http://github.com/clearlinux/kvmtool
*/
/*
* 15.1.2017: extend original version (https://github.com/Solo5/solo5)
* for HermitCore
* 25.2.2017: add SMP support to enable more than one core
*/
#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <limits.h>
#include <assert.h>
#include <pthread.h>
#include <elf.h>
#include <err.h>
#include <sys/wait.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/const.h>
#include <linux/kvm.h>
#include <asm/msr-index.h>
#include "uhyve-cpu.h"
#include "uhyve-syscalls.h"
#include "proxy.h"
#define GUEST_OFFSET 0x0
#define CPUID_FUNC_PERFMON 0x0A
#define GUEST_PAGE_SIZE 0x200000 /* 2 MB pages in guest */
#define BOOT_GDT 0x1000
#define BOOT_INFO 0x2000
#define BOOT_PML4 0x10000
#define BOOT_PDPTE 0x11000
#define BOOT_PDE 0x12000
#define BOOT_GDT_NULL 0
#define BOOT_GDT_CODE 1
#define BOOT_GDT_DATA 2
#define BOOT_GDT_MAX 3
#define KVM_32BIT_MAX_MEM_SIZE (1ULL << 32)
#define KVM_32BIT_GAP_SIZE (768 << 20)
#define KVM_32BIT_GAP_START (KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE)
#define kvm_ioctl(fd, cmd, arg) ({ \
const int ret = ioctl(fd, cmd, arg); \
if(ret == -1) \
err(1, "KVM: ioctl " #cmd " failed"); \
ret; \
})
static uint32_t ncores = 1;
static uint8_t* guest_mem = NULL;
static uint8_t* klog = NULL;
static uint8_t* mboot = NULL;
static size_t guest_size = 0x20000000ULL;
static uint64_t elf_entry;
static pthread_t* vcpu_threads = NULL;
static int kvm = -1, vmfd = -1;
static __thread struct kvm_run *run = NULL;
static __thread int vcpufd = 1;
static uint64_t memparse(const char *ptr)
{
// local pointer to end of parsed string
char *endptr;
// parse number
uint64_t size = strtoull(ptr, &endptr, 0);
// parse size extension, intentional fall-through
switch (*endptr) {
case 'E':
case 'e':
size <<= 10;
case 'P':
case 'p':
size <<= 10;
case 'T':
case 't':
size <<= 10;
case 'G':
case 'g':
size <<= 10;
case 'M':
case 'm':
size <<= 10;
case 'K':
case 'k':
size <<= 10;
endptr++;
default:
break;
}
return size;
}
/// Just close file descriptor if not already done
static inline void close_fd(int* fd)
{
if(*fd != -1) {
close(*fd);
*fd = -1;
}
}
static void sig_func(int sig)
{
(void) sig;
close_fd(&vcpufd);
pthread_exit(NULL);
}
static void uhyve_exit(void)
{
// only the main thread will execute this
if (vcpu_threads) {
for(uint32_t i = 1; i < ncores; i++) {
pthread_kill(vcpu_threads[i], SIGTERM);
pthread_join(vcpu_threads[i], NULL);
}
free(vcpu_threads);
}
char* verbose = getenv("HERMIT_VERBOSE");
if (klog && verbose && (strcmp(verbose, "0") != 0))
{
puts("\nDump kernel log:");
puts("================\n");
printf("%s\n", klog);
}
// clean up and close KVM
close_fd(&vcpufd);
close_fd(&vmfd);
close_fd(&kvm);
}
static uint32_t get_cpufreq(void)
{
char line[128];
uint32_t freq = 0;
char* match;
FILE* fp = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r");
if (fp != NULL) {
if (fgets(line, sizeof(line), fp) != NULL) {
// cpuinfo_max_freq is in kHz
freq = (uint32_t) atoi(line) / 1000;
}
fclose(fp);
} else if( (fp = fopen("/proc/cpuinfo", "r")) ) {
// Resorting to /proc/cpuinfo, however on most systems this will only
// return the current frequency that might change over time.
// Currently only needed when running inside a VM
// read until we find the line indicating cpu frequency
while(fgets(line, sizeof(line), fp) != NULL) {
match = strstr(line, "cpu MHz");
if(match != NULL) {
// advance pointer to beginning of number
while( ((*match < '0') || (*match > '9')) && (*match != '\0') )
match++;
freq = (uint32_t) atoi(match);
break;
}
}
fclose(fp);
}
return freq;
}
static ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
{
ssize_t total = 0;
char *p = buf;
if (count > SSIZE_MAX) {
errno = E2BIG;
return -1;
}
while (count > 0) {
ssize_t nr;
nr = pread(fd, p, count, offset);
if (nr == 0)
return total;
else if (nr == -1 && errno == EINTR)
continue;
else if (nr == -1)
return -1;
count -= nr;
total += nr;
p += nr;
offset += nr;
}
return total;
}
static int load_kernel(uint8_t* mem, char* path)
{
Elf64_Ehdr hdr;
Elf64_Phdr *phdr = NULL;
size_t buflen;
int fd, ret;
int first_load = 1;
fd = open(path, O_RDONLY);
if (fd == -1)
{
perror("Unable to open file");
return -1;
}
ret = pread_in_full(fd, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
// check if the program is a HermitCore file
if (hdr.e_ident[EI_MAG0] != ELFMAG0
|| hdr.e_ident[EI_MAG1] != ELFMAG1
|| hdr.e_ident[EI_MAG2] != ELFMAG2
|| hdr.e_ident[EI_MAG3] != ELFMAG3
|| hdr.e_ident[EI_CLASS] != ELFCLASS64
|| hdr.e_ident[EI_OSABI] != HERMIT_ELFOSABI
|| hdr.e_type != ET_EXEC || hdr.e_machine != EM_X86_64) {
fprintf(stderr, "Inavlide HermitCore file!\n");
goto out;
}
elf_entry = hdr.e_entry;
buflen = hdr.e_phentsize * hdr.e_phnum;
phdr = malloc(buflen);
if (!phdr) {
fprintf(stderr, "Not enough memory\n");
goto out;
}
ret = pread_in_full(fd, phdr, buflen, hdr.e_phoff);
if (ret < 0)
goto out;
/*
* Load all segments with type "LOAD" from the file at offset
* p_offset, and copy that into in memory.
*/
for (Elf64_Half ph_i = 0; ph_i < hdr.e_phnum; ph_i++)
{
uint64_t paddr = phdr[ph_i].p_paddr;
size_t offset = phdr[ph_i].p_offset;
size_t filesz = phdr[ph_i].p_filesz;
size_t memsz = phdr[ph_i].p_memsz;
if (phdr[ph_i].p_type != PT_LOAD)
continue;
//printf("Kernel location 0x%zx, file size 0x%zx\n", paddr, filesz);
ret = pread_in_full(fd, mem+paddr-GUEST_OFFSET, filesz, offset);
if (ret < 0)
goto out;
memset(mem+paddr+filesz-GUEST_OFFSET, 0x00, memsz - filesz);
if (!klog)
klog = mem+paddr+0x5000-GUEST_OFFSET;
if (!mboot)
mboot = mem+paddr-GUEST_OFFSET;
if (first_load) {
first_load = 0;
// initialize kernel
*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x08)) = paddr; // physical start address
*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x10)) = guest_size; // physical limit
*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x18)) = get_cpufreq();
*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x24)) = 1; // number of used cpus
*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x30)) = 0; // apicid
*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x38)) = filesz;
*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x60)) = 1; // numa nodes
*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x94)) = 1; // announce uhyve
}
}
out:
if (phdr)
free(phdr);
close(fd);
return 0;
}
/// Filter CPUID functions that are not supported by the hypervisor and enable
/// features according to our needs.
static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid)
{
for (uint32_t i = 0; i < kvm_cpuid->nent; i++) {
struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i];
switch (entry->function) {
case 1:
// CPUID to define basic cpu features
entry->ecx |= (1U << 31); // propagate that we are running on a hypervisor
entry->edx |= (1U << 5); // enable msr support
break;
case CPUID_FUNC_PERFMON:
// disable it
entry->eax = 0x00;
break;
default:
// Keep the CPUID function as-is
break;
};
}
}
static void setup_system_64bit(struct kvm_sregs *sregs)
{
sregs->cr0 |= X86_CR0_PE;
sregs->efer |= EFER_LME;
}
static void setup_system_page_tables(struct kvm_sregs *sregs, uint8_t *mem)
{
uint64_t *pml4 = (uint64_t *) (mem + BOOT_PML4);
uint64_t *pdpte = (uint64_t *) (mem + BOOT_PDPTE);
uint64_t *pde = (uint64_t *) (mem + BOOT_PDE);
uint64_t paddr;
/*
* For simplicity we currently use 2MB pages and only a single
* PML4/PDPTE/PDE. Sanity check that the guest size is a multiple of the
* page size and will fit in a single PDE (512 entries).
*/
assert((guest_size & (GUEST_PAGE_SIZE - 1)) == 0);
assert(guest_size <= (GUEST_PAGE_SIZE * 512));
memset(pml4, 0x00, 4096);
memset(pdpte, 0x00, 4096);
memset(pde, 0x00, 4096);
*pml4 = BOOT_PDPTE | (X86_PDPT_P | X86_PDPT_RW);
*pdpte = BOOT_PDE | (X86_PDPT_P | X86_PDPT_RW);
for (paddr = 0; paddr < guest_size; paddr += GUEST_PAGE_SIZE, pde++)
*pde = paddr | (X86_PDPT_P | X86_PDPT_RW | X86_PDPT_PS);
sregs->cr3 = BOOT_PML4;
sregs->cr4 |= X86_CR4_PAE;
sregs->cr0 |= X86_CR0_PG;
}
static void setup_system_gdt(struct kvm_sregs *sregs,
uint8_t *mem,
uint64_t off)
{
uint64_t *gdt = (uint64_t *) (mem + off);
struct kvm_segment data_seg, code_seg;
/* flags, base, limit */
gdt[BOOT_GDT_NULL] = GDT_ENTRY(0, 0, 0);
gdt[BOOT_GDT_CODE] = GDT_ENTRY(0xA09B, 0, 0xFFFFF);
gdt[BOOT_GDT_DATA] = GDT_ENTRY(0xC093, 0, 0xFFFFF);
sregs->gdt.base = off;
sregs->gdt.limit = (sizeof(uint64_t) * BOOT_GDT_MAX) - 1;
GDT_TO_KVM_SEGMENT(code_seg, gdt, BOOT_GDT_CODE);
GDT_TO_KVM_SEGMENT(data_seg, gdt, BOOT_GDT_DATA);
sregs->cs = code_seg;
sregs->ds = data_seg;
sregs->es = data_seg;
sregs->fs = data_seg;
sregs->gs = data_seg;
sregs->ss = data_seg;
}
static void setup_system(int vcpufd, uint8_t *mem, uint32_t id)
{
static struct kvm_sregs sregs;
// all cores use the same startup code
// => all cores use the same sregs
// => only the boot processor has to initialize sregs
if (id == 0) {
kvm_ioctl(vcpufd, KVM_GET_SREGS, &sregs);
/* Set all cpu/mem system structures */
setup_system_gdt(&sregs, mem, BOOT_GDT);
setup_system_page_tables(&sregs, mem);
setup_system_64bit(&sregs);
}
kvm_ioctl(vcpufd, KVM_SET_SREGS, &sregs);
}
static void setup_cpuid(int kvm, int vcpufd)
{
struct kvm_cpuid2 *kvm_cpuid;
unsigned int max_entries = 100;
// allocate space for cpuid we get from KVM
kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) +
(max_entries * sizeof(kvm_cpuid->entries[0])) );
kvm_cpuid->nent = max_entries;
kvm_ioctl(kvm, KVM_GET_SUPPORTED_CPUID, kvm_cpuid);
// set features
filter_cpuid(kvm_cpuid);
kvm_ioctl(vcpufd, KVM_SET_CPUID2, kvm_cpuid);
free(kvm_cpuid);
}
static int vcpu_loop(void)
{
int ret;
struct kvm_mp_state state = { KVM_MP_STATE_RUNNABLE };
// be sure that the multiprocessor is runable
kvm_ioctl(vcpufd, KVM_SET_MP_STATE, &state);
while (1) {
ret = kvm_ioctl(vcpufd, KVM_RUN, NULL);
if(ret == -1) {
switch(errno) {
case EINTR:
continue;
case EFAULT: {
struct kvm_regs regs;
kvm_ioctl(vcpufd, KVM_GET_REGS, &regs);
err(1, "KVM: host/guest translation fault: rip=0x%llx", regs.rip);
}
default:
err(1, "KVM: ioctl KVM_RUN in vcpu_loop failed");
break;
}
}
/* handle requests */
switch (run->exit_reason) {
case KVM_EXIT_HLT:
fprintf(stderr, "Guest has halted the CPU, this is considered as a normal exit.\n");
return 0;
case KVM_EXIT_MMIO:
err(1, "KVM: unhandled KVM_EXIT_MMIO at 0x%llx", run->mmio.phys_addr);
break;
case KVM_EXIT_IO:
//printf("port 0x%x\n", run->io.port);
switch (run->io.port) {
case UHYVE_PORT_WRITE: {
unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
uhyve_write_t* uhyve_write = (uhyve_write_t*) (guest_mem+data);
uhyve_write->len = write(uhyve_write->fd, guest_mem+(size_t)uhyve_write->buf, uhyve_write->len);
break;
}
case UHYVE_PORT_READ: {
unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
uhyve_read_t* uhyve_read = (uhyve_read_t*) (guest_mem+data);
uhyve_read->ret = read(uhyve_read->fd, guest_mem+(size_t)uhyve_read->buf, uhyve_read->len);
break;
}
case UHYVE_PORT_EXIT: {
unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
exit(*(int*)(guest_mem+data));
break;
}
case UHYVE_PORT_OPEN: {
unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
uhyve_open_t* uhyve_open = (uhyve_open_t*) (guest_mem+data);
uhyve_open->ret = open((const char*)guest_mem+(size_t)uhyve_open->name, uhyve_open->flags, uhyve_open->mode);
break;
}
case UHYVE_PORT_CLOSE: {
unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
uhyve_close_t* uhyve_close = (uhyve_close_t*) (guest_mem+data);
if (uhyve_close->ret > 2)
uhyve_close->ret = close(uhyve_close->fd);
break;
}
case UHYVE_PORT_LSEEK: {
unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
uhyve_lseek_t* uhyve_lseek = (uhyve_lseek_t*) (guest_mem+data);
uhyve_lseek->offset = lseek(uhyve_lseek->fd, uhyve_lseek->offset, uhyve_lseek->whence);
break;
}
default:
err(1, "KVM: unhandled KVM_EXIT_IO at port 0x%x, direction %d", run->io.port, run->io.direction);
break;
}
break;
case KVM_EXIT_FAIL_ENTRY:
err(1, "KVM: entry failure: hw_entry_failure_reason=0x%llx",
run->fail_entry.hardware_entry_failure_reason);
break;
case KVM_EXIT_INTERNAL_ERROR:
err(1, "KVM: internal error exit: suberror = 0x%x", run->internal.suberror);
break;
case KVM_EXIT_SHUTDOWN:
err(1, "KVM: receive shutdown command");
break;
default:
fprintf(stderr, "KVM: unhandled exit: exit_reason = 0x%x\n", run->exit_reason);
exit(EXIT_FAILURE);
}
}
close(vcpufd);
vcpufd = -1;
return 0;
}
static int vcpu_init(uint32_t id)
{
size_t mmap_size;
vcpufd = kvm_ioctl(vmfd, KVM_CREATE_VCPU, id);
/* Setup registers and memory. */
setup_system(vcpufd, guest_mem, id);
struct kvm_regs regs = {
.rip = elf_entry, // entry point to HermitCore
.rflags = 0x2, // POR value required by x86 architecture
};
kvm_ioctl(vcpufd, KVM_SET_REGS, &regs);
/* Map the shared kvm_run structure and following data. */
mmap_size = (size_t) kvm_ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL);
if (mmap_size < sizeof(*run))
err(1, "KVM: invalid VCPU_MMAP_SIZE: %zd", mmap_size);
run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
if (run == MAP_FAILED)
err(1, "KVM: VCPU mmap failed");
setup_cpuid(kvm, vcpufd);
// only one core is able to enter startup code
// => the wait for the predecessor core
while (*((volatile uint32_t*) (mboot + 0x20)) < id)
pthread_yield();
*((volatile uint32_t*) (mboot + 0x30)) = id;
return 0;
}
static void* uhyve_thread(void* arg)
{
const size_t id = (size_t) arg;
// create new cpu
vcpu_init(id);
// run cpu loop until thread gets killed
const size_t ret = vcpu_loop();
return (void*) ret;
}
int uhyve_init(char *path)
{
// register signal handler before going multithread
signal(SIGTERM, sig_func);
// register routine to close the VM
atexit(uhyve_exit);
const char* hermit_memory = getenv("HERMIT_MEM");
if (hermit_memory)
guest_size = memparse(hermit_memory);
kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
if (kvm < 0)
err(1, "Could not open: /dev/kvm");
/* Make sure we have the stable version of the API */
int kvm_api_version = kvm_ioctl(kvm, KVM_GET_API_VERSION, NULL);
if (kvm_api_version != 12)
err(1, "KVM: API version is %d, uhyve requires version 12", kvm_api_version);
/* Create the virtual machine */
vmfd = kvm_ioctl(kvm, KVM_CREATE_VM, 0);
// TODO: we have to create a gap for PCI
assert(guest_size < KVM_32BIT_GAP_SIZE);
/* Allocate page-aligned guest memory. */
guest_mem = mmap(NULL, guest_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (guest_mem == MAP_FAILED)
err(1, "mmap failed");
if (load_kernel(guest_mem, path) != 0)
exit(EXIT_FAILURE);
/* Map it to the second page frame (to avoid the real-mode IDT at 0). */
struct kvm_userspace_memory_region kvm_region = {
.slot = 0,
.guest_phys_addr = GUEST_OFFSET,
.memory_size = guest_size,
.userspace_addr = (uint64_t) guest_mem,
};
kvm_ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &kvm_region);
kvm_ioctl(vmfd, KVM_CREATE_IRQCHIP, NULL);
// create first CPU, it will be the boot processor by default
return vcpu_init(0);
}
int uhyve_loop(void)
{
const char* hermit_cpus = getenv("HERMIT_CPUS");
if (hermit_cpus)
ncores = (uint32_t) atoi(hermit_cpus);
*((uint32_t*) (mboot+0x24)) = ncores;
vcpu_threads = (pthread_t*) calloc(ncores, sizeof(pthread_t));
if (!vcpu_threads)
err(1, "Not enough memory");
// First CPU is special because it will boot the system. Other CPUs will
// be booted linearily after the first one.
vcpu_threads[0] = pthread_self();
// start threads to create VCPUs
for(size_t i = 1; i < ncores; i++)
pthread_create(&vcpu_threads[i], NULL, uhyve_thread, (void*) i);
// Run first CPU
return vcpu_loop();
}