diff --git a/arch/x86/include/asm/stddef.h b/arch/x86/include/asm/stddef.h
index 7678456f0..42d68ed0a 100644
--- a/arch/x86/include/asm/stddef.h
+++ b/arch/x86/include/asm/stddef.h
@@ -205,6 +205,7 @@ typedef struct {
 	size_t ss_size;	/* Stack size. */
 } stack_t;
 
+const int32_t is_uhyve(void);
 const int32_t is_single_kernel(void);
 
 #ifdef __cplusplus
diff --git a/arch/x86/kernel/entry.asm b/arch/x86/kernel/entry.asm
index a8b38fd62..099d0d531 100644
--- a/arch/x86/kernel/entry.asm
+++ b/arch/x86/kernel/entry.asm
@@ -69,6 +69,7 @@ align 4
     global mb_info
     global hbmem_base
     global hbmem_size
+    global uhyve
     base dq 0
     limit dq 0
     cpu_freq dd 0
@@ -92,6 +93,7 @@ align 4
     mb_info dq 0
     hbmem_base dq 0
     hbmem_size dq 0
+    uhyve dd 0
 
 ; Bootstrap page tables are used during the initialization.
 align 4096
@@ -666,6 +668,12 @@ Lgo3:
     add rsp, 16
     iretq
 
+global is_uhyve
+align 64
+is_uhyve:	; returns the 32-bit uhyve flag (non-zero => started by uhyve)
+    mov eax, DWORD [uhyve]
+    ret
+
 global is_single_kernel
 align 64
 is_single_kernel:
diff --git a/arch/x86/kernel/pci.c b/arch/x86/kernel/pci.c
index 79991b69f..114ecf80e 100644
--- a/arch/x86/kernel/pci.c
+++ b/arch/x86/kernel/pci.c
@@ -154,7 +154,7 @@ int pci_get_device_info(uint32_t vendor_id, uint32_t device_id, pci_info_t* info
 	if (!info)
 		return -EINVAL;
 
-	if (!mechanism)
+	if (!mechanism && !is_uhyve())
 		pci_init();
 
 	for (bus = 0; bus < MAX_BUS; bus++) {
diff --git a/arch/x86/kernel/uart.c b/arch/x86/kernel/uart.c
index 3be2dbbf3..8a534455e 100644
--- a/arch/x86/kernel/uart.c
+++ b/arch/x86/kernel/uart.c
@@ -230,6 +230,8 @@ extern const void kernel_start;
 
 int uart_early_init(char* cmdline)
 {
+	if (is_uhyve())
+		return 0;
 #if 1
 	// default value of our QEMU configuration
 	iobase = 0xc110;
@@ -275,6 +277,9 @@ int uart_early_init(char* cmdline)
 
 int uart_init(void)
 {
+	if (is_uhyve())
+		return 0;
+
 #ifdef CONFIG_PCI
 	pci_info_t pci_info;
 	uint32_t bar = 0;
diff --git a/include/hermit/stddef.h b/include/hermit/stddef.h
index 03899cde7..85b879c60 100644
--- a/include/hermit/stddef.h
+++ 
b/include/hermit/stddef.h
@@ -56,6 +56,13 @@ extern "C" {
 
 #define DYNAMIC_TICKS
 
+#define UHYVE_PORT_WRITE	0x499
+#define UHYVE_PORT_OPEN		0x500
+#define UHYVE_PORT_CLOSE	0x501
+#define UHYVE_PORT_READ		0x502
+#define UHYVE_PORT_EXIT		0x503
+#define UHYVE_PORT_LSEEK	0x504
+
 #define BUILTIN_EXPECT(exp, b)	__builtin_expect((exp), (b))
 //#define BUILTIN_EXPECT(exp, b)	(exp)
 #define NORETURN	__attribute__((noreturn))
diff --git a/kernel/main.c b/kernel/main.c
index 2fcd9bce3..e0efe8d7a 100644
--- a/kernel/main.c
+++ b/kernel/main.c
@@ -177,6 +177,9 @@ static int init_netifs(void)
 	LOG_INFO("TCP/IP initialized.\n");
 	sys_sem_free(&sem);
 
+	if (is_uhyve())
+		return -ENODEV;
+
 	if (!is_single_kernel())
 	{
 		/* Set network address variables */
@@ -403,6 +406,17 @@ static int initd(void* arg)
 	// initialize network
 	init_netifs();
 
+	if (is_uhyve())
+	{
+		char* dummy[] = {"app_name", NULL};
+
+		LOG_INFO("Boot time: %d ms\n", (get_clock_tick() * 1000) / TIMER_FREQ);
+		// call user code
+		libc_start(1, dummy, NULL); //argc, argv, environ);
+
+		return 0;
+	}
+
 #if 0
 	if (is_single_kernel()) {
 		char* dummy[] = {"app_name", NULL};
diff --git a/kernel/syscall.c b/kernel/syscall.c
index daf7c5ff9..072e8d034 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -89,24 +90,28 @@ typedef struct {
 /** @brief To be called by the systemcall to exit tasks */
 void NORETURN sys_exit(int arg)
 {
-	sys_exit_t sysargs = {__NR_exit, arg};
-
-	spinlock_irqsave_lock(&lwip_lock);
-	if (libc_sd >= 0)
-	{
-		int s = libc_sd;
-
-		lwip_write(s, &sysargs, sizeof(sysargs));
-		libc_sd = -1;
-
-		spinlock_irqsave_unlock(&lwip_lock);
-
-		// switch to LwIP thread
-		reschedule();
-
-		lwip_close(s);
+	if (is_uhyve()) {
+		outportl(UHYVE_PORT_EXIT, (unsigned) virt_to_phys((size_t) &arg)); // like all other uhyve calls: pass the physical address
 	} else {
-		spinlock_irqsave_unlock(&lwip_lock);
+		sys_exit_t sysargs = {__NR_exit, arg};
+
+		spinlock_irqsave_lock(&lwip_lock);
+		if (libc_sd >= 0)
+		{
+			int s = libc_sd;
+
+			lwip_write(s, &sysargs, sizeof(sysargs));
+			libc_sd = -1;
+
+			spinlock_irqsave_unlock(&lwip_lock);
+
+			// switch to LwIP thread
+			reschedule();
+
+			lwip_close(s);
+		} else {
+			spinlock_irqsave_unlock(&lwip_lock);
+		}
 	}
 
 	do_exit(arg);
@@ -118,8 +123,23 @@ typedef struct {
 	size_t len;
 } __attribute__((packed)) sys_read_t;
 
+typedef struct {
+	int fd;
+	char* buf;
+	size_t len;
+	ssize_t ret;
+} __attribute__((packed)) uhyve_read_t;
+
 ssize_t sys_read(int fd, char* buf, size_t len)
 {
+	if (is_uhyve()) {
+		uhyve_read_t uhyve_args = {fd, (char*) virt_to_phys((size_t) buf), len, -1};
+
+		outportl(UHYVE_PORT_READ, (unsigned)virt_to_phys((size_t)&uhyve_args));
+
+		return uhyve_args.ret;
+	}
+
 	sys_read_t sysargs = {__NR_read, fd, len};
 	ssize_t j, ret;
 	int s;
@@ -175,15 +195,29 @@ typedef struct {
 	size_t len;
 } __attribute__((packed)) sys_write_t;
 
+typedef struct {
+	int fd;
+	const char* buf;
+	size_t len;
+} __attribute__((packed)) uhyve_write_t;
+
 ssize_t sys_write(int fd, const char* buf, size_t len)
 {
-	ssize_t i, ret;
-	sys_write_t sysargs = {__NR_write, fd, len};
-	int s;
-
 	if (BUILTIN_EXPECT(!buf, 0))
 		return -1;
 
+	if (is_uhyve()) {
+		uhyve_write_t uhyve_args = {fd, (const char*) virt_to_phys((size_t) buf), len};
+
+		outportl(UHYVE_PORT_WRITE, (unsigned)virt_to_phys((size_t)&uhyve_args));
+
+		return uhyve_args.len;
+	}
+
+	ssize_t i, ret;
+	int s;
+	sys_write_t sysargs = {__NR_write, fd, len};
+
 	// do we have an LwIP file descriptor?
 	if (fd & LWIP_FD_BIT) {
 		ret = lwip_write(fd & ~LWIP_FD_BIT, buf, len);
@@ -273,8 +307,23 @@ ssize_t sys_sbrk(ssize_t incr)
 	return ret;
 }
 
+typedef struct {
+	const char* name;
+	int flags;
+	int mode;
+	int ret;
+} __attribute__((packed)) uhyve_open_t;
+
 int sys_open(const char* name, int flags, int mode)
 {
+	if (is_uhyve()) {
+		uhyve_open_t uhyve_open = {(const char*)virt_to_phys((size_t)name), flags, mode, -1};
+
+		outportl(UHYVE_PORT_OPEN, (unsigned)virt_to_phys((size_t) &uhyve_open));
+
+		return uhyve_open.ret;
+	}
+
 	int s, i, ret, sysnr = __NR_open;
 	size_t len;
 
@@ -331,8 +380,21 @@ typedef struct {
 	int fd;
 } __attribute__((packed)) sys_close_t;
 
+typedef struct {
+	int fd;
+	int ret;
+} __attribute__((packed)) uhyve_close_t;
+
 int sys_close(int fd)
 {
+	if (is_uhyve()) {
+		uhyve_close_t uhyve_close = {fd, -1};
+
+		outportl(UHYVE_PORT_CLOSE, (unsigned)virt_to_phys((size_t) &uhyve_close));
+
+		return uhyve_close.ret;
+	}
+
 	int ret, s;
 	sys_close_t sysargs = {__NR_close, fd};
 
@@ -449,8 +511,22 @@ typedef struct {
 	int whence;
 } __attribute__((packed)) sys_lseek_t;
 
+typedef struct {
+	int fd;
+	off_t offset;
+	int whence;
+} __attribute__((packed)) uhyve_lseek_t;
+
 off_t sys_lseek(int fd, off_t offset, int whence)
 {
+	if (is_uhyve()) {
+		uhyve_lseek_t uhyve_lseek = { fd, offset, whence };
+
+		outportl(UHYVE_PORT_LSEEK, (unsigned)virt_to_phys((size_t) &uhyve_lseek));
+
+		return uhyve_lseek.offset;
+	}
+
 	off_t off;
 	sys_lseek_t sysargs = {__NR_lseek, fd, offset, whence};
 	int s;
diff --git a/tools/Makefile b/tools/Makefile
index a9fa8b31d..f12c85ecb 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,5 +1,6 @@
 MAKE = make
 CC = gcc
+CP = cp
 CFLAGS = -O2 -Wall -std=gnu99 $(ARCH_OPT)
 PROXYFILES = proxy init.sh $(shell find ../usr/tests ../usr/benchmarks ../usr/openmpbench -type f -executable)
 
@@ -13,13 +14,13 @@ endif
 # other implicit rules
 %.o : %.c
 	@echo [CC] $@
-	$Q$(CC) -c $(CFLAGS) -o $@ $<
+	$Q$(CC) -c 
$(CFLAGS) -pthread -o $@ $<
 
 all: proxy
 
-proxy: proxy.o
+proxy: proxy.o uhyve.o
 	@echo [LD] $@
-	$Q$(CC) $(CFLAGS) -o $@ $<
+	$Q$(CC) $(CFLAGS) -pthread -o $@ $< uhyve.o
 
 clean:
 	@echo Cleaning tools
diff --git a/tools/proxy.c b/tools/proxy.c
index c1229c691..aeaa618d9 100644
--- a/tools/proxy.c
+++ b/tools/proxy.c
@@ -47,6 +47,8 @@
 #include
 #include
 
+#include "proxy.h"
+
 #define MAX_PATH	255
 #define MAX_ARGS	1024
 #define INADDR(a, b, c, d) (struct in_addr) { .s_addr = ((((((d) << 8) | (c)) << 8) | (b)) << 8) | (a) }
@@ -54,14 +56,6 @@
 #define HERMIT_PORT	0x494E
 #define HERMIT_IP(isle)	INADDR(192, 168, 28, isle + 2)
 #define HERMIT_MAGIC	0x7E317
-#define HERMIT_ELFOSABI	0x42
-
-#define __HERMIT_exit	0
-#define __HERMIT_write	1
-#define __HERMIT_open	2
-#define __HERMIT_close	3
-#define __HERMIT_read	4
-#define __HERMIT_lseek	5
 
 #define EVENT_SIZE (sizeof (struct inotify_event))
 #define BUF_LEN (1024 * (EVENT_SIZE + 16))
@@ -80,24 +74,27 @@ static void stop_hermit(void);
 static void dump_log(void);
 static int init_multi(char *path);
 static int init_qemu(char *path);
+int init_uhyve(char *path);
 
-static void fini_env(void)
+static void fini_qemu(void)
 {
-	if (qemu) {
-		int status = 0;
+	int status = 0;
 
-		if (id) {
-			kill(id, SIGINT);
-			wait(&status);
-		}
+	if (id) {
+		kill(id, SIGINT);
+		wait(&status);
+	}
 
-		dump_log();
-		puts("");
-		unlink(tmpname);
-	} else {
-		dump_log();
-		stop_hermit();
-}	}
+	dump_log();
+	puts("");
+	unlink(tmpname);
+}
+
+static void fini_multi(void)
+{
+	dump_log();
+	stop_hermit();
+}
 
 static void exit_handler(int sig)
 {
@@ -126,7 +123,7 @@ static char* cpufreq(void)
 			;
 		*point = '\0';
 
-		snprintf(cmdline, MAX_PATH, "-freq%s", match);
+		snprintf(cmdline, MAX_PATH, "-freq%s", match);
 		fclose(fp);
 
 		return cmdline;
@@ -139,6 +136,7 @@ static int init_env(char *path)
 {
 	char* str;
 	struct sigaction sINT, sTERM;
+	unsigned int uhyve = 0;
 
 	// define action for SIGINT
 	sINT.sa_handler = exit_handler;
@@ -163,6 +161,11 @@ static int init_env(char *path)
 	{
 		if (strncmp(str, "qemu", 4) == 0) {
 			qemu = 1;
+			uhyve = 0;
+			isle_nr = 0;
+		} else if (strncmp(str, "uhyve", 5) == 0) {
+			uhyve = 1;
+			qemu = 0;
 			isle_nr = 0;
 		} else {
 			isle_nr = atoi(str);
@@ -179,10 +182,15 @@ static int init_env(char *path)
 		port = HERMIT_PORT;
 	}
 
-	if (qemu)
+	if (qemu) {
+		atexit(fini_qemu);
 		return init_qemu(path);
-	else
+	} else if (uhyve) {
+		return init_uhyve(path);
+	} else {
+		atexit(fini_multi);
 		return init_multi(path);
+	}
 }
 
 static int is_hermit_available(void)
@@ -820,7 +828,9 @@ int main(int argc, char **argv)
 	struct sockaddr_in serv_name;
 
 	init_env(argv[1]);
-	atexit(fini_env);
+
+	// in case of uhyve, we will never reach this point
+	// => we could now establish an IP connection to HermitCore
 
 #if 0
 	// check if mmnif interface is available
@@ -831,7 +841,7 @@ int main(int argc, char **argv)
 	strncpy(ethreq.ifr_name, "mmnif", IFNAMSIZ);
 
 	while(1) {
-		/* this socket doesn't really matter, we just need a descriptor 
+		/* this socket doesn't really matter, we just need a descriptor
 		 * to perform the ioctl on */
 		s = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
 		ioctl(s, SIOCGIFFLAGS, &ethreq);
diff --git a/tools/proxy.h b/tools/proxy.h
new file mode 100644
index 000000000..d429d8a67
--- /dev/null
+++ b/tools/proxy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Stefan Lankes, RWTH Aachen University
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of the University nor the names of its contributors
+ *      may be used to endorse or promote products derived from this
+ *      software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __PROXY_H__
+#define __PROXY_H__
+
+#include
+
+#define HERMIT_ELFOSABI	0x42
+
+#define __HERMIT_exit	0
+#define __HERMIT_write	1
+#define __HERMIT_open	2
+#define __HERMIT_close	3
+#define __HERMIT_read	4
+#define __HERMIT_lseek	5
+
+#endif
diff --git a/tools/uhyve-cpu.h b/tools/uhyve-cpu.h
new file mode 100644
index 000000000..26744ec99
--- /dev/null
+++ b/tools/uhyve-cpu.h
@@ -0,0 +1,108 @@
+#ifndef __UHYVE_CPU_H__
+#define __UHYVE_CPU_H__
+
+#ifndef _BITUL
+
+#ifdef __ASSEMBLY__
+#define _AC(X,Y)	X
+#define _AT(T,X)	X
+#else
+#define __AC(X,Y)	(X##Y)
+#define _AC(X,Y)	__AC(X,Y)
+#define _AT(T,X)	((T)(X))
+#endif
+
+#define _BITUL(x)	(_AC(1,UL) << (x))
+#define _BITULL(x)	(_AC(1,ULL) << (x))
+
+#endif
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF	0x00000001 /* Carry Flag */
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE_BIT	0 /* Protection Enable */
+#define X86_CR0_PE	_BITUL(X86_CR0_PE_BIT)
+#define X86_CR0_PG_BIT	31 /* Paging */
+#define X86_CR0_PG	_BITUL(X86_CR0_PG_BIT)
+
+/*
+ * Intel CPU features in CR4
+ */
+#define X86_CR4_PAE_BIT	5 /* enable physical address extensions */
+#define X86_CR4_PAE	_BITUL(X86_CR4_PAE_BIT)
+
+/*
+ * Intel long mode page directory/table entries
+ */
+#define X86_PDPT_P_BIT	0 /* Present */
+#define X86_PDPT_P	_BITUL(X86_PDPT_P_BIT)
+#define X86_PDPT_RW_BIT	1 /* Writable */
+#define X86_PDPT_RW	_BITUL(X86_PDPT_RW_BIT)
+#define X86_PDPT_PS_BIT	7 /* Page size */
+#define X86_PDPT_PS	_BITUL(X86_PDPT_PS_BIT)
+
+/*
+ * GDT and KVM segment manipulation
+ */
+
+#define GDT_DESC_OFFSET(n) ((n) * 0x8)
+
+#define GDT_GET_BASE(x) ( \
+	(((x) & 0xFF00000000000000) >> 32) | \
+	(((x) & 0x000000FF00000000) >> 16) | \
+	(((x) & 0x00000000FFFF0000) >> 16))
+
+#define GDT_GET_LIMIT(x) (__u32)( \
+	(((x) & 0x000F000000000000) >> 32) | \
+	(((x) & 0x000000000000FFFF)))
+
+/* Constructor for a conventional segment GDT (or LDT) entry */
+/* This is a macro so it can be used in initializers */
+#define GDT_ENTRY(flags, base, limit) \
+	((((base) & _AC(0xff000000, ULL)) << (56-24)) | \
+	(((flags) & _AC(0x0000f0ff, ULL)) << 40) | \
+	(((limit) & _AC(0x000f0000, ULL)) << (48-16)) | \
+	(((base) & _AC(0x00ffffff, ULL)) << 16) | \
+	(((limit) & _AC(0x0000ffff, ULL))))
+
+struct _kvm_segment {
+	__u64 base;
+	__u32 limit;
+	__u16 selector;
+	__u8 type;
+	__u8 present, dpl, db, s, l, g, avl;
+	__u8 unusable;
+	__u8 padding;
+};
+
+#define GDT_GET_G(x)	(__u8)(((x) & 0x0080000000000000) >> 55)
+#define GDT_GET_DB(x)	(__u8)(((x) & 0x0040000000000000) >> 54)
+#define GDT_GET_L(x)	(__u8)(((x) & 0x0020000000000000) >> 53)
+#define GDT_GET_AVL(x)	(__u8)(((x) & 0x0010000000000000) >> 52)
+#define GDT_GET_P(x)	(__u8)(((x) & 0x0000800000000000) >> 47)
+#define GDT_GET_DPL(x)	(__u8)(((x) & 0x0000600000000000) >> 45)
+#define GDT_GET_S(x)	(__u8)(((x) & 0x0000100000000000) >> 44)
+#define GDT_GET_TYPE(x)	(__u8)(((x) & 0x00000F0000000000) >> 40)
+
+#define GDT_TO_KVM_SEGMENT(seg, gdt_table, sel) \
+	do { \
+		__u64 gdt_ent = (gdt_table)[(sel)]; \
+		(seg).base = GDT_GET_BASE(gdt_ent); \
+		(seg).limit = GDT_GET_LIMIT(gdt_ent); \
+		(seg).selector = (sel) * 8; \
+		(seg).type = GDT_GET_TYPE(gdt_ent); \
+		(seg).present = GDT_GET_P(gdt_ent); \
+		(seg).dpl = GDT_GET_DPL(gdt_ent); \
+		(seg).db = GDT_GET_DB(gdt_ent); \
+		(seg).s = GDT_GET_S(gdt_ent); \
+		(seg).l = GDT_GET_L(gdt_ent); \
+		(seg).g = GDT_GET_G(gdt_ent); \
+		(seg).avl = GDT_GET_AVL(gdt_ent); \
+	} while (0)
+
+#endif
diff --git a/tools/uhyve.c b/tools/uhyve.c
new file mode 100644
index 000000000..7038139e0
--- /dev/null
+++ b/tools/uhyve.c
@@ -0,0 +1,684 @@
+/* Copyright (c) 2015, IBM
+ * Author(s): Dan Williams
+ *            Ricardo Koller
+ * Copyright (c) 2017, RWTH Aachen University
+ * Author(s): Stefan Lankes
+ *
+ * Permission to use, copy, modify, and/or distribute this software
+ * for any purpose with or without fee is hereby granted, provided
+ * that the above copyright notice and this permission notice appear
+ * in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
+ * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* We used several existing projects as guides
+ * kvmtest.c: http://lwn.net/Articles/658512/
+ * lkvm: http://github.com/clearlinux/kvmtool
+ */
+
+/*
+ * 15.1.2017: extend original version (https://github.com/Solo5/solo5)
+ * for HermitCore
+ */
+
+#define _GNU_SOURCE
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "uhyve-cpu.h"
+#include "proxy.h"
+
+#define GUEST_OFFSET		0x0
+#define CPUID_FUNC_PERFMON	0x0A
+#define GUEST_PAGE_SIZE		0x200000 /* 2 MB pages in guest */
+
+#define BOOT_GDT	0x1000
+#define BOOT_INFO	0x2000
+#define BOOT_PML4	0x10000
+#define BOOT_PDPTE	0x11000
+#define BOOT_PDE	0x12000
+
+#define BOOT_GDT_NULL	0
+#define BOOT_GDT_CODE	1
+#define BOOT_GDT_DATA	2
+#define BOOT_GDT_MAX	3
+
+#define KVM_32BIT_MAX_MEM_SIZE	(1ULL << 32)
+#define KVM_32BIT_GAP_SIZE	(768 << 20)
+#define KVM_32BIT_GAP_START	(KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE)
+
+#define UHYVE_PORT_WRITE	0x499
+#define UHYVE_PORT_OPEN		0x500
+#define UHYVE_PORT_CLOSE	0x501
+#define UHYVE_PORT_READ		0x502
+#define UHYVE_PORT_EXIT		0x503
+#define UHYVE_PORT_LSEEK	0x504
+
+/* -1 marks a descriptor as unused, so that uhyve_exit() closes only what has been opened */
+static int kvm = -1, vmfd = -1, vcpufd = -1;
+static uint8_t* guest_mem = NULL;
+static uint8_t* klog = NULL;
+static size_t guest_size = 0x20000000ULL;
+static uint64_t elf_entry;
+//static pthread_t vcpu_thread;
+static volatile uint8_t done = 0;
+
+typedef struct {
+	int fd;
+	const char* buf;
+	size_t len;
+} __attribute__((packed)) uhyve_write_t;
+
+typedef struct {
+	const char* name;
+	int flags;
+	int mode;
+	int ret;
+} __attribute__((packed)) uhyve_open_t;
+
+typedef struct {
+	int fd;
+	int ret;
+} __attribute__((packed)) uhyve_close_t;
+
+typedef struct {
+	int fd;
+	char* buf;
+	size_t len;
+	ssize_t ret;
+} __attribute__((packed)) uhyve_read_t;
+
+typedef struct {
+	int fd;
+	off_t offset;
+	int whence;
+} __attribute__((packed)) uhyve_lseek_t;
+
+/* atexit handler: dumps the kernel log if HERMIT_VERBOSE is set and closes the KVM descriptors */
+static void uhyve_exit(void)
+{
+	char* str = getenv("HERMIT_VERBOSE");
+
+	if (done == 0) {
+		done = 1;
+		//pthread_kill(vcpu_thread, SIGINT);
+	}
+
+	if (klog && str && (strcmp(str, "0") != 0))
+	{
+		puts("\nDump kernel log:");
+		puts("================\n");
+		printf("%s\n", klog);
+	}
+
+	if (vcpufd != -1)
+		close(vcpufd);
+	if (vmfd != -1)
+		close(vmfd);
+	if (kvm != -1)
+		close(kvm);
+}
+
+/* Returns the frequency of the host CPU in MHz or 0, if it cannot be determined */
+static uint32_t get_cpufreq(void)
+{
+#if 1
+	char line[2048];
+	uint32_t freq = 0;
+
+	FILE* fp = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r");
+	if (!fp) {
+		perror("Unable to open /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq");
+		return freq;
+	}
+
+	if (fgets(line, 2048, fp))
+		freq = atoi(line) / 1000;
+	fclose(fp);
+
+	return freq;
+#else
+	uint32_t freq = 0;
+	char line[2048];
+	char* match;
+	char* point;
+
+	FILE* fp = fopen("/proc/cpuinfo", "r");
+	if (!fp)
+		return freq;
+
+	while(fgets(line, 2048, fp)) {
+		if ((match = strstr(line, "cpu MHz")) == NULL)
+			continue;
+
+		// scan string for the next number
+		for(; (*match < 0x30) || (*match > 0x39); match++)
+			;
+
+		for(point = match; ((*point != '.') && (*point != '\0')); point++)
+			;
+		*point = '\0';
+
+		freq = atoi(match);
+		fclose(fp);
+
+		return freq;
+	}
+	fclose(fp);
+
+	return freq;
+#endif
+}
+
+/* pread() until count bytes are read; returns the bytes read (less than count only at EOF) or -1 */
+static ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t total = 0;
+	char *p = buf;
+
+	if (count > SSIZE_MAX) {
+		errno = E2BIG;
+		return -1;
+	}
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = pread(fd, p, count, offset);
+		if (nr == 0)
+			return total;
+		else if (nr == -1 && errno == EINTR)
+			continue;
+		else if (nr == -1)
+			return -1;
+
+		count -= nr;
+		total += nr;
+		p += nr;
+		offset += nr;
+	}
+
+	return total;
+}
+
+/*
+ * Copies all PT_LOAD segments of the HermitCore ELF file at path into the
+ * guest memory (mem), stores the entry point in elf_entry and writes the boot
+ * parameters to fixed offsets behind the start of each loaded segment.
+ * Returns 0 on success and -1, if the file is invalid or cannot be read.
+ */
+static int load_kernel(uint8_t* mem, char* path)
+{
+	Elf64_Ehdr hdr;
+	Elf64_Phdr *phdr = NULL;
+	size_t buflen;
+	int fd, ret = -1;
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+	{
+		perror("Unable to open file");
+		return -1;
+	}
+
+	// a short read signalizes a truncated file
+	if (pread_in_full(fd, &hdr, sizeof(hdr), 0) != (ssize_t) sizeof(hdr))
+		goto out;
+
+	// check if the program is a HermitCore file
+	if (hdr.e_ident[EI_MAG0] != ELFMAG0
+	    || hdr.e_ident[EI_MAG1] != ELFMAG1
+	    || hdr.e_ident[EI_MAG2] != ELFMAG2
+	    || hdr.e_ident[EI_MAG3] != ELFMAG3
+	    || hdr.e_ident[EI_CLASS] != ELFCLASS64
+	    || hdr.e_ident[EI_OSABI] != HERMIT_ELFOSABI
+	    || hdr.e_type != ET_EXEC || hdr.e_machine != EM_X86_64
+	    || hdr.e_phentsize != sizeof(Elf64_Phdr)) { // phdr[] is indexed as Elf64_Phdr array
+		fprintf(stderr, "Inavlide HermitCore file!\n");
+		goto out;
+	}
+
+	elf_entry = hdr.e_entry;
+
+	buflen = hdr.e_phentsize * hdr.e_phnum;
+	phdr = malloc(buflen);
+	if (!phdr) {
+		fprintf(stderr, "Not enough memory\n");
+		goto out;
+	}
+
+	if (pread_in_full(fd, phdr, buflen, hdr.e_phoff) != (ssize_t) buflen)
+		goto out;
+
+	/*
+	 * Load all segments with type "LOAD" from the file at offset
+	 * p_offset, and copy that into in memory.
+	 */
+	for (Elf64_Half ph_i = 0; ph_i < hdr.e_phnum; ph_i++)
+	{
+		uint64_t paddr = phdr[ph_i].p_paddr;
+		size_t offset = phdr[ph_i].p_offset;
+		size_t filesz = phdr[ph_i].p_filesz;
+		size_t memsz = phdr[ph_i].p_memsz;
+
+		if (phdr[ph_i].p_type != PT_LOAD)
+			continue;
+
+		// the segment has to fit into the guest memory
+		if (memsz < filesz || paddr - GUEST_OFFSET > guest_size || memsz > guest_size - (paddr - GUEST_OFFSET)) {
+			fprintf(stderr, "Segment exceeds the guest memory\n");
+			goto out;
+		}
+
+		//printf("Kernel location 0x%zx, file size 0x%zx\n", paddr, filesz);
+
+		if (pread_in_full(fd, mem+paddr-GUEST_OFFSET, filesz, offset) != (ssize_t) filesz)
+			goto out;
+		memset(mem+paddr+filesz-GUEST_OFFSET, 0x00, memsz - filesz);
+		if (!klog)
+			klog = mem+paddr+0x5000-GUEST_OFFSET;
+
+		// initialize kernel
+		*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x08)) = paddr; // physical start address
+		*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x10)) = guest_size; // physical limit
+		*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x18)) = get_cpufreq();
+		*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x24)) = 1; // number of used cpus
+		*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x30)) = 0; // apicid
+		*((uint64_t*) (mem+paddr-GUEST_OFFSET + 0x38)) = filesz;
+		*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x60)) = 1; // numa nodes
+		*((uint32_t*) (mem+paddr-GUEST_OFFSET + 0x94)) = 1; // announce uhyve
+	}
+	ret = 0;
+out:
+	free(phdr);
+
+	close(fd);
+
+	return ret;
+}
+
+static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid)
+{
+	unsigned int i;
+
+	/*
+	 * Filter CPUID functions that are not supported by the hypervisor.
+	 */
+	for (i = 0; i < kvm_cpuid->nent; i++) {
+		struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i];
+
+		switch (entry->function) {
+		case 1: // CPUID to define basic cpu features
+			entry->ecx |= (1U << 31); // propagate that we are running on a hypervisor
+			entry->ecx &= ~(1U << 21); // disable X2APIC support
+			entry->edx |= (1U << 5); // enable msr support
+			break;
+		case CPUID_FUNC_PERFMON:
+			entry->eax = 0x00; /* disable it */
+			break;
+		default:
+			/* Keep the CPUID function as -is */
+			break;
+		}
+	}
+}
+
+/* Sets CR0.PE (protected mode) and EFER.LME (long mode enable) */
+static void setup_system_64bit(struct kvm_sregs *sregs)
+{
+	sregs->cr0 |= X86_CR0_PE;
+	sregs->efer |= EFER_LME;
+}
+
+
+static void setup_system_page_tables(struct kvm_sregs *sregs, uint8_t *mem)
+{
+	uint64_t *pml4 = (uint64_t *) (mem + BOOT_PML4);
+	uint64_t *pdpte = (uint64_t *) (mem + BOOT_PDPTE);
+	uint64_t *pde = (uint64_t *) (mem + BOOT_PDE);
+	uint64_t paddr;
+
+	/*
+	 * For simplicity we currently use 2MB pages and only a single
+	 * PML4/PDPTE/PDE. Sanity check that the guest size is a multiple of the
+	 * page size and will fit in a single PDE (512 entries).
+	 */
+	assert((guest_size & (GUEST_PAGE_SIZE - 1)) == 0);
+	assert(guest_size <= (GUEST_PAGE_SIZE * 512));
+
+	memset(pml4, 0x00, 4096);
+	memset(pdpte, 0x00, 4096);
+	memset(pde, 0x00, 4096);
+
+	*pml4 = BOOT_PDPTE | (X86_PDPT_P | X86_PDPT_RW);
+	*pdpte = BOOT_PDE | (X86_PDPT_P | X86_PDPT_RW);
+	for (paddr = 0; paddr < guest_size; paddr += GUEST_PAGE_SIZE, pde++)
+		*pde = paddr | (X86_PDPT_P | X86_PDPT_RW | X86_PDPT_PS);
+
+	sregs->cr3 = BOOT_PML4;
+	sregs->cr4 |= X86_CR4_PAE;
+	sregs->cr0 |= X86_CR0_PG;
+}
+
+static void setup_system_gdt(struct kvm_sregs *sregs,
+                             uint8_t *mem,
+                             uint64_t off)
+{
+	uint64_t *gdt = (uint64_t *) (mem + off);
+	struct kvm_segment data_seg, code_seg;
+
+	/* flags, base, limit */
+	gdt[BOOT_GDT_NULL] = GDT_ENTRY(0, 0, 0);
+	gdt[BOOT_GDT_CODE] = GDT_ENTRY(0xA09B, 0, 0xFFFFF);
+	gdt[BOOT_GDT_DATA] = GDT_ENTRY(0xC093, 0, 0xFFFFF);
+
+	sregs->gdt.base = off;
+	sregs->gdt.limit = (sizeof(uint64_t) * BOOT_GDT_MAX) - 1;
+
+	GDT_TO_KVM_SEGMENT(code_seg, gdt, BOOT_GDT_CODE);
+	GDT_TO_KVM_SEGMENT(data_seg, gdt, BOOT_GDT_DATA);
+
+	sregs->cs = code_seg;
+	sregs->ds = data_seg;
+	sregs->es = data_seg;
+	sregs->fs = data_seg;
+	sregs->gs = data_seg;
+	sregs->ss = data_seg;
+}
+
+static void setup_system(int vcpufd, uint8_t *mem)
+{
+	struct kvm_sregs sregs;
+	int ret;
+
+	/* Set all cpu/mem system structures */
+	ret = ioctl(vcpufd, KVM_GET_SREGS, &sregs);
+	if (ret == -1)
+		err(1, "KVM: ioctl (GET_SREGS) failed");
+
+	setup_system_gdt(&sregs, mem, BOOT_GDT);
+	setup_system_page_tables(&sregs, mem);
+	setup_system_64bit(&sregs);
+
+	ret = ioctl(vcpufd, KVM_SET_SREGS, &sregs);
+	if (ret == -1)
+		err(1, "KVM: ioctl (SET_SREGS) failed");
+}
+
+
+/* Queries the CPUID entries supported by KVM, filters them and assigns them to the vCPU */
+static void setup_cpuid(int kvm, int vcpufd)
+{
+	struct kvm_cpuid2 *kvm_cpuid;
+	int max_entries = 100;
+
+	kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) + max_entries * sizeof(*kvm_cpuid->entries));
+	if (!kvm_cpuid)
+		err(1, "Not enough memory");
+	kvm_cpuid->nent = max_entries;
+
+	if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0)
+		err(1, "KVM: ioctl (GET_SUPPORTED_CPUID) failed");
+
+	filter_cpuid(kvm_cpuid);
+
+	if (ioctl(vcpufd, KVM_SET_CPUID2, kvm_cpuid) < 0)
+		err(1, "KVM: ioctl (SET_CPUID2) failed");
+
+	free(kvm_cpuid);
+}
+
+/*
+ * Runs the vCPU until the guest halts or exits and serves its I/O requests:
+ * the value written to a uhyve port is used as offset of the request data in
+ * the guest memory and the request is forwarded to the matching system call.
+ * NOTE(review): offsets and lengths supplied by the guest are not validated.
+ */
+static void* vcpu_loop(struct kvm_run *run)
+{
+	int ret;
+
+	while (!done) {
+		ret = ioctl(vcpufd, KVM_RUN, NULL);
+		if (ret == -1 && errno == EINTR)
+			continue;
+		if (ret == -1) {
+			if (errno == EFAULT) {
+				struct kvm_regs regs;
+
+				ret = ioctl(vcpufd, KVM_GET_REGS, &regs);
+				if (ret == -1)
+					err(1, "KVM: ioctl (GET_REGS) failed after guest fault");
+				err(1, "KVM: host/guest translation fault: rip=0x%llx", regs.rip);
+			} else err(1, "KVM: ioctl in vcpu_loop failed");
+		}
+
+		/* TODO: handle requests */
+
+		switch (run->exit_reason) {
+		case KVM_EXIT_HLT:
+			fprintf(stderr, "KVM: unhandled KVM_EXIT_HLT\n");
+			/* Guest has halted the CPU, this is considered as a normal exit. */
+			return NULL;
+
+		case KVM_EXIT_MMIO:
+			err(1, "KVM: unhandled KVM_EXIT_MMIO at 0x%llx", run->mmio.phys_addr);
+			break;
+
+		case KVM_EXIT_IO:
+			//printf("port 0x%x\n", run->io.port);
+			switch (run->io.port) {
+			case UHYVE_PORT_WRITE: {
+					unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
+					uhyve_write_t* uhyve_write = (uhyve_write_t*) (guest_mem+data);
+
+					uhyve_write->len = write(uhyve_write->fd, guest_mem+(size_t)uhyve_write->buf, uhyve_write->len);
+					break;
+				}
+
+			case UHYVE_PORT_READ: {
+					unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
+					uhyve_read_t* uhyve_read = (uhyve_read_t*) (guest_mem+data);
+
+					uhyve_read->ret = read(uhyve_read->fd, guest_mem+(size_t)uhyve_read->buf, uhyve_read->len);
+					break;
+				}
+
+			case UHYVE_PORT_EXIT: {
+					unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
+
+					done = 1;
+					exit(*(int*)(guest_mem+data));
+					break;
+				}
+
+			case UHYVE_PORT_OPEN: {
+					unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
+					uhyve_open_t* uhyve_open = (uhyve_open_t*) (guest_mem+data);
+
+					uhyve_open->ret = open((const char*)guest_mem+(size_t)uhyve_open->name, uhyve_open->flags, uhyve_open->mode);
+					break;
+				}
+
+			case UHYVE_PORT_CLOSE: {
+					unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
+					uhyve_close_t* uhyve_close = (uhyve_close_t*) (guest_mem+data);
+
+					uhyve_close->ret = close(uhyve_close->fd);
+					break;
+				}
+
+			case UHYVE_PORT_LSEEK: {
+					unsigned data = *((unsigned*)((size_t)run+run->io.data_offset));
+					uhyve_lseek_t* uhyve_lseek = (uhyve_lseek_t*) (guest_mem+data);
+
+					uhyve_lseek->offset = lseek(uhyve_lseek->fd, uhyve_lseek->offset, uhyve_lseek->whence);
+					break;
+				}
+			default:
+				err(1, "KVM: unhandled KVM_EXIT_IO at port 0x%x, direction %d", run->io.port, run->io.direction);
+				break;
+			}
+			break;
+
+		case KVM_EXIT_FAIL_ENTRY:
+			err(1, "KVM: entry failure: hw_entry_failure_reason=0x%llx",
+				run->fail_entry.hardware_entry_failure_reason);
+			break;
+
+		case KVM_EXIT_INTERNAL_ERROR:
+			err(1, "KVM: internal error exit: suberror = 0x%x", run->internal.suberror);
+			break;
+
+		case KVM_EXIT_SHUTDOWN:
+			err(1, "KVM: receive shutdown command");
+			break;
+
+		default:
+			fprintf(stderr, "KVM: unhandled exit: exit_reason = 0x%x\n", run->exit_reason);
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	return NULL;
+}
+
+/* Creates a VM with one vCPU, loads the kernel image at path (passed as arg) and runs it */
+static void* uhyve_thread(void* arg)
+{
+	char* path = (char*) arg;
+	int ret;
+	size_t mmap_size;
+	struct kvm_run *run;
+
+	// register routine to close the VM
+	atexit(uhyve_exit);
+
+	char* str = getenv("HERMIT_MEM");
+	if (str)
+		printf("We want to use %s memory\n", str);
+
+	kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
+	if (kvm < 0)
+		err(1, "Could not open: /dev/kvm");
+
+	/* Make sure we have the stable version of the API */
+	ret = ioctl(kvm, KVM_GET_API_VERSION, NULL);
+	if (ret < 0)
+		err(1, "KVM: ioctl (GET_API_VERSION) failed");
+	if (ret != 12)
+		err(1, "KVM: API version is %d, uhyve requires version 12", ret);
+
+	vmfd = ioctl(kvm, KVM_CREATE_VM, 0);
+	if (vmfd == -1)
+		err(1, "KVM: unable to create VM");
+
+	// TODO: we have to create a gap for PCI
+	assert(guest_size < KVM_32BIT_GAP_SIZE);
+
+	/* Allocate page-aligned guest memory. */
+	guest_mem = mmap(NULL, guest_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (guest_mem == MAP_FAILED)
+		err(1, "mmap failed");
+
+	ret = load_kernel(guest_mem, path);
+	if (ret)
+		exit(EXIT_FAILURE);
+
+	/* Register the guest memory at the guest-physical address GUEST_OFFSET. */
+	struct kvm_userspace_memory_region kvm_region = {
+		.slot = 0,
+		.guest_phys_addr = GUEST_OFFSET,
+		.memory_size = guest_size,
+		.userspace_addr = (uint64_t) guest_mem,
+	};
+
+	ret = ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &kvm_region);
+	if (ret == -1)
+		err(1, "KVM: set user memory failed");
+
+	ret = ioctl(vmfd, KVM_CREATE_IRQCHIP);
+	if (ret < 0)
+		err(1, "KVM_CREATE_IRQCHIP ioctl");
+
+	vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
+	if (vcpufd == -1)
+		err(1, "KVM: create vcpu failed");
+
+	/* Setup registers and memory. */
+	setup_system(vcpufd, guest_mem);
+
+	/*
+	 * Initialize registers: instruction pointer for our code, addends,
+	 * and initial flags required by x86 architecture.
+	 * Arguments to the kernel main are passed using the x86_64 calling
+	 * convention: RDI, RSI, RDX, RCX, R8, and R9
+	 */
+	struct kvm_regs regs = {
+		.rip = elf_entry,
+		.rax = 2,
+		.rbx = 2,
+		.rdx = 0,
+		.rflags = 0x2,
+	};
+	ret = ioctl(vcpufd, KVM_SET_REGS, &regs);
+	if (ret == -1)
+		err(1, "KVM: ioctl (SET_REGS) failed");
+
+	/* Map the shared kvm_run structure and following data. */
+	ret = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL);
+	if (ret == -1)
+		err(1, "KVM: ioctl get VCPU_MMAP_SIZE failed");
+	mmap_size = ret;
+	if (mmap_size < sizeof(*run))
+		err(1, "KVM: invalid VCPU_MMAP_SIZE: %zd", mmap_size);
+
+	run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
+	if (run == MAP_FAILED)
+		err(1, "KVM: VCPU mmap failed");
+
+	setup_cpuid(kvm, vcpufd);
+
+	return vcpu_loop(run);
+}
+
+/* Called by the proxy: boots the HermitCore image at path and terminates the process afterwards */
+int init_uhyve(char *path)
+{
+	//pthread_create(&vcpu_thread, NULL, uhyve_thread, (void*)path);
+
+	uhyve_thread(path);
+	exit(EXIT_SUCCESS);
+	return 0;
+}