From 04180a5a81effea24e70e6fc1df210ab83cfa4f1 Mon Sep 17 00:00:00 2001 From: Michael Steil Date: Tue, 9 Jun 2015 21:25:36 -0700 Subject: [PATCH] Base bhyve checkin git https://github.com/freebsd/freebsd 132e393b177792999244c04f7b08753a98b0c4c8 --- bhyve/Makefile | 50 + bhyve/acpi.c | 1012 +++++++++++ bhyve/acpi.h | 54 + bhyve/ahci.h | 322 ++++ bhyve/atkbdc.c | 90 + bhyve/bhyve.8 | 325 ++++ bhyve/bhyverun.c | 892 ++++++++++ bhyve/bhyverun.h | 55 + bhyve/block_if.c | 822 +++++++++ bhyve/block_if.h | 70 + bhyve/consport.c | 153 ++ bhyve/dbgport.c | 142 ++ bhyve/dbgport.h | 34 + bhyve/inout.c | 297 ++++ bhyve/inout.h | 79 + bhyve/ioapic.c | 74 + bhyve/ioapic.h | 39 + bhyve/mem.c | 291 +++ bhyve/mem.h | 61 + bhyve/mevent.c | 456 +++++ bhyve/mevent.h | 51 + bhyve/mevent_test.c | 256 +++ bhyve/mptbl.c | 377 ++++ bhyve/mptbl.h | 35 + bhyve/pci_ahci.c | 2346 +++++++++++++++++++++++++ bhyve/pci_emul.c | 2108 ++++++++++++++++++++++ bhyve/pci_emul.h | 283 +++ bhyve/pci_hostbridge.c | 70 + bhyve/pci_irq.c | 346 ++++ bhyve/pci_irq.h | 45 + bhyve/pci_lpc.c | 429 +++++ bhyve/pci_lpc.h | 72 + bhyve/pci_passthru.c | 790 +++++++++ bhyve/pci_uart.c | 119 ++ bhyve/pci_virtio_block.c | 410 +++++ bhyve/pci_virtio_net.c | 730 ++++++++ bhyve/pci_virtio_rnd.c | 189 ++ bhyve/pm.c | 312 ++++ bhyve/post.c | 53 + bhyve/rtc.c | 129 ++ bhyve/rtc.h | 34 + bhyve/smbiostbl.c | 827 +++++++++ bhyve/smbiostbl.h | 36 + bhyve/spinup_ap.c | 104 ++ bhyve/spinup_ap.h | 34 + bhyve/task_switch.c | 939 ++++++++++ bhyve/uart_emul.c | 657 +++++++ bhyve/uart_emul.h | 45 + bhyve/virtio.c | 777 ++++++++ bhyve/virtio.h | 464 +++++ bhyve/xmsr.c | 230 +++ bhyve/xmsr.h | 36 + bhyvectl/Makefile | 16 + bhyvectl/bhyvectl.c | 2142 ++++++++++++++++++++++ bhyveload/Makefile | 13 + bhyveload/bhyveload.8 | 157 ++ bhyveload/bhyveload.c | 746 ++++++++ libvmmapi/Makefile | 13 + libvmmapi/vmmapi.c | 1201 +++++++++++++ libvmmapi/vmmapi.h | 173 ++ libvmmapi/vmmapi_freebsd.c | 345 ++++ vmm.h | 648 +++++++ vmm/amd/amdv.c | 133 ++ 
vmm/amd/npt.c | 87 + vmm/amd/npt.h | 36 + vmm/amd/svm.c | 2259 ++++++++++++++++++++++++ vmm/amd/svm.h | 54 + vmm/amd/svm_genassym.c | 48 + vmm/amd/svm_msr.c | 165 ++ vmm/amd/svm_msr.h | 44 + vmm/amd/svm_softc.h | 114 ++ vmm/amd/svm_support.S | 121 ++ vmm/amd/vmcb.c | 442 +++++ vmm/amd/vmcb.h | 334 ++++ vmm/intel/ept.c | 205 +++ vmm/intel/ept.h | 39 + vmm/intel/vmcs.c | 503 ++++++ vmm/intel/vmcs.h | 401 +++++ vmm/intel/vmx.c | 3416 ++++++++++++++++++++++++++++++++++++ vmm/intel/vmx.h | 140 ++ vmm/intel/vmx_controls.h | 96 + vmm/intel/vmx_cpufunc.h | 218 +++ vmm/intel/vmx_genassym.c | 88 + vmm/intel/vmx_msr.c | 483 +++++ vmm/intel/vmx_msr.h | 70 + vmm/intel/vmx_support.S | 262 +++ vmm/intel/vtd.c | 688 ++++++++ vmm/io/iommu.c | 285 +++ vmm/io/iommu.h | 75 + vmm/io/ppt.c | 651 +++++++ vmm/io/ppt.h | 54 + vmm/io/vatpic.c | 808 +++++++++ vmm/io/vatpic.h | 57 + vmm/io/vatpit.c | 457 +++++ vmm/io/vatpit.h | 45 + vmm/io/vhpet.c | 759 ++++++++ vmm/io/vhpet.h | 44 + vmm/io/vioapic.c | 499 ++++++ vmm/io/vioapic.h | 50 + vmm/io/vlapic.c | 1654 +++++++++++++++++ vmm/io/vlapic.h | 109 ++ vmm/io/vlapic_priv.h | 190 ++ vmm/io/vpmtmr.c | 103 ++ vmm/io/vpmtmr.h | 42 + vmm/io/vrtc.c | 1019 +++++++++++ vmm/io/vrtc.h | 50 + vmm/vmm.c | 2427 +++++++++++++++++++++++++ vmm/vmm_dev.c | 689 ++++++++ vmm/vmm_host.c | 161 ++ vmm/vmm_host.h | 83 + vmm/vmm_instruction_emul.c | 2407 +++++++++++++++++++++++++ vmm/vmm_ioport.c | 176 ++ vmm/vmm_ioport.h | 37 + vmm/vmm_ktr.h | 69 + vmm/vmm_lapic.c | 247 +++ vmm/vmm_lapic.h | 75 + vmm/vmm_mem.c | 154 ++ vmm/vmm_mem.h | 43 + vmm/vmm_stat.c | 169 ++ vmm/vmm_stat.h | 160 ++ vmm/vmm_util.c | 111 ++ vmm/vmm_util.h | 40 + vmm/x86.c | 521 ++++++ vmm/x86.h | 78 + vmm_dev.h | 365 ++++ vmm_instruction_emul.h | 116 ++ 126 files changed, 49630 insertions(+) create mode 100644 bhyve/Makefile create mode 100644 bhyve/acpi.c create mode 100644 bhyve/acpi.h create mode 100644 bhyve/ahci.h create mode 100644 bhyve/atkbdc.c create mode 100644 bhyve/bhyve.8 create mode 
100644 bhyve/bhyverun.c create mode 100644 bhyve/bhyverun.h create mode 100644 bhyve/block_if.c create mode 100644 bhyve/block_if.h create mode 100644 bhyve/consport.c create mode 100644 bhyve/dbgport.c create mode 100644 bhyve/dbgport.h create mode 100644 bhyve/inout.c create mode 100644 bhyve/inout.h create mode 100644 bhyve/ioapic.c create mode 100644 bhyve/ioapic.h create mode 100644 bhyve/mem.c create mode 100644 bhyve/mem.h create mode 100644 bhyve/mevent.c create mode 100644 bhyve/mevent.h create mode 100644 bhyve/mevent_test.c create mode 100644 bhyve/mptbl.c create mode 100644 bhyve/mptbl.h create mode 100644 bhyve/pci_ahci.c create mode 100644 bhyve/pci_emul.c create mode 100644 bhyve/pci_emul.h create mode 100644 bhyve/pci_hostbridge.c create mode 100644 bhyve/pci_irq.c create mode 100644 bhyve/pci_irq.h create mode 100644 bhyve/pci_lpc.c create mode 100644 bhyve/pci_lpc.h create mode 100644 bhyve/pci_passthru.c create mode 100644 bhyve/pci_uart.c create mode 100644 bhyve/pci_virtio_block.c create mode 100644 bhyve/pci_virtio_net.c create mode 100644 bhyve/pci_virtio_rnd.c create mode 100644 bhyve/pm.c create mode 100644 bhyve/post.c create mode 100644 bhyve/rtc.c create mode 100644 bhyve/rtc.h create mode 100644 bhyve/smbiostbl.c create mode 100644 bhyve/smbiostbl.h create mode 100644 bhyve/spinup_ap.c create mode 100644 bhyve/spinup_ap.h create mode 100644 bhyve/task_switch.c create mode 100644 bhyve/uart_emul.c create mode 100644 bhyve/uart_emul.h create mode 100644 bhyve/virtio.c create mode 100644 bhyve/virtio.h create mode 100644 bhyve/xmsr.c create mode 100644 bhyve/xmsr.h create mode 100644 bhyvectl/Makefile create mode 100644 bhyvectl/bhyvectl.c create mode 100644 bhyveload/Makefile create mode 100644 bhyveload/bhyveload.8 create mode 100644 bhyveload/bhyveload.c create mode 100644 libvmmapi/Makefile create mode 100644 libvmmapi/vmmapi.c create mode 100644 libvmmapi/vmmapi.h create mode 100644 libvmmapi/vmmapi_freebsd.c create mode 100644 vmm.h 
create mode 100644 vmm/amd/amdv.c create mode 100644 vmm/amd/npt.c create mode 100644 vmm/amd/npt.h create mode 100644 vmm/amd/svm.c create mode 100644 vmm/amd/svm.h create mode 100644 vmm/amd/svm_genassym.c create mode 100644 vmm/amd/svm_msr.c create mode 100644 vmm/amd/svm_msr.h create mode 100644 vmm/amd/svm_softc.h create mode 100644 vmm/amd/svm_support.S create mode 100644 vmm/amd/vmcb.c create mode 100644 vmm/amd/vmcb.h create mode 100644 vmm/intel/ept.c create mode 100644 vmm/intel/ept.h create mode 100644 vmm/intel/vmcs.c create mode 100644 vmm/intel/vmcs.h create mode 100644 vmm/intel/vmx.c create mode 100644 vmm/intel/vmx.h create mode 100644 vmm/intel/vmx_controls.h create mode 100644 vmm/intel/vmx_cpufunc.h create mode 100644 vmm/intel/vmx_genassym.c create mode 100644 vmm/intel/vmx_msr.c create mode 100644 vmm/intel/vmx_msr.h create mode 100644 vmm/intel/vmx_support.S create mode 100644 vmm/intel/vtd.c create mode 100644 vmm/io/iommu.c create mode 100644 vmm/io/iommu.h create mode 100644 vmm/io/ppt.c create mode 100644 vmm/io/ppt.h create mode 100644 vmm/io/vatpic.c create mode 100644 vmm/io/vatpic.h create mode 100644 vmm/io/vatpit.c create mode 100644 vmm/io/vatpit.h create mode 100644 vmm/io/vhpet.c create mode 100644 vmm/io/vhpet.h create mode 100644 vmm/io/vioapic.c create mode 100644 vmm/io/vioapic.h create mode 100644 vmm/io/vlapic.c create mode 100644 vmm/io/vlapic.h create mode 100644 vmm/io/vlapic_priv.h create mode 100644 vmm/io/vpmtmr.c create mode 100644 vmm/io/vpmtmr.h create mode 100644 vmm/io/vrtc.c create mode 100644 vmm/io/vrtc.h create mode 100644 vmm/vmm.c create mode 100644 vmm/vmm_dev.c create mode 100644 vmm/vmm_host.c create mode 100644 vmm/vmm_host.h create mode 100644 vmm/vmm_instruction_emul.c create mode 100644 vmm/vmm_ioport.c create mode 100644 vmm/vmm_ioport.h create mode 100644 vmm/vmm_ktr.h create mode 100644 vmm/vmm_lapic.c create mode 100644 vmm/vmm_lapic.h create mode 100644 vmm/vmm_mem.c create mode 100644 
vmm/vmm_mem.h create mode 100644 vmm/vmm_stat.c create mode 100644 vmm/vmm_stat.h create mode 100644 vmm/vmm_util.c create mode 100644 vmm/vmm_util.h create mode 100644 vmm/x86.c create mode 100644 vmm/x86.h create mode 100644 vmm_dev.h create mode 100644 vmm_instruction_emul.h diff --git a/bhyve/Makefile b/bhyve/Makefile new file mode 100644 index 0000000..bb81bcb --- /dev/null +++ b/bhyve/Makefile @@ -0,0 +1,50 @@ +# +# $FreeBSD$ +# + +PROG= bhyve + +DEBUG_FLAGS= -g -O0 + +MAN= bhyve.8 + +SRCS= \ + atkbdc.c \ + acpi.c \ + bhyverun.c \ + block_if.c \ + consport.c \ + dbgport.c \ + inout.c \ + ioapic.c \ + mem.c \ + mevent.c \ + mptbl.c \ + pci_ahci.c \ + pci_emul.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_passthru.c \ + pci_virtio_block.c \ + pci_virtio_net.c \ + pci_virtio_rnd.c \ + pci_uart.c \ + pm.c \ + post.c \ + rtc.c \ + smbiostbl.c \ + task_switch.c \ + uart_emul.c \ + virtio.c \ + xmsr.c \ + spinup_ap.c + +.PATH: ${.CURDIR}/../../sys/amd64/vmm +SRCS+= vmm_instruction_emul.c + +LIBADD= vmmapi md pthread + +WARNS?= 2 + +.include diff --git a/bhyve/acpi.c b/bhyve/acpi.c new file mode 100644 index 0000000..a9dd1cc --- /dev/null +++ b/bhyve/acpi.c @@ -0,0 +1,1012 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * bhyve ACPI table generator. + * + * Create the minimal set of ACPI tables required to boot FreeBSD (and + * hopefully other o/s's) by writing out ASL template files for each of + * the tables and then compiling them to AML with the Intel iasl compiler. + * The AML files are then read into guest memory. + * + * The tables are placed in the guest's ROM area just below 1MB physical, + * above the MPTable. 
+ * + * Layout + * ------ + * RSDP -> 0xf2400 (36 bytes fixed) + * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used) + * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used) + * MADT -> 0xf2500 (depends on #CPUs) + * FADT -> 0xf2600 (268 bytes) + * HPET -> 0xf2740 (56 bytes) + * MCFG -> 0xf2780 (60 bytes) + * FACS -> 0xf27C0 (64 bytes) + * DSDT -> 0xf2800 (variable - can go up to 0x100000) + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "pci_emul.h" + +/* + * Define the base address of the ACPI tables, and the offsets to + * the individual tables. The offsets are relative to BHYVE_ACPI_BASE + * and match the layout described at the top of this file. + */ +#define BHYVE_ACPI_BASE 0xf2400 +#define RSDT_OFFSET 0x040 +#define XSDT_OFFSET 0x080 +#define MADT_OFFSET 0x100 +#define FADT_OFFSET 0x200 +#define HPET_OFFSET 0x340 +#define MCFG_OFFSET 0x380 +#define FACS_OFFSET 0x3C0 +#define DSDT_OFFSET 0x400 + +#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" +#define BHYVE_ASL_SUFFIX ".aml" +#define BHYVE_ASL_COMPILER "/usr/sbin/iasl" + +static int basl_keep_temps; +static int basl_verbose_iasl; +static int basl_ncpu; +static uint32_t basl_acpi_base = BHYVE_ACPI_BASE; +static uint32_t hpet_capabilities; + +/* + * Full pathnames of the temporary-file templates: basl_template is + * passed to mkstemp(3) and basl_stemplate to mkstemps(3). + */ +static char basl_template[MAXPATHLEN]; +static char basl_stemplate[MAXPATHLEN]; + +/* + * State for dsdt_line(), dsdt_indent(), and dsdt_unindent(). + */ +static FILE *dsdt_fp; +static int dsdt_indent_level; +static int dsdt_error; + +/* + * One temporary file: the descriptor returned by mkstemp(3)/mkstemps(3), + * the stdio stream opened on that descriptor, and the file's pathname. + */ +struct basl_fio { + int fd; + FILE *fp; + char f_name[MAXPATHLEN]; +}; + +/* + * Output helpers for the table writers. Each one assigns to a local + * 'err' and then jumps to a local 'err_exit' label on failure, so the + * calling function must provide both. NOTE: each macro expands to two + * statements; brace it when it is the body of an if/else. + */ +#define EFPRINTF(...) 
\ + err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit; + +#define EFFLUSH(x) \ + err = fflush(x); if (err != 0) goto err_exit; + +/* + * Write the RSDP template text to fp, pointing at the RSDT and XSDT. + * Returns 0 on success, or errno if a write or the final flush fails. + */ +static int +basl_fwrite_rsdp(FILE *fp) +{ + int err; + + err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDP template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 43\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 02\n"); + EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n", + basl_acpi_base + RSDT_OFFSET); + EFPRINTF(fp, "[0004]\t\tLength : 00000024\n"); + EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n", + basl_acpi_base + XSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Write the RSDT template text to fp, with one 32-bit table address per + * entry. Returns 0 on success, or errno if a write or flush fails. + */ +static int +basl_fwrite_rsdt(FILE *fp) +{ + int err; + + err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT, HPET and MCFG */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + 
basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Write the XSDT template text to fp, listing the same four tables as + * the RSDT but with the addresses zero-extended to 16 hex digits. + * Returns 0 on success, or errno if a write or flush fails. + */ +static int +basl_fwrite_xsdt(FILE *fp) +{ + int err; + + err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve XSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* + * Add in pointers to the MADT, FADT, HPET and MCFG. + * NOTE(review): these entries are labelled [0004] although the + * values are written as 8 bytes; confirm whether iasl honors the + * bracketed length before relying on it. + */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Write the MADT (signature "APIC") template text to fp. + * Returns 0 on success, or errno if a write or flush fails. + */ +static int +basl_fwrite_madt(FILE *fp) +{ + int err; + int i; + + err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + 
EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n"); + EFPRINTF(fp, "\n"); + + /* Add a Processor Local APIC entry for each CPU */ + for (i = 0; i < basl_ncpu; i++) { + EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n"); + EFPRINTF(fp, "[0001]\t\tLength : 08\n"); + /* iasl expects hex values for the proc and apic id's */ + EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i); + EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n"); + EFPRINTF(fp, "\n"); + } + + /* Always a single IOAPIC entry, with ID 0 */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0C\n"); + /* iasl expects a hex value for the i/o apic id */ + EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : 00\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT); + EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tPolarity : 3\n"); + 
EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n"); + EFPRINTF(fp, "\n"); + + /* Local APIC NMI is connected to LINT 1 on all CPUs */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n"); + EFPRINTF(fp, "[0001]\t\tLength : 06\n"); + EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_fadt(FILE *fp) +{ + int err; + + err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 05\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVFACP \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tModel : 01\n"); + EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n"); + EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n", + SCI_INT); + EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n", + SMI_CMD); + EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n", + BHYVE_ACPI_ENABLE); + EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n", + BHYVE_ACPI_DISABLE); + EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n"); + EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n", + 
PM1A_EVT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n", + IO_PMTMR); + EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n"); + EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n"); + EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Century Index : 32\n"); + EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n"); + EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n"); + EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n"); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tWBINVD instruction is 
operational (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n"); + EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tDocking Supported (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n"); + EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n"); + EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n"); + EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tReset Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n"); + EFPRINTF(fp, "[0002]\t\tARM Flags (decoded below): 0000\n"); + EFPRINTF(fp, "\t\t\tPSCI Compliant : 0\n"); + EFPRINTF(fp, "\t\t\tMust use HVC for PSCI : 0\n"); + EFPRINTF(fp, "[0001]\t\tFADT Minor Revision : 01\n"); + EFPRINTF(fp, "[0008]\t\tFACS Address : 
00000000%08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, + "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 10\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 
00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + /* Valid for bhyve */ + EFPRINTF(fp, + "[0012]\t\tPM Timer Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + IO_PMTMR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Control Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Status Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, 
"[0008]\t\tAddress : 0000000000000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_hpet(FILE *fp) +{ + int err; + + err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve HPET template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities); + EFPRINTF(fp, + "[0012]\t\tTimer Block Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n"); + EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n"); + EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_mcfg(FILE *fp) +{ + int err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, 
"[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + +static int +basl_fwrite_facs(FILE *fp) +{ + int err; + + err = 0; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FACS template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n"); + EFPRINTF(fp, "[0004]\t\tLength : 00000040\n"); + EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n"); + EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n"); + EFPRINTF(fp, + "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n"); + EFPRINTF(fp, "[0001]\t\tVersion : 02\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Helper routines for writing to the DSDT from other modules. + */ +void +dsdt_line(const char *fmt, ...) 
+{ + va_list ap; + int err; + + if (dsdt_error != 0) + return; + + if (strcmp(fmt, "") != 0) { + if (dsdt_indent_level != 0) + EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); + va_start(ap, fmt); + if (vfprintf(dsdt_fp, fmt, ap) < 0) + goto err_exit; + va_end(ap); + } + EFPRINTF(dsdt_fp, "\n"); + return; + +err_exit: + dsdt_error = errno; +} + +void +dsdt_indent(int levels) +{ + + dsdt_indent_level += levels; + assert(dsdt_indent_level >= 0); +} + +void +dsdt_unindent(int levels) +{ + + assert(dsdt_indent_level >= levels); + dsdt_indent_level -= levels; +} + +void +dsdt_fixed_ioport(uint16_t iobase, uint16_t length) +{ + + dsdt_line("IO (Decode16,"); + dsdt_line(" 0x%04X, // Range Minimum", iobase); + dsdt_line(" 0x%04X, // Range Maximum", iobase); + dsdt_line(" 0x01, // Alignment"); + dsdt_line(" 0x%02X, // Length", length); + dsdt_line(" )"); +} + +void +dsdt_fixed_irq(uint8_t irq) +{ + + dsdt_line("IRQNoFlags ()"); + dsdt_line(" {%d}", irq); +} + +void +dsdt_fixed_mem32(uint32_t base, uint32_t length) +{ + + dsdt_line("Memory32Fixed (ReadWrite,"); + dsdt_line(" 0x%08X, // Address Base", base); + dsdt_line(" 0x%08X, // Address Length", length); + dsdt_line(" )"); +} + +static int +basl_fwrite_dsdt(FILE *fp) +{ + int err; + + err = 0; + dsdt_fp = fp; + dsdt_error = 0; + dsdt_indent_level = 0; + + dsdt_line("/*"); + dsdt_line(" * bhyve DSDT template"); + dsdt_line(" */"); + dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," + "\"BHYVE \", \"BVDSDT \", 0x00000001)"); + dsdt_line("{"); + dsdt_line(" Name (_S5, Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x05,"); + dsdt_line(" Zero,"); + dsdt_line(" })"); + + pci_write_dsdt(); + + dsdt_line(""); + dsdt_line(" Scope (_SB.PC00)"); + dsdt_line(" {"); + dsdt_line(" Device (HPET)"); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); + dsdt_line(" Name (_UID, 0)"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(4); + dsdt_fixed_mem32(0xFED00000, 
0x400); + dsdt_unindent(4); + dsdt_line(" })"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line("}"); + + if (dsdt_error != 0) + return (dsdt_error); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_open(struct basl_fio *bf, int suffix) +{ + int err; + + err = 0; + + if (suffix) { + strncpy(bf->f_name, basl_stemplate, MAXPATHLEN); + bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); + } else { + strncpy(bf->f_name, basl_template, MAXPATHLEN); + bf->fd = mkstemp(bf->f_name); + } + + if (bf->fd >= 0) { + bf->fp = fdopen(bf->fd, "w+"); + if (bf->fp == NULL) { + unlink(bf->f_name); + close(bf->fd); + err = 1; + } + } else + err = 1; + + return (err); +} + +static void +basl_close(struct basl_fio *bf) +{ + + if (!basl_keep_temps) + unlink(bf->f_name); + fclose(bf->fp); +} + +static int +basl_start(struct basl_fio *in, struct basl_fio *out) +{ + int err; + + err = basl_open(in, 0); + if (!err) { + err = basl_open(out, 1); + if (err) { + basl_close(in); + } + } + + return (err); +} + +static void +basl_end(struct basl_fio *in, struct basl_fio *out) +{ + + basl_close(in); + basl_close(out); +} + +static int +basl_load(struct vmctx *ctx, int fd, uint64_t off) +{ + struct stat sb; + void *gaddr; + + if (fstat(fd, &sb) < 0) + return (errno); + + gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size); + if (gaddr == NULL) + return (EFAULT); + + if (read(fd, gaddr, sb.st_size) < 0) + return (errno); + + return (0); +} + +static int +basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset) +{ + struct basl_fio io[2]; + static char iaslbuf[3*MAXPATHLEN + 10]; + char *fmt; + int err; + + err = basl_start(&io[0], &io[1]); + if (!err) { + err = (*fwrite_section)(io[0].fp); + + if (!err) { + /* + * iasl sends the results of the compilation to + * stdout.
Shut this down by using the shell to + * redirect stdout to /dev/null, unless the user + * has requested verbose output for debugging + * purposes + */ + fmt = basl_verbose_iasl ? + "%s -p %s %s" : + "/bin/sh -c \"%s -p %s %s\" 1> /dev/null"; + + snprintf(iaslbuf, sizeof(iaslbuf), + fmt, + BHYVE_ASL_COMPILER, + io[1].f_name, io[0].f_name); + err = system(iaslbuf); + + if (!err) { + /* + * Copy the aml output file into guest + * memory at the specified location + */ + err = basl_load(ctx, io[1].fd, offset); + } + } + basl_end(&io[0], &io[1]); + } + + return (err); +} + +static int +basl_make_templates(void) +{ + const char *tmpdir; + int err; + int len; + + err = 0; + + /* + * Prefer BHYVE_TMPDIR, then TMPDIR, then the system default + */ + if (((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0') && + ((tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0')) { + tmpdir = _PATH_TMP; + } + + len = strlen(tmpdir); + + if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) { + strcpy(basl_template, tmpdir); + while (len > 0 && basl_template[len - 1] == '/') + len--; + basl_template[len] = '/'; + strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE); + } else + err = E2BIG; + + if (!err) { + /* + * len has been initialized (and maybe adjusted) above + */ + if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 + + sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) { + strcpy(basl_stemplate, tmpdir); + basl_stemplate[len] = '/'; + strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE); + len = strlen(basl_stemplate); + strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX); + } else + err = E2BIG; + } + + return (err); +} + +static struct { + int (*wsect)(FILE *fp); + uint64_t offset; +} basl_ftables[] = +{ + { basl_fwrite_rsdp, 0}, + { basl_fwrite_rsdt, RSDT_OFFSET }, + { basl_fwrite_xsdt, XSDT_OFFSET }, + { basl_fwrite_madt, MADT_OFFSET }, + { basl_fwrite_fadt, FADT_OFFSET }, + { basl_fwrite_hpet, HPET_OFFSET }, + { basl_fwrite_mcfg, MCFG_OFFSET }, + { basl_fwrite_facs, FACS_OFFSET }, + { basl_fwrite_dsdt, DSDT_OFFSET }, + { NULL } +}; + +int
+acpi_build(struct vmctx *ctx, int ncpu) +{ + int err; + int i; + + basl_ncpu = ncpu; + + err = vm_get_hpet_capabilities(ctx, &hpet_capabilities); + if (err != 0) + return (err); + + /* + * For debug, allow the user to have iasl compiler output sent + * to stdout rather than /dev/null + */ + if (getenv("BHYVE_ACPI_VERBOSE_IASL")) + basl_verbose_iasl = 1; + + /* + * Allow the user to keep the generated ASL files for debugging + * instead of deleting them following use + */ + if (getenv("BHYVE_ACPI_KEEPTMPS")) + basl_keep_temps = 1; + + i = 0; + err = basl_make_templates(); + + /* + * Run through all the ASL files, compiling them and + * copying them into guest memory + */ + while (!err && basl_ftables[i].wsect != NULL) { + err = basl_compile(ctx, basl_ftables[i].wsect, + basl_ftables[i].offset); + i++; + } + + return (err); +} diff --git a/bhyve/acpi.h b/bhyve/acpi.h new file mode 100644 index 0000000..652164a --- /dev/null +++ b/bhyve/acpi.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _ACPI_H_ +#define _ACPI_H_ + +#define SCI_INT 9 + +#define SMI_CMD 0xb2 +#define BHYVE_ACPI_ENABLE 0xa0 +#define BHYVE_ACPI_DISABLE 0xa1 + +#define PM1A_EVT_ADDR 0x400 +#define PM1A_CNT_ADDR 0x404 + +#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */ + +struct vmctx; + +int acpi_build(struct vmctx *ctx, int ncpu); +void dsdt_line(const char *fmt, ...); +void dsdt_fixed_ioport(uint16_t iobase, uint16_t length); +void dsdt_fixed_irq(uint8_t irq); +void dsdt_fixed_mem32(uint32_t base, uint32_t length); +void dsdt_indent(int levels); +void dsdt_unindent(int levels); +void sci_init(struct vmctx *ctx); + +#endif /* _ACPI_H_ */ diff --git a/bhyve/ahci.h b/bhyve/ahci.h new file mode 100644 index 0000000..1fd9f20 --- /dev/null +++ b/bhyve/ahci.h @@ -0,0 +1,322 @@ +/*- + * Copyright (c) 1998 - 2008 Søren Schmidt + * Copyright (c) 2009-2012 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification, immediately at the beginning of the file. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _AHCI_H_ +#define _AHCI_H_ + +/* ATA register defines */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ 
+#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ + +/* SATA register defines */ +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 +#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000030 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 +#define ATA_SS_IPM_DEVSLEEP 0x00000800 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define 
ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000030 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 +#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 +#define AHCI_MAX_IRQS 16 + +/* SATA AHCI v1.0 register defines */ +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 
+#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + +#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 +#define AHCI_CAP2_SDS 0x00000008 +#define AHCI_CAP2_SADM 0x00000010 +#define AHCI_CAP2_DESO 0x00000020 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 
0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 +#define AHCI_P_CMD_DEVSLEEP 0x80000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_P_DEVSLP 0x44 +#define AHCI_P_DEVSLP_ADSE 0x00000001 +#define AHCI_P_DEVSLP_DSP 0x00000002 +#define AHCI_P_DEVSLP_DETO 0x000003fc +#define AHCI_P_DEVSLP_DETO_SHIFT 2 +#define AHCI_P_DEVSLP_MDAT 0x00007c00 +#define AHCI_P_DEVSLP_MDAT_SHIFT 10 +#define AHCI_P_DEVSLP_DITO 0x01ff8000 +#define AHCI_P_DEVSLP_DITO_SHIFT 15 +#define AHCI_P_DEVSLP_DM 0x0e000000 +#define AHCI_P_DEVSLP_DM_SHIFT 25 + +/* Just to be sure, if building as module. 
*/ +#if MAXPHYS < 512 * 1024 +#undef MAXPHYS +#define MAXPHYS (512 * 1024) +#endif +/* Pessimistic prognosis on number of required S/G entries */ +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +/* Command list. 32 commands. First, 1Kbyte aligned. */ +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 +/* Command tables. Up to 32 commands, Each, 128byte aligned. */ +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +/* Total main work area. */ +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) + +#endif /* _AHCI_H_ */ diff --git a/bhyve/atkbdc.c b/bhyve/atkbdc.c new file mode 100644 index 0000000..930b7af --- /dev/null +++ b/bhyve/atkbdc.c @@ -0,0 +1,90 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include + +#include + +#include +#include +#include + +#include "inout.h" +#include "pci_lpc.h" + +#define KBD_DATA_PORT 0x60 + +#define KBD_STS_CTL_PORT 0x64 +#define KBD_SYS_FLAG 0x4 + +#define KBDC_RESET 0xfe + +static int +atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + if (bytes != 1) + return (-1); + + *eax = 0; + + return (0); +} + +static int +atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + int error, retval; + + if (bytes != 1) + return (-1); + + retval = 0; + if (in) { + *eax = KBD_SYS_FLAG; /* system passed POST */ + } else { + switch (*eax) { + case KBDC_RESET: /* Pulse "reset" line. */ + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); + break; + } + } + + return (retval); +} + +INOUT_PORT(atkdbc, KBD_DATA_PORT, IOPORT_F_INOUT, atkbdc_data_handler); +SYSRES_IO(KBD_DATA_PORT, 1); +INOUT_PORT(atkbdc, KBD_STS_CTL_PORT, IOPORT_F_INOUT, + atkbdc_sts_ctl_handler); +SYSRES_IO(KBD_STS_CTL_PORT, 1); diff --git a/bhyve/bhyve.8 b/bhyve/bhyve.8 new file mode 100644 index 0000000..ee0f2ca --- /dev/null +++ b/bhyve/bhyve.8 @@ -0,0 +1,325 @@ +.\" Copyright (c) 2013 Peter Grehan +.\" All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd September 17, 2014 +.Dt BHYVE 8 +.Os +.Sh NAME +.Nm bhyve +.Nd "run a guest operating system inside a virtual machine" +.Sh SYNOPSIS +.Nm +.Op Fl abehuwxACHPWY +.Op Fl c Ar numcpus +.Op Fl g Ar gdbport +.Op Fl l Ar lpcdev Ns Op , Ns Ar conf +.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t +.Op Fl p Ar vcpu:hostcpu +.Op Fl s Ar slot,emulation Ns Op , Ns Ar conf +.Op Fl U Ar uuid +.Ar vmname +.Sh DESCRIPTION +.Nm +is a hypervisor that runs guest operating systems inside a +virtual machine. +.Pp +Parameters such as the number of virtual CPUs, amount of guest memory, and +I/O connectivity can be specified with command-line parameters. 
+.Pp +The guest operating system must be loaded with +.Xr bhyveload 8 +or a similar boot loader before running +.Nm . +.Pp +.Nm +runs until the guest operating system reboots or an unhandled hypervisor +exit is detected. +.Sh OPTIONS +.Bl -tag -width 10n +.It Fl a +The guest's local APIC is configured in xAPIC mode. +The xAPIC mode is the default setting so this option is redundant. It will be +deprecated in a future version. +.It Fl A +Generate ACPI tables. +Required for +.Fx Ns /amd64 +guests. +.It Fl b +Enable a low-level console device supported by +.Fx +kernels compiled with +.Cd "device bvmconsole" . +This option will be deprecated in a future version. +.It Fl c Ar numcpus +Number of guest virtual CPUs. +The default is 1 and the maximum is 16. +.It Fl C +Include guest memory in core file. +.It Fl e +Force +.Nm +to exit when a guest issues an access to an I/O port that is not emulated. +This is intended for debug purposes. +.It Fl g Ar gdbport +For +.Fx +kernels compiled with +.Cd "device bvmdebug" , +allow a remote kernel kgdb to be relayed to the guest kernel gdb stub +via a local IPv4 address and this port. +This option will be deprecated in a future version. +.It Fl h +Print help message and exit. +.It Fl H +Yield the virtual CPU thread when a HLT instruction is detected. +If this option is not specified, virtual CPUs will use 100% of a host CPU. +.It Fl l Ar lpcdev Ns Op , Ns Ar conf +Allow devices behind the LPC PCI-ISA bridge to be configured. +The only supported devices are the TTY-class devices, +.Li com1 +and +.Li com2 . +.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t +Guest physical memory size in bytes. +This must be the same size that was given to +.Xr bhyveload 8 . +.Pp +The size argument may be suffixed with one of K, M, G or T (either upper +or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes, +or terabytes. +If no suffix is given, the value is assumed to be in megabytes.
+.It Fl p Ar vcpu:hostcpu +Pin guest's virtual CPU +.Em vcpu +to +.Em hostcpu . +.It Fl P +Force the guest virtual CPU to exit when a PAUSE instruction is detected. +.It Fl s Ar slot,emulation Ns Op , Ns Ar conf +Configure a virtual PCI slot and function. +.Pp +.Nm bhyve +provides PCI bus emulation and virtual devices that can be attached to +slots on the bus. +There are 32 available slots, with the option of providing up to 8 functions +per slot. +.Bl -tag -width 10n +.It Ar slot +.Ar pcislot[:function] +.Ar bus:pcislot:function +.Pp +The +.Ar pcislot +value is 0 to 31. The optional function value is 0 to 7. The optional +.Ar bus +value is 0 to 255. +If not specified, the function value defaults to 0. +If not specified, the bus value defaults to 0. +.It Ar emulation +.Bl -tag -width 10n +.It Li hostbridge | Li amd_hostbridge +.Pp +Provide a simple host bridge. +This is usually configured at slot 0, and is required by most guest +operating systems. +The +.Li amd_hostbridge +emulation is identical but uses a PCI vendor ID of +.Li AMD . +.It Li passthru +PCI pass-through device. +.It Li virtio-net +Virtio network interface. +.It Li virtio-blk +Virtio block storage interface. +.It Li virtio-rnd +Virtio RNG interface. +.It Li ahci-cd +AHCI controller attached to an ATAPI CD/DVD. +.It Li ahci-hd +AHCI controller attached to a SATA hard-drive. +.It Li uart +PCI 16550 serial device. +.It Li lpc +LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports. The LPC bridge +emulation can only be configured on bus 0. +.El +.It Op Ar conf +This optional parameter describes the backend for device emulations. +If +.Ar conf +is not specified, the device emulation has no backend and can be +considered unconnected. 
+.Pp +Network devices: +.Bl -tag -width 10n +.It Ar tapN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx +.It Ar vmnetN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx +.Pp +If +.Ar mac +is not specified, the MAC address is derived from a fixed OUI and the +remaining bytes from an MD5 hash of the slot and function numbers and +the device name. +.Pp +The MAC address is an ASCII string in +.Xr ethers 5 +format. +.El +.Pp +Block storage devices: +.Bl -tag -width 10n +.It Pa /filename Ns Oo , Ns Ar block-device-options Oc +.It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc +.El +.Pp +The +.Ar block-device-options +are: +.Bl -tag -width 8n +.It Li nocache +Open the file with +.Dv O_DIRECT . +.It Li direct +Open the file using +.Dv O_SYNC . +.It Li ro +Force the file to be opened read-only. +.It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc +Specify the logical and physical sector sizes of the emulated disk. +The physical sector size is optional and is equal to the logical sector size +if not explicitly specified. +.El +.Pp +TTY devices: +.Bl -tag -width 10n +.It Li stdio +Connect the serial port to the standard input and output of +the bhyve process. +.It Pa /dev/xxx +Use the host TTY device for serial port I/O. +.El +.Pp +Pass-through devices: +.Bl -tag -width 10n +.It Ns Ar slot Ns / Ns Ar bus Ns / Ns Ar function +Connect to a PCI device on the host at the selector described by +.Ar slot , +.Ar bus , +and +.Ar function +numbers. +.El +.Pp +The host device must have been reserved at boot-time using the +.Va pptdev +loader variable as described in +.Xr vmm 4 . +.El +.It Fl u +RTC keeps UTC time. +.It Fl U Ar uuid +Set the universally unique identifier +.Pq UUID +in the guest's System Management BIOS System Information structure. +By default a UUID is generated from the host's hostname and +.Ar vmname . +.It Fl w +Ignore accesses to unimplemented Model Specific Registers (MSRs). This is intended for debug purposes. 
+.It Fl W +Force virtio PCI device emulations to use MSI interrupts instead of MSI-X +interrupts. +.It Fl x +The guest's local APIC is configured in x2APIC mode. +.It Fl Y +Disable MPtable generation. +.It Ar vmname +Alphanumeric name of the guest. +This should be the same as that created by +.Xr bhyveload 8 . +.El +.Sh EXAMPLES +The guest operating system must have been loaded with +.Xr bhyveload 8 +or a similar boot loader before +.Xr bhyve 8 +can be run. +.Pp +To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio +block device backed by the +.Pa /my/image +filesystem image, and a serial port for the console: +.Bd -literal -offset indent +bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\ + -l com1,stdio -A -H -P -m 1G vm1 +.Ed +.Pp +Run a 24GB single-CPU virtual machine with three network ports, one of which +has a MAC address specified: +.Bd -literal -offset indent +bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\ + -s 2:1,virtio-net,tap1 \\ + -s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\ + -s 3,virtio-blk,/my/image -l com1,stdio \\ + -A -H -P -m 24G bigvm +.Ed +.Pp +Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI +CD-ROM, a single virtio network port, an AMD hostbridge, and the console +port connected to an +.Xr nmdm 4 +null-modem device. +.Bd -literal -offset indent +bhyve -c 4 \\ + -s 0,amd_hostbridge -s 1,lpc \\ + -s 1:0,ahci-hd,/images/disk.1 \\ + -s 1:1,ahci-hd,/images/disk.2 \\ + -s 1:2,ahci-hd,/images/disk.3 \\ + -s 1:3,ahci-hd,/images/disk.4 \\ + -s 1:4,ahci-hd,/images/disk.5 \\ + -s 1:5,ahci-hd,/images/disk.6 \\ + -s 1:6,ahci-hd,/images/disk.7 \\ + -s 1:7,ahci-hd,/images/disk.8 \\ + -s 2,ahci-cd,/images/install.iso \\ + -s 3,virtio-net,tap0 \\ + -l com1,/dev/nmdm0A \\ + -A -H -P -m 8G +.Ed +.Sh SEE ALSO +.Xr bhyve 4 , +.Xr nmdm 4 , +.Xr vmm 4 , +.Xr ethers 5 , +.Xr bhyvectl 8 , +.Xr bhyveload 8 +.Sh HISTORY +.Nm +first appeared in +.Fx 10.0 .
+.Sh AUTHORS +.An Neel Natu Aq Mt neel@freebsd.org +.An Peter Grehan Aq Mt grehan@freebsd.org diff --git a/bhyve/bhyverun.c b/bhyve/bhyverun.c new file mode 100644 index 0000000..47a7699 --- /dev/null +++ b/bhyve/bhyverun.c @@ -0,0 +1,892 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "inout.h" +#include "dbgport.h" +#include "ioapic.h" +#include "mem.h" +#include "mevent.h" +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "smbiostbl.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rtc.h" + +#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); + +char *vmname; + +int guest_ncpus; +char *guest_uuid_str; + +static int guest_vmexit_on_hlt, guest_vmexit_on_pause; +static int virtio_msix = 1; +static int x2apic_mode = 0; /* default is xAPIC */ + +static int strictio; +static int strictmsr = 1; + +static int acpi; + +static char *progname; +static const int BSP = 0; + +static cpuset_t cpumask; + +static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); + +static struct vm_exit vmexit[VM_MAXCPU]; + +struct bhyvestats { + uint64_t vmexit_bogus; + uint64_t vmexit_bogus_switch; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; +} stats; + +struct mt_vmm_info { + pthread_t mt_thr; + struct vmctx *mt_ctx; + int mt_vcpu; +} mt_vmm_info[VM_MAXCPU]; + +static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; + +static void +usage(int code) +{ + + fprintf(stderr, + "Usage: %s [-abehuwxACHPWY] [-c vcpus] [-g ] [-l ]\n" + " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" + " -a: local apic is in xAPIC mode (deprecated)\n" + " -A: create ACPI tables\n" + " -c: # cpus (default 1)\n" + 
" -C: include guest memory in core file\n" + " -e: exit on unhandled I/O access\n" + " -g: gdb port\n" + " -h: help\n" + " -H: vmexit from the guest on hlt\n" + " -l: LPC device configuration\n" + " -m: memory size in MB\n" + " -p: pin 'vcpu' to 'hostcpu'\n" + " -P: vmexit from the guest on pause\n" + " -s: PCI slot config\n" + " -u: RTC keeps UTC time\n" + " -U: uuid\n" + " -w: ignore unimplemented MSRs\n" + " -W: force virtio to use single-vector MSI\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable generation\n", + progname, (int)strlen(progname), ""); + + exit(code); +} + +static int +pincpu_parse(const char *opt) +{ + int vcpu, pcpu; + + if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { + fprintf(stderr, "invalid format: %s\n", opt); + return (-1); + } + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", + vcpu, VM_MAXCPU - 1); + return (-1); + } + + if (pcpu < 0 || pcpu >= CPU_SETSIZE) { + fprintf(stderr, "hostcpu '%d' outside valid range from " + "0 to %d\n", pcpu, CPU_SETSIZE - 1); + return (-1); + } + + if (vcpumap[vcpu] == NULL) { + if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { + perror("malloc"); + return (-1); + } + CPU_ZERO(vcpumap[vcpu]); + } + CPU_SET(pcpu, vcpumap[vcpu]); + return (0); +} + +void +vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, + int errcode) +{ + struct vmctx *ctx; + int error, restart_instruction; + + ctx = arg; + restart_instruction = 1; + + error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, + restart_instruction); + assert(error == 0); +} + +void * +paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) +{ + + return (vm_map_gpa(ctx, gaddr, len)); +} + +int +fbsdrun_vmexit_on_pause(void) +{ + + return (guest_vmexit_on_pause); +} + +int +fbsdrun_vmexit_on_hlt(void) +{ + + return (guest_vmexit_on_hlt); +} + +int +fbsdrun_virtio_msix(void) +{ + + return (virtio_msix); +} + +static void * 
+fbsdrun_start_thread(void *param) +{ + char tname[MAXCOMLEN + 1]; + struct mt_vmm_info *mtp; + int vcpu; + + mtp = param; + vcpu = mtp->mt_vcpu; + + snprintf(tname, sizeof(tname), "vcpu %d", vcpu); + pthread_set_name_np(mtp->mt_thr, tname); + + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); + + /* not reached */ + exit(1); + return (NULL); +} + +void +fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) +{ + int error; + + assert(fromcpu == BSP); + + /* + * The 'newcpu' must be activated in the context of 'fromcpu'. If + * vm_activate_cpu() is delayed until newcpu's pthread starts running + * then vmm.ko is out-of-sync with bhyve and this can create a race + * with vm_suspend(). + */ + error = vm_activate_cpu(ctx, newcpu); + assert(error == 0); + + CPU_SET_ATOMIC(newcpu, &cpumask); + + /* + * Set up the vmexit struct to allow execution to start + * at the given RIP + */ + vmexit[newcpu].rip = rip; + vmexit[newcpu].inst_length = 0; + + mt_vmm_info[newcpu].mt_ctx = ctx; + mt_vmm_info[newcpu].mt_vcpu = newcpu; + + error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, + fbsdrun_start_thread, &mt_vmm_info[newcpu]); + assert(error == 0); +} + +static int +fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) +{ + + if (!CPU_ISSET(vcpu, &cpumask)) { + fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); + exit(1); + } + + CPU_CLR_ATOMIC(vcpu, &cpumask); + return (CPU_EMPTY(&cpumask)); +} + +static int +vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, + uint32_t eax) +{ +#if BHYVE_DEBUG + /* + * put guest-driven debug here + */ +#endif + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + int bytes, port, in, out, string; + int vcpu; + + vcpu = *pvcpu; + + port = vme->u.inout.port; + bytes = vme->u.inout.bytes; + string = vme->u.inout.string; + in = vme->u.inout.in; + out = !in; + + /* Extra-special case of host notifications */ + if (out && port 
== GUEST_NIO_PORT) { + error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); + return (error); + } + + error = emulate_inout(ctx, vcpu, vme, strictio); + if (error) { + fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", + in ? "in" : "out", + bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), + port, vmexit->rip); + return (VMEXIT_ABORT); + } else { + return (VMEXIT_CONTINUE); + } +} + +static int +vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + uint64_t val; + uint32_t eax, edx; + int error; + + val = 0; + error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); + if (error != 0) { + fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", + vme->u.msr.code, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + + eax = val; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); + assert(error == 0); + + edx = val >> 32; + error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); + assert(error == 0); + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int error; + + error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); + if (error != 0) { + fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", + vme->u.msr.code, vme->u.msr.wval, *pvcpu); + if (strictmsr) { + vm_inject_gp(ctx, *pvcpu); + return (VMEXIT_CONTINUE); + } + } + return (VMEXIT_CONTINUE); +} + +static int +vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + int newcpu; + int retval = VMEXIT_CONTINUE; + + newcpu = spinup_ap(ctx, *pvcpu, + vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (retval); +} + +#define DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define EXIT_REASON_EPT_MISCONFIG 49 +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 +#define VMCS_IDENT(x) ((x) | 0x80000000) + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif + +static int 
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tVMX\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); + fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); + fprintf(stderr, "\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); + fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); +#ifdef DEBUG_EPT_MISCONFIG + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vm_get_register(ctx, *pvcpu, + VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), + &ept_misconfig_gpa); + vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, + &ept_misconfig_ptenum); + fprintf(stderr, "\tEPT misconfiguration:\n"); + fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); + fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", + ept_misconfig_ptenum, ept_misconfig_pte[0], + ept_misconfig_pte[1], ept_misconfig_pte[2], + ept_misconfig_pte[3]); + } +#endif /* DEBUG_EPT_MISCONFIG */ + return (VMEXIT_ABORT); +} + +static int +vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tSVM\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); + fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); + fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); + return (VMEXIT_ABORT); +} + +static int +vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_bogus++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_hlt(struct vmctx *ctx, struct 
vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_hlt++; + + /* + * Just continue execution with the next instruction. We use + * the HLT VM exit as a way to be friendly with the host + * scheduler. + */ + return (VMEXIT_CONTINUE); +} + +static int +vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + stats.vmexit_pause++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_mtrap++; + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int err, i; + struct vie *vie; + + stats.vmexit_inst_emul++; + + vie = &vmexit->u.inst_emul.vie; + err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, + vie, &vmexit->u.inst_emul.paging); + + if (err) { + if (err == ESRCH) { + fprintf(stderr, "Unhandled memory access to 0x%lx\n", + vmexit->u.inst_emul.gpa); + } + + fprintf(stderr, "Failed to emulate instruction ["); + for (i = 0; i < vie->num_valid; i++) { + fprintf(stderr, "0x%02x%s", vie->inst[i], + i != (vie->num_valid - 1) ? 
" " : ""); + } + fprintf(stderr, "] at 0x%lx\n", vmexit->rip); + return (VMEXIT_ABORT); + } + + return (VMEXIT_CONTINUE); +} + +static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; + +static int +vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + enum vm_suspend_how how; + + how = vmexit->u.suspended.how; + + fbsdrun_deletecpu(ctx, *pvcpu); + + if (*pvcpu != BSP) { + pthread_mutex_lock(&resetcpu_mtx); + pthread_cond_signal(&resetcpu_cond); + pthread_mutex_unlock(&resetcpu_mtx); + pthread_exit(NULL); + } + + pthread_mutex_lock(&resetcpu_mtx); + while (!CPU_EMPTY(&cpumask)) { + pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); + } + pthread_mutex_unlock(&resetcpu_mtx); + + switch (how) { + case VM_SUSPEND_RESET: + exit(0); + case VM_SUSPEND_POWEROFF: + exit(1); + case VM_SUSPEND_HALT: + exit(2); + case VM_SUSPEND_TRIPLEFAULT: + exit(3); + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + exit(100); + } + return (0); /* NOTREACHED */ +} + +static vmexit_handler_t handler[VM_EXITCODE_MAX] = { + [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_INOUT_STR] = vmexit_inout, + [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_SVM] = vmexit_svm, + [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_RDMSR] = vmexit_rdmsr, + [VM_EXITCODE_WRMSR] = vmexit_wrmsr, + [VM_EXITCODE_MTRAP] = vmexit_mtrap, + [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, + [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, +}; + +static void +vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) +{ + int error, rc, prevcpu; + enum vm_exitcode exitcode; + cpuset_t active_cpus; + + if (vcpumap[vcpu] != NULL) { + error = pthread_setaffinity_np(pthread_self(), + sizeof(cpuset_t), vcpumap[vcpu]); + assert(error == 0); + } + + error = vm_active_cpus(ctx, &active_cpus); + assert(CPU_ISSET(vcpu, 
&active_cpus)); + + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); + assert(error == 0); + + while (1) { + error = vm_run(ctx, vcpu, &vmexit[vcpu]); + if (error != 0) + break; + + prevcpu = vcpu; + + exitcode = vmexit[vcpu].exitcode; + if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { + fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", + exitcode); + exit(1); + } + + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + + switch (rc) { + case VMEXIT_CONTINUE: + break; + case VMEXIT_ABORT: + abort(); + default: + exit(1); + } + } + fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); +} + +static int +num_vcpus_allowed(struct vmctx *ctx) +{ + int tmp, error; + + error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); + + /* + * The guest is allowed to spinup more than one processor only if the + * UNRESTRICTED_GUEST capability is available. + */ + if (error == 0) + return (VM_MAXCPU); + else + return (1); +} + +void +fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) +{ + int err, tmp; + + if (fbsdrun_vmexit_on_hlt()) { + err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, "VM exit on HLT not supported\n"); + exit(1); + } + vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_HLT] = vmexit_hlt; + } + + if (fbsdrun_vmexit_on_pause()) { + /* + * pause exit support required for this mode + */ + err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); + if (err < 0) { + fprintf(stderr, + "SMP mux requested, no pause support\n"); + exit(1); + } + vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); + if (cpu == BSP) + handler[VM_EXITCODE_PAUSE] = vmexit_pause; + } + + if (x2apic_mode) + err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); + else + err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); + + if (err) { + fprintf(stderr, "Unable to set x2apic state (%d)\n", err); + exit(1); + } + + vm_set_capability(ctx, cpu, 
VM_CAP_ENABLE_INVPCID, 1); +} + +int +main(int argc, char *argv[]) +{ + int c, error, gdb_port, err, bvmcons; + int dump_guest_memory, max_vcpus, mptgen; + int rtc_localtime; + struct vmctx *ctx; + uint64_t rip; + size_t memsize; + + bvmcons = 0; + dump_guest_memory = 0; + progname = basename(argv[0]); + gdb_port = 0; + guest_ncpus = 1; + memsize = 256 * MB; + mptgen = 1; + rtc_localtime = 1; + + while ((c = getopt(argc, argv, "abehuwxACHIPWYp:g:c:s:m:l:U:")) != -1) { + switch (c) { + case 'a': + x2apic_mode = 0; + break; + case 'A': + acpi = 1; + break; + case 'b': + bvmcons = 1; + break; + case 'p': + if (pincpu_parse(optarg) != 0) { + errx(EX_USAGE, "invalid vcpu pinning " + "configuration '%s'", optarg); + } + break; + case 'c': + guest_ncpus = atoi(optarg); + break; + case 'C': + dump_guest_memory = 1; + break; + case 'g': + gdb_port = atoi(optarg); + break; + case 'l': + if (lpc_device_parse(optarg) != 0) { + errx(EX_USAGE, "invalid lpc device " + "configuration '%s'", optarg); + } + break; + case 's': + if (pci_parse_slot(optarg) != 0) + exit(1); + else + break; + case 'm': + error = vm_parse_memsize(optarg, &memsize); + if (error) + errx(EX_USAGE, "invalid memsize '%s'", optarg); + break; + case 'H': + guest_vmexit_on_hlt = 1; + break; + case 'I': + /* + * The "-I" option was used to add an ioapic to the + * virtual machine. + * + * An ioapic is now provided unconditionally for each + * virtual machine and this option is now deprecated. 
+ */ + break; + case 'P': + guest_vmexit_on_pause = 1; + break; + case 'e': + strictio = 1; + break; + case 'u': + rtc_localtime = 0; + break; + case 'U': + guest_uuid_str = optarg; + break; + case 'w': + strictmsr = 0; + break; + case 'W': + virtio_msix = 0; + break; + case 'x': + x2apic_mode = 1; + break; + case 'Y': + mptgen = 0; + break; + case 'h': + usage(0); + default: + usage(1); + } + } + argc -= optind; + argv += optind; + + if (argc != 1) + usage(1); + + vmname = argv[0]; + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + if (guest_ncpus < 1) { + fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); + exit(1); + } + + max_vcpus = num_vcpus_allowed(ctx); + if (guest_ncpus > max_vcpus) { + fprintf(stderr, "%d vCPUs requested but only %d available\n", + guest_ncpus, max_vcpus); + exit(1); + } + + fbsdrun_set_capabilities(ctx, BSP); + + if (dump_guest_memory) + vm_set_memflags(ctx, VM_MEM_F_INCORE); + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + if (err) { + fprintf(stderr, "Unable to setup memory (%d)\n", err); + exit(1); + } + + error = init_msr(); + if (error) { + fprintf(stderr, "init_msr error %d", error); + exit(1); + } + + init_mem(); + init_inout(); + pci_irq_init(ctx); + ioapic_init(ctx); + + rtc_init(ctx, rtc_localtime); + sci_init(ctx); + + /* + * Exit if a device emulation finds an error in it's initilization + */ + if (init_pci(ctx) != 0) + exit(1); + + if (gdb_port != 0) + init_dbgport(gdb_port); + + if (bvmcons) + init_bvmcons(); + + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + /* + * build the guest tables, MP etc. + */ + if (mptgen) { + error = mptable_build(ctx, guest_ncpus); + if (error) + exit(1); + } + + error = smbios_build(ctx); + assert(error == 0); + + if (acpi) { + error = acpi_build(ctx, guest_ncpus); + assert(error == 0); + } + + /* + * Change the proc title to include the VM name. 
+ */ + setproctitle("%s", vmname); + + /* + * Add CPU 0 + */ + fbsdrun_addcpu(ctx, BSP, BSP, rip); + + /* + * Head off to the main event dispatch loop + */ + mevent_dispatch(); + + exit(1); +} diff --git a/bhyve/bhyverun.h b/bhyve/bhyverun.h new file mode 100644 index 0000000..c51bf48 --- /dev/null +++ b/bhyve/bhyverun.h @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _FBSDRUN_H_ +#define _FBSDRUN_H_ + +#ifndef CTASSERT /* Allow lint to override */ +#define CTASSERT(x) _CTASSERT(x, __LINE__) +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 
1 : -1] +#endif + +#define VMEXIT_CONTINUE (0) +#define VMEXIT_ABORT (-1) + +struct vmctx; +extern int guest_ncpus; +extern char *guest_uuid_str; +extern char *vmname; + +void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); + +void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); +void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); +int fbsdrun_muxed(void); +int fbsdrun_vmexit_on_hlt(void); +int fbsdrun_vmexit_on_pause(void); +int fbsdrun_disable_x2apic(void); +int fbsdrun_virtio_msix(void); +#endif diff --git a/bhyve/block_if.c b/bhyve/block_if.c new file mode 100644 index 0000000..ef8e11e --- /dev/null +++ b/bhyve/block_if.c @@ -0,0 +1,822 @@ +/*- + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" +#include "mevent.h" +#include "block_if.h" + +#define BLOCKIF_SIG 0xb109b109 + +#define BLOCKIF_NUMTHR 8 +#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) + +enum blockop { + BOP_READ, + BOP_WRITE, + BOP_FLUSH, + BOP_DELETE +}; + +enum blockstat { + BST_FREE, + BST_BLOCK, + BST_PEND, + BST_BUSY, + BST_DONE +}; + +struct blockif_elem { + TAILQ_ENTRY(blockif_elem) be_link; + struct blockif_req *be_req; + enum blockop be_op; + enum blockstat be_status; + pthread_t be_tid; + off_t be_block; +}; + +struct blockif_ctxt { + int bc_magic; + int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; + int bc_rdonly; + off_t bc_size; + int bc_sectsz; + int bc_psectsz; + int bc_psectoff; + int bc_closing; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; + + /* Request elements and free/pending/busy queues */ + TAILQ_HEAD(, blockif_elem) bc_freeq; + TAILQ_HEAD(, blockif_elem) bc_pendq; + TAILQ_HEAD(, blockif_elem) bc_busyq; + struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; +}; + +static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; + +struct blockif_sig_elem { + pthread_mutex_t bse_mtx; + pthread_cond_t bse_cond; + int bse_pending; + struct blockif_sig_elem *bse_next; 
+}; + +static struct blockif_sig_elem *blockif_bse_head; + +static int +blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, + enum blockop op) +{ + struct blockif_elem *be, *tbe; + off_t off; + int i; + + be = TAILQ_FIRST(&bc->bc_freeq); + assert(be != NULL); + assert(be->be_status == BST_FREE); + TAILQ_REMOVE(&bc->bc_freeq, be, be_link); + be->be_req = breq; + be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + default: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; + TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); + return (be->be_status == BST_PEND); +} + +static int +blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) +{ + struct blockif_elem *be; + + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + be->be_status = BST_BUSY; + be->be_tid = t; + TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); + *bep = be; + return (1); +} + +static void +blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) +{ + struct blockif_elem *tbe; + + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } + be->be_tid = 0; + be->be_status = BST_FREE; + be->be_req = NULL; + TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 
+} + +static void +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) +{ + struct blockif_req *br; + off_t arg[2]; + ssize_t clen, len, off, boff, voff; + int i, err; + + br = be->be_req; + if (br->br_iovcnt <= 1) + buf = NULL; + err = 0; + switch (be->be_op) { + case BOP_READ: + if (buf == NULL) { + if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(br->br_iov[i].iov_base + voff, + buf + boff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } + break; + case BOP_WRITE: + if (bc->bc_rdonly) { + err = EROFS; + break; + } + if (buf == NULL) { + if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(buf + boff, + br->br_iov[i].iov_base + voff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (pwrite(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } + break; + case BOP_FLUSH: + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + err = errno; + } else if (fsync(bc->bc_fd)) + err = errno; + break; + case BOP_DELETE: + if (!bc->bc_candelete) + err = EOPNOTSUPP; + else if (bc->bc_rdonly) + err = EROFS; + else if (bc->bc_ischr) { + arg[0] = 
br->br_offset;
+			arg[1] = br->br_resid;
+			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
+				err = errno;
+			else
+				br->br_resid = 0;
+		} else
+			err = EOPNOTSUPP;
+		break;
+	default:
+		err = EINVAL;
+		break;
+	}
+
+	be->be_status = BST_DONE;
+
+	(*br->br_callback)(br, err);
+}
+
+/*
+ * Worker thread body.  Repeatedly dequeues requests and processes them
+ * with the context mutex dropped, then completes them with the mutex
+ * held.  Sleeps on bc_cond when there is no work and exits once
+ * bc_closing is set.  A MAXPHYS-sized scratch buffer is allocated and
+ * handed to blockif_proc() only when bc_isgeom is set.
+ */
+static void *
+blockif_thr(void *arg)
+{
+	struct blockif_ctxt *bc;
+	struct blockif_elem *be;
+	pthread_t t;
+	uint8_t *buf;
+
+	bc = arg;
+	if (bc->bc_isgeom)
+		buf = malloc(MAXPHYS);
+	else
+		buf = NULL;
+	t = pthread_self();
+
+	pthread_mutex_lock(&bc->bc_mtx);
+	for (;;) {
+		while (blockif_dequeue(bc, t, &be)) {
+			pthread_mutex_unlock(&bc->bc_mtx);
+			blockif_proc(bc, be, buf);
+			pthread_mutex_lock(&bc->bc_mtx);
+			blockif_complete(bc, be);
+		}
+		/* Check ctxt status here to see if exit requested */
+		if (bc->bc_closing)
+			break;
+		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
+	}
+	pthread_mutex_unlock(&bc->bc_mtx);
+
+	if (buf)
+		free(buf);
+	pthread_exit(NULL);
+	return (NULL);
+}
+
+/*
+ * mevent callback for SIGCONT.  Pops every element off the lock-free
+ * blockif_bse_head list and wakes the thread waiting on each one by
+ * clearing bse_pending and signalling bse_cond.
+ */
+static void
+blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
+{
+	struct blockif_sig_elem *bse;
+
+	for (;;) {
+		/*
+		 * Process the entire list even if not intended for
+		 * this thread.
+		 */
+		do {
+			bse = blockif_bse_head;
+			if (bse == NULL)
+				return;
+		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+					    (uintptr_t)bse,
+					    (uintptr_t)bse->bse_next));
+
+		pthread_mutex_lock(&bse->bse_mtx);
+		bse->bse_pending = 0;
+		pthread_cond_signal(&bse->bse_cond);
+		pthread_mutex_unlock(&bse->bse_mtx);
+	}
+}
+
+/*
+ * One-time setup (run via pthread_once from blockif_open): register the
+ * SIGCONT mevent handler and set the signal's disposition to SIG_IGN.
+ */
+static void
+blockif_init(void)
+{
+	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
+	(void) signal(SIGCONT, SIG_IGN);
+}
+
+/*
+ * Open a block backend described by 'optstr' and start its worker threads.
+ *
+ * 'optstr' is a comma-separated list whose first element is the pathname
+ * of the backing file or device, optionally followed by "nocache",
+ * "sync"/"direct", "ro" and "sectorsize=logical[/physical]".  'ident' is
+ * only used to name the worker threads.
+ *
+ * Returns the new context, or NULL after printing a diagnostic if the
+ * options are invalid, the backing store cannot be opened or queried, or
+ * memory allocation fails.
+ */
+struct blockif_ctxt *
+blockif_open(const char *optstr, const char *ident)
+{
+	char tname[MAXCOMLEN + 1];
+	char name[MAXPATHLEN];
+	char *nopt, *xopts, *cp;
+	struct blockif_ctxt *bc;
+	struct stat sbuf;
+	struct diocgattr_arg arg;
+	off_t size, psectsz, psectoff;
+	int extra, fd, i, sectsz;
+	int nocache, sync, ro, candelete, geom, ssopt, pssopt;
+
+	pthread_once(&blockif_once, blockif_init);
+
+	fd = -1;
+	ssopt = 0;
+	nocache = 0;
+	sync = 0;
+	ro = 0;
+
+	/*
+	 * The first element in the optstring is always a pathname.
+	 * Optional elements follow
+	 */
+	nopt = xopts = strdup(optstr);
+	if (nopt == NULL) {
+		perror("strdup");
+		goto err;
+	}
+	while (xopts != NULL) {
+		cp = strsep(&xopts, ",");
+		if (cp == nopt)		/* file or device pathname */
+			continue;
+		else if (!strcmp(cp, "nocache"))
+			nocache = 1;
+		else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
+			sync = 1;
+		else if (!strcmp(cp, "ro"))
+			ro = 1;
+		else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
+			;
+		else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
+			pssopt = ssopt;
+		else {
+			fprintf(stderr, "Invalid device option \"%s\"\n", cp);
+			goto err;
+		}
+	}
+
+	extra = 0;
+	if (nocache)
+		extra |= O_DIRECT;
+	if (sync)
+		extra |= O_SYNC;
+
+	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
+	if (fd < 0 && !ro) {
+		/* The r/w open failed: retry with a r/o open */
+		fd = open(nopt, O_RDONLY | extra);
+		ro = 1;
+	}
+
+	if (fd < 0) {
+		perror("Could not open backing file");
+		goto err;
+	}
+
+	if (fstat(fd, &sbuf) < 0) {
+		perror("Could not stat backing file");
+		goto err;
+	}
+
+	/*
+	 * Deal with raw devices
+	 */
+	size = sbuf.st_size;
+	sectsz = DEV_BSIZE;
+	psectsz = psectoff = 0;
+	candelete = geom = 0;
+	if (S_ISCHR(sbuf.st_mode)) {
+		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+			perror("Could not fetch dev blk/sector size");
+			goto err;
+		}
+		assert(size != 0);
+		assert(sectsz != 0);
+		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
+			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
+		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
+		arg.len = sizeof(arg.value.i);
+		if (ioctl(fd, DIOCGATTR, &arg) == 0)
+			candelete = arg.value.i;
+		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
+			geom = 1;
+	} else
+		psectsz = sbuf.st_blksize;
+
+	if (ssopt != 0) {
+		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
+		    ssopt > pssopt) {
+			fprintf(stderr, "Invalid sector size %d/%d\n",
+			    ssopt, pssopt);
+			goto err;
+		}
+
+		/*
+		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
+		 * size be a multiple of the device's sector size.
+		 *
+		 * Validate that the emulated sector size complies with this
+		 * requirement.
+		 */
+		if (S_ISCHR(sbuf.st_mode)) {
+			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
+				fprintf(stderr, "Sector size %d incompatible "
+				    "with underlying device sector size %d\n",
+				    ssopt, sectsz);
+				goto err;
+			}
+		}
+
+		sectsz = ssopt;
+		psectsz = pssopt;
+		psectoff = 0;
+	}
+
+	bc = calloc(1, sizeof(struct blockif_ctxt));
+	if (bc == NULL) {
+		perror("calloc");
+		goto err;
+	}
+
+	bc->bc_magic = BLOCKIF_SIG;
+	bc->bc_fd = fd;
+	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
+	bc->bc_isgeom = geom;
+	bc->bc_candelete = candelete;
+	bc->bc_rdonly = ro;
+	bc->bc_size = size;
+	bc->bc_sectsz = sectsz;
+	bc->bc_psectsz = psectsz;
+	bc->bc_psectoff = psectoff;
+	pthread_mutex_init(&bc->bc_mtx, NULL);
+	pthread_cond_init(&bc->bc_cond, NULL);
+	TAILQ_INIT(&bc->bc_freeq);
+	TAILQ_INIT(&bc->bc_pendq);
+	TAILQ_INIT(&bc->bc_busyq);
+	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
+		bc->bc_reqs[i].be_status = BST_FREE;
+		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
+	}
+
+	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
+		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
+		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
+		pthread_set_name_np(bc->bc_btid[i], tname);
+	}
+
+	/* The option string copy is no longer referenced past this point */
+	free(nopt);
+	return (bc);
+err:
+	if (fd >= 0)
+		close(fd);
+	free(nopt);
+	return (NULL);
+}
+
+/*
+ * Queue 'breq' for operation 'op'.  Returns 0 on success or E2BIG when
+ * no free request slot is available.
+ */
+static int
+blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
+		enum blockop op)
+{
+	int err;
+
+	err = 0;
+
+	pthread_mutex_lock(&bc->bc_mtx);
+	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
+		/*
+		 * Enqueue and inform the block i/o thread
+		 * that there is work available
+		 */
+		if (blockif_enqueue(bc, breq, op))
+			pthread_cond_signal(&bc->bc_cond);
+	} else {
+		/*
+		 * Callers are not allowed to enqueue more than
+		 * the specified blockif queue limit. Return an
+		 * error to indicate that the queue length has been
+		 * exceeded.
+		 */
+		err = E2BIG;
+	}
+	pthread_mutex_unlock(&bc->bc_mtx);
+
+	return (err);
+}
+
+/* Queue a read request; see blockif_request() for the return value. */
+int
+blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (blockif_request(bc, breq, BOP_READ));
+}
+
+/* Queue a write request; see blockif_request() for the return value. */
+int
+blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (blockif_request(bc, breq, BOP_WRITE));
+}
+
+/* Queue a flush request; see blockif_request() for the return value. */
+int
+blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (blockif_request(bc, breq, BOP_FLUSH));
+}
+
+/* Queue a delete request; see blockif_request() for the return value. */
+int
+blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (blockif_request(bc, breq, BOP_DELETE));
+}
+
+/*
+ * Cancel a previously queued request.
+ *
+ * Returns 0 if the request was still pending and has been completed
+ * here, EINVAL if it is neither pending nor in flight, and EBUSY if it
+ * was in flight and its processing thread has been interrupted.
+ */
+int
+blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+	struct blockif_elem *be;
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+
+	pthread_mutex_lock(&bc->bc_mtx);
+	/*
+	 * Check pending requests.
+	 */
+	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+		if (be->be_req == breq)
+			break;
+	}
+	if (be != NULL) {
+		/*
+		 * Found it.
+		 */
+		blockif_complete(bc, be);
+		pthread_mutex_unlock(&bc->bc_mtx);
+
+		return (0);
+	}
+
+	/*
+	 * Check in-flight requests.
+	 */
+	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
+		if (be->be_req == breq)
+			break;
+	}
+	if (be == NULL) {
+		/*
+		 * Didn't find it.
+		 */
+		pthread_mutex_unlock(&bc->bc_mtx);
+		return (EINVAL);
+	}
+
+	/*
+	 * Interrupt the processing thread to force it to return
+	 * prematurely via its normal callback path.
+	 */
+	while (be->be_status == BST_BUSY) {
+		struct blockif_sig_elem bse, *old_head;
+
+		pthread_mutex_init(&bse.bse_mtx, NULL);
+		pthread_cond_init(&bse.bse_cond, NULL);
+
+		bse.bse_pending = 1;
+
+		/* Push this element onto the lock-free list */
+		do {
+			old_head = blockif_bse_head;
+			bse.bse_next = old_head;
+		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+					    (uintptr_t)old_head,
+					    (uintptr_t)&bse));
+
+		pthread_kill(be->be_tid, SIGCONT);
+
+		/* Wait for blockif_sigcont_handler() to clear bse_pending */
+		pthread_mutex_lock(&bse.bse_mtx);
+		while (bse.bse_pending)
+			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
+		pthread_mutex_unlock(&bse.bse_mtx);
+	}
+
+	pthread_mutex_unlock(&bc->bc_mtx);
+
+	/*
+	 * The processing thread has been interrupted.  Since it's not
+	 * clear if the callback has been invoked yet, return EBUSY.
+	 */
+	return (EBUSY);
+}
+
+/*
+ * Shut down a block context: ask the worker threads to exit, join them,
+ * close the backing descriptor and free the context.  Always returns 0.
+ */
+int
+blockif_close(struct blockif_ctxt *bc)
+{
+	void *jval;
+	int err, i;
+
+	err = 0;
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+
+	/*
+	 * Stop the block i/o thread
+	 */
+	pthread_mutex_lock(&bc->bc_mtx);
+	bc->bc_closing = 1;
+	pthread_mutex_unlock(&bc->bc_mtx);
+	pthread_cond_broadcast(&bc->bc_cond);
+	for (i = 0; i < BLOCKIF_NUMTHR; i++)
+		pthread_join(bc->bc_btid[i], &jval);
+
+	/* XXX Cancel queued i/o's ??? */
+
+	/*
+	 * Release resources
+	 */
+	bc->bc_magic = 0;
+	close(bc->bc_fd);
+	free(bc);
+
+	return (0);
+}
+
+/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+	off_t sectors;		/* total sectors of the block dev */
+	off_t hcyl;		/* cylinders times heads */
+	uint16_t secpt;		/* sectors per track */
+	uint8_t heads;
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+
+	sectors = bc->bc_size / bc->bc_sectsz;
+
+	/* Clamp the size to the largest possible with CHS */
+	if (sectors > 65535UL*16*255)
+		sectors = 65535UL*16*255;
+
+	if (sectors >= 65536UL*16*63) {
+		/* Large device: fixed geometry of 16 heads, 255 sectors/track */
+		secpt = 255;
+		heads = 16;
+		hcyl = sectors / secpt;
+	} else {
+		/*
+		 * Start with 17 sectors/track and derive the head count,
+		 * rounding up so that each head covers at most 1024
+		 * cylinders; at least 4 heads are used.
+		 */
+		secpt = 17;
+		hcyl = sectors / secpt;
+		heads = (hcyl + 1023) / 1024;
+
+		if (heads < 4)
+			heads = 4;
+
+		/* Does not fit: retry with 31, then 63 sectors/track */
+		if (hcyl >= (heads * 1024) || heads > 16) {
+			secpt = 31;
+			heads = 16;
+			hcyl = sectors / secpt;
+		}
+		if (hcyl >= (heads * 1024)) {
+			secpt = 63;
+			heads = 16;
+			hcyl = sectors / secpt;
+		}
+	}
+
+	*c = hcyl / heads;
+	*h = heads;
+	*s = secpt;
+}
+
+/*
+ * Accessors
+ */
+
+/* Return the size of the backing store (bc_size). */
+off_t
+blockif_size(struct blockif_ctxt *bc)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (bc->bc_size);
+}
+
+/* Return the logical sector size (bc_sectsz). */
+int
+blockif_sectsz(struct blockif_ctxt *bc)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (bc->bc_sectsz);
+}
+
+/* Return the physical sector size and offset through 'size' and 'off'. */
+void
+blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	*size = bc->bc_psectsz;
+	*off = bc->bc_psectoff;
+}
+
+/* Return the advertised queue depth, one less than BLOCKIF_MAXREQ. */
+int
+blockif_queuesz(struct blockif_ctxt *bc)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (BLOCKIF_MAXREQ - 1);
+}
+
+/* Return the read-only flag (bc_rdonly). */
+int
+blockif_is_ro(struct blockif_ctxt *bc)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (bc->bc_rdonly);
+}
+
+/* Return the delete-capability flag (bc_candelete). */
+int
+blockif_candelete(struct blockif_ctxt *bc)
+{
+
+	assert(bc->bc_magic == BLOCKIF_SIG);
+	return (bc->bc_candelete);
+}
diff --git a/bhyve/block_if.h b/bhyve/block_if.h
new file mode 100644
index 0000000..8e63407
--- /dev/null
+++ b/bhyve/block_if.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2013  Peter Grehan
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The block API to be used by bhyve block-device emulations. The routines + * are thread safe, with no assumptions about the context of the completion + * callback - it may occur in the caller's context, or asynchronously in + * another thread. 
+ */
+
+#ifndef _BLOCK_IF_H_
+#define _BLOCK_IF_H_
+
+/*
+ * NOTE(review): the header names after the "#include" directives below are
+ * missing in this copy of the patch and must be restored before building.
+ */
+#include
+#include
+
+#define BLOCKIF_IOV_MAX		33	/* not practical to be IOV_MAX */
+
+/*
+ * A single block i/o request handed to blockif_read/write/flush/delete.
+ */
+struct blockif_req {
+	struct iovec	br_iov[BLOCKIF_IOV_MAX];	/* scatter/gather list */
+	int		br_iovcnt;	/* presumably the number of br_iov entries in use -- confirm */
+	off_t		br_offset;	/* offset of the request */
+	ssize_t		br_resid;	/* presumably bytes not yet transferred -- confirm */
+	void		(*br_callback)(struct blockif_req *req, int err);
+					/* completion callback, called with an errno-style 'err' */
+	void		*br_param;	/* not referenced in the visible code; presumably caller data -- confirm */
+};
+
+struct blockif_ctxt;
+struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
+off_t	blockif_size(struct blockif_ctxt *bc);
+void	blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+    uint8_t *s);
+int	blockif_sectsz(struct blockif_ctxt *bc);
+void	blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
+int	blockif_queuesz(struct blockif_ctxt *bc);
+int	blockif_is_ro(struct blockif_ctxt *bc);
+int	blockif_candelete(struct blockif_ctxt *bc);
+int	blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
+int	blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
+int	blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
+int	blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
+int	blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
+int	blockif_close(struct blockif_ctxt *bc);
+
+#endif /* _BLOCK_IF_H_ */
diff --git a/bhyve/consport.c b/bhyve/consport.c
new file mode 100644
index 0000000..4074e95
--- /dev/null
+++ b/bhyve/consport.c
@@ -0,0 +1,153 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * NOTE(review): the header names after the "#include <...>" directives below
+ * are missing in this copy of the patch and must be restored before building.
+ */
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "inout.h"
+#include "pci_lpc.h"
+
+#define	BVM_CONSOLE_PORT	0x220
+#define	BVM_CONS_SIG		('b' << 8 | 'v')
+
+static struct termios tio_orig, tio_new;
+
+/* atexit handler: put the terminal back into the state saved by ttyopen(). */
+static void
+ttyclose(void)
+{
+	tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+/*
+ * Save the current terminal settings, switch stdin to raw mode and
+ * arrange for the saved settings to be restored at exit.
+ */
+static void
+ttyopen(void)
+{
+	tcgetattr(STDIN_FILENO, &tio_orig);
+
+	/*
+	 * Derive the raw settings from the current ones; cfmakeraw() only
+	 * adjusts flags, so every other field must come from tio_orig.
+	 */
+	tio_new = tio_orig;
+	cfmakeraw(&tio_new);
+	tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+	atexit(ttyclose);
+}
+
+/* Poll stdin with a zero timeout; true if select() reports it readable. */
+static bool
+tty_char_available(void)
+{
+	fd_set rfds;
+	struct timeval tv;
+
+	FD_ZERO(&rfds);
+	FD_SET(STDIN_FILENO, &rfds);
+	tv.tv_sec = 0;
+	tv.tv_usec = 0;
+	if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+		return (true);
+	} else {
+		return (false);
+	}
+}
+
+/*
+ * Read one character from stdin without blocking.  Returns the character
+ * (0-255), or -1 if none is available or the read does not deliver one.
+ */
+static int
+ttyread(void)
+{
+	char rb;
+
+	if (tty_char_available()) {
+		/* Do not hand back an uninitialized byte on EOF or error */
+		if (read(STDIN_FILENO, &rb, 1) != 1)
+			return (-1);
+		return (rb & 0xff);
+	} else {
+		return (-1);
+	}
+}
+
+/* Write one character to stdout; the result of write() is ignored. */
+static void
+ttywrite(unsigned char wb)
+{
+	(void) write(STDOUT_FILENO, &wb, 1);
+}
+
+/*
+ * I/O port handler for the bvm console port.
+ *
+ * A 2-byte read returns the BVM_CONS_SIG signature and a 1-byte read
+ * returns 0xff.  Only 4-byte accesses perform console i/o: a read returns
+ * the next character (or -1 if none) and a write outputs *eax.  The tty
+ * is switched to raw mode on the first such access.  Any other access
+ * fails with -1.
+ */
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+		uint32_t *eax, void *arg)
+{
+	static int opened;
+
+	if (bytes == 2 && in) {
+		*eax = BVM_CONS_SIG;
+		return (0);
+	}
+
+	/*
+	 * Guests might probe this port to look for old ISA devices
+	 * using single-byte reads.  Return 0xff for those.
+	 */
+	if (bytes == 1 && in) {
+		*eax = 0xff;
+		return (0);
+	}
+
+	if (bytes != 4)
+		return (-1);
+
+	if (!opened) {
+		ttyopen();
+		opened = 1;
+	}
+
+	if (in)
+		*eax = ttyread();
+	else
+		ttywrite(*eax);
+
+	return (0);
+}
+
+SYSRES_IO(BVM_CONSOLE_PORT, 4);
+
+static struct inout_port consport = {
+	"bvmcons",
+	BVM_CONSOLE_PORT,
+	1,
+	IOPORT_F_INOUT,
+	console_handler
+};
+
+/* Register the bvm console port handler. */
+void
+init_bvmcons(void)
+{
+
+	register_inout(&consport);
+}
diff --git a/bhyve/dbgport.c b/bhyve/dbgport.c
new file mode 100644
index 0000000..534ae65
--- /dev/null
+++ b/bhyve/dbgport.c
@@ -0,0 +1,142 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * NOTE(review): the header names after the "#include <...>" directives below
+ * are missing in this copy of the patch and must be restored before building.
+ */
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "inout.h"
+#include "dbgport.h"
+#include "pci_lpc.h"
+
+#define	BVM_DBG_PORT	0x224
+#define	BVM_DBG_SIG	('B' << 8 | 'V')
+
+/* Listening socket and the currently accepted connection (-1 if none). */
+static int listen_fd, conn_fd;
+
+static struct sockaddr_in sin;
+
+/*
+ * I/O port handler for the debug port.
+ *
+ * A 2-byte read returns the BVM_DBG_SIG signature; any other access that
+ * is not 4 bytes wide fails with -1.  A 4-byte access blocks in accept()
+ * until a connection exists, then transfers one byte: a read returns the
+ * byte, or -1 if the non-blocking socket has no data; a write sends the
+ * low byte of *eax.  If the transfer fails the connection is closed and
+ * the handler goes back to waiting for a new one.
+ */
+static int
+dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+	    uint32_t *eax, void *arg)
+{
+	char ch;
+	int nwritten, nread, printonce;
+
+	if (bytes == 2 && in) {
+		*eax = BVM_DBG_SIG;
+		return (0);
+	}
+
+	if (bytes != 4)
+		return (-1);
+
+again:
+	printonce = 0;
+	while (conn_fd < 0) {
+		if (!printonce) {
+			printf("Waiting for connection from gdb\r\n");
+			printonce = 1;
+		}
+		conn_fd = accept(listen_fd, NULL, NULL);
+		if (conn_fd >= 0)
+			fcntl(conn_fd, F_SETFL, O_NONBLOCK);
+		else if (errno != EINTR)
+			perror("accept");
+	}
+
+	if (in) {
+		nread = read(conn_fd, &ch, 1);
+		if (nread == -1 && errno == EAGAIN)
+			*eax = -1;	/* no data pending */
+		else if (nread == 1)
+			*eax = ch;
+		else {
+			/* EOF or hard error: drop the connection and wait */
+			close(conn_fd);
+			conn_fd = -1;
+			goto again;
+		}
+	} else {
+		ch = *eax;
+		nwritten = write(conn_fd, &ch, 1);
+		if (nwritten != 1) {
+			/* Write failed: drop the connection and wait */
+			close(conn_fd);
+			conn_fd = -1;
+			goto again;
+		}
+	}
+	return (0);
+}
+
+static struct inout_port dbgport = {
+	"bvmdbg",
+	BVM_DBG_PORT,
+	1,
+	IOPORT_F_INOUT,
+	dbg_handler
+};
+
+SYSRES_IO(BVM_DBG_PORT, 4);
+
+/*
+ * Create a TCP socket listening on port 'sport' of any local address and
+ * register the debug port handler.  Exits the process if the socket cannot
+ * be created, bound or put into the listening state.
+ */
+void
+init_dbgport(int sport)
+{
+	conn_fd = -1;
+
+	if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+		perror("socket");
+		exit(1);
+	}
+
+	sin.sin_len = sizeof(sin);
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = htons(sport);
+
+	if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+		perror("bind");
+		exit(1);
+	}
+
+	if (listen(listen_fd, 1) < 0) {
+		perror("listen");
+		exit(1);
+	}
+
+	register_inout(&dbgport);
+}
diff --git a/bhyve/dbgport.h b/bhyve/dbgport.h
new file mode 100644
index 0000000..2ddcbf8
--- /dev/null
+++ b/bhyve/dbgport.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#ifndef _DBGPORT_H_ +#define _DBGPORT_H_ + +void init_dbgport(int port); + +#endif diff --git a/bhyve/inout.c b/bhyve/inout.c new file mode 100644 index 0000000..929bb3c --- /dev/null +++ b/bhyve/inout.c @@ -0,0 +1,297 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * NOTE(review): the header names after the "#include <...>" directives below
+ * are missing in this copy of the patch and must be restored before building.
+ */
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "bhyverun.h"
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define	MAX_IOPORTS	(1 << 16)
+
+/* Assert that [port, port + size) is a non-empty range of valid ports. */
+#define	VERIFY_IOPORT(port, size) \
+	assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS)
+
+/* Per-port dispatch table, indexed by i/o port number. */
+static struct {
+	const char	*name;
+	int		flags;
+	inout_func_t	handler;
+	void		*arg;
+} inout_handlers[MAX_IOPORTS];
+
+/*
+ * Handler used for ports nobody has claimed: reads return all ones for
+ * the access width, writes are ignored.  Always succeeds.
+ */
+static int
+default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+              uint32_t *eax, void *arg)
+{
+	if (in) {
+		switch (bytes) {
+		case 4:
+			*eax = 0xffffffff;
+			break;
+		case 2:
+			*eax = 0xffff;
+			break;
+		case 1:
+			*eax = 0xff;
+			break;
+		}
+	}
+
+	return (0);
+}
+
+/* Install default_inout on 'size' ports starting at 'start'. */
+static void
+register_default_iohandler(int start, int size)
+{
+	struct inout_port iop;
+
+	VERIFY_IOPORT(start, size);
+
+	bzero(&iop, sizeof(iop));
+	iop.name = "default";
+	iop.port = start;
+	iop.size = size;
+	iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT;
+	iop.handler = default_inout;
+
+	register_inout(&iop);
+}
+
+/*
+ * Emulate the in/out instruction described by 'vmexit' by dispatching to
+ * the handler registered for the port.
+ *
+ * Returns -1 if 'strict' is set and only the default handler is
+ * registered, if the handler does not permit the access direction, or if
+ * guest memory cannot be set up for a string operation; otherwise returns
+ * the handler's result.  String operations are emulated at most 16
+ * iterations at a time, and the instruction is restarted if iterations
+ * remain.
+ */
+int
+emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
+{
+	int addrsize, bytes, flags, in, port, prot, rep;
+	uint32_t eax, val;
+	inout_func_t handler;
+	void *arg;
+	int error, fault, retval;
+	enum vm_reg_name idxreg;
+	uint64_t gla, index, iterations, count;
+	struct vm_inout_str *vis;
+	struct iovec iov[2];
+
+	bytes = vmexit->u.inout.bytes;
+	in = vmexit->u.inout.in;
+	port = vmexit->u.inout.port;
+
+	assert(port < MAX_IOPORTS);
+	assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+	handler = inout_handlers[port].handler;
+
+	if (strict && handler == default_inout)
+		return (-1);
+
+	flags = inout_handlers[port].flags;
+	arg = inout_handlers[port].arg;
+
+	/* Reject accesses in a direction the handler did not register for */
+	if (in) {
+		if (!(flags & IOPORT_F_IN))
+			return (-1);
+	} else {
+		if (!(flags & IOPORT_F_OUT))
+			return (-1);
+	}
+
+	retval = 0;
+	if (vmexit->u.inout.string) {
+		vis = &vmexit->u.inout_str;
+		rep = vis->inout.rep;
+		addrsize = vis->addrsize;
+		/* "in" stores to guest memory, "out" loads from it */
+		prot = in ? PROT_WRITE : PROT_READ;
+		assert(addrsize == 2 || addrsize == 4 || addrsize == 8);
+
+		/* Index register */
+		idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
+		index = vis->index & vie_size2mask(addrsize);
+
+		/* Count register */
+		count = vis->count & vie_size2mask(addrsize);
+
+		/* Limit number of back-to-back in/out emulations to 16 */
+		iterations = MIN(count, 16);
+		while (iterations > 0) {
+			assert(retval == 0);
+			if (vie_calculate_gla(vis->paging.cpu_mode,
+			    vis->seg_name, &vis->seg_desc, index, bytes,
+			    addrsize, prot, &gla)) {
+				vm_inject_gp(ctx, vcpu);
+				break;
+			}
+
+			error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
+			    bytes, prot, iov, nitems(iov), &fault);
+			if (error) {
+				retval = -1;  /* Unrecoverable error */
+				break;
+			} else if (fault) {
+				retval = 0;  /* Resume guest to handle fault */
+				break;
+			}
+
+			if (vie_alignment_check(vis->paging.cpl, bytes,
+			    vis->cr0, vis->rflags, gla)) {
+				vm_inject_ac(ctx, vcpu, 0);
+				break;
+			}
+
+			val = 0;
+			if (!in)
+				vm_copyin(ctx, vcpu, iov, &val, bytes);
+
+			retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+			if (retval != 0)
+				break;
+
+			if (in)
+				vm_copyout(ctx, vcpu, &val, iov, bytes);
+
+			/* Update index */
+			if (vis->rflags & PSL_D)
+				index -= bytes;
+			else
+				index += bytes;
+
+			count--;
+			iterations--;
+		}
+
+		/* Update index register */
+		error = vie_update_register(ctx, vcpu, idxreg, index, addrsize);
+		assert(error == 0);
+
+		/*
+		 * Update count register only if the instruction had a repeat
+		 * prefix.
+		 */
+		if (rep) {
+			error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX,
+			    count, addrsize);
+			assert(error == 0);
+		}
+
+		/* Restart the instruction if more iterations remain */
+		if (retval == 0 && count != 0) {
+			error = vm_restart_instruction(ctx, vcpu);
+			assert(error == 0);
+		}
+	} else {
+		eax = vmexit->u.inout.eax;
+		val = eax & vie_size2mask(bytes);
+		retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+		if (retval == 0 && in) {
+			/* Merge the result into the low 'bytes' of eax */
+			eax &= ~vie_size2mask(bytes);
+			eax |= val & vie_size2mask(bytes);
+			error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
+			    eax);
+			assert(error == 0);
+		}
+	}
+	return (retval);
+}
+
+/*
+ * Initialize the dispatch table: every port gets the default handler,
+ * then each entry of the inout_port_set linker set is installed at its
+ * 'port' with a NULL handler argument.
+ */
+void
+init_inout(void)
+{
+	struct inout_port **iopp, *iop;
+
+	/*
+	 * Set up the default handler for all ports
+	 */
+	register_default_iohandler(0, MAX_IOPORTS);
+
+	/*
+	 * Overwrite with specified handlers
+	 */
+	SET_FOREACH(iopp, inout_port_set) {
+		iop = *iopp;
+		assert(iop->port < MAX_IOPORTS);
+		inout_handlers[iop->port].name = iop->name;
+		inout_handlers[iop->port].flags = iop->flags;
+		inout_handlers[iop->port].handler = iop->handler;
+		inout_handlers[iop->port].arg = NULL;
+	}
+}
+
+/*
+ * Install 'iop' on ports [iop->port, iop->port + iop->size).  Returns 0
+ * on success, or -1 if a non-default registration would cover a port that
+ * is not currently owned by the default handler.
+ */
+int
+register_inout(struct inout_port *iop)
+{
+	int i;
+
+	VERIFY_IOPORT(iop->port, iop->size);
+
+	/*
+	 * Verify that the new registration is not overwriting an already
+	 * allocated i/o range.
+	 */
+	if ((iop->flags & IOPORT_F_DEFAULT) == 0) {
+		for (i = iop->port; i < iop->port + iop->size; i++) {
+			if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0)
+				return (-1);
+		}
+	}
+
+	for (i = iop->port; i < iop->port + iop->size; i++) {
+		inout_handlers[i].name = iop->name;
+		inout_handlers[i].flags = iop->flags;
+		inout_handlers[i].handler = iop->handler;
+		inout_handlers[i].arg = iop->arg;
+	}
+
+	return (0);
+}
+
+/*
+ * Hand the ports covered by 'iop' back to the default handler.  Asserts
+ * that the first port is currently registered under iop->name (pointer
+ * comparison).  Always returns 0.
+ */
+int
+unregister_inout(struct inout_port *iop)
+{
+
+	VERIFY_IOPORT(iop->port, iop->size);
+	assert(inout_handlers[iop->port].name == iop->name);
+
+	register_default_iohandler(iop->port, iop->size);
+
+	return (0);
+}
diff --git a/bhyve/inout.h b/bhyve/inout.h
new file mode 100644
index 0000000..7f39095
--- /dev/null
+++ b/bhyve/inout.h
@@ -0,0 +1,79 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_INOUT_H_
+#define	_INOUT_H_
+
+/*
+ * NOTE(review): the header name after the "#include" directive below is
+ * missing in this copy of the patch and must be restored before building.
+ */
+#include
+
+struct vmctx;
+struct vm_exit;
+
+/*
+ * inout emulation handlers return 0 on success and -1 on failure.
+ */
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+			    int bytes, uint32_t *eax, void *arg);
+
+/*
+ * Description of an i/o port range and the handler that emulates it.
+ */
+struct inout_port {
+	const char 	*name;		/* label stored in the dispatch table */
+	int		port;		/* first port of the range */
+	int		size;		/* number of consecutive ports */
+	int		flags;		/* IOPORT_F_* */
+	inout_func_t	handler;	/* emulation callback */
+	void		*arg;		/* passed to 'handler' as 'arg' */
+};
+#define	IOPORT_F_IN		0x1
+#define	IOPORT_F_OUT		0x2
+#define	IOPORT_F_INOUT		(IOPORT_F_IN | IOPORT_F_OUT)
+
+/*
+ * The following flags are used internally and must not be used by
+ * device models.
+ */
+#define	IOPORT_F_DEFAULT	0x80000000	/* claimed by default handler */
+
+/*
+ * Define a single-port inout_port with a NULL argument and add it to the
+ * inout_port_set linker set.
+ */
+#define	INOUT_PORT(name, port, flags, handler) \
+	static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+		#name, \
+		(port), \
+		1, \
+		(flags), \
+		(handler), \
+		0 \
+	}; \
+	DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
+
+void	init_inout(void);
+int	emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit,
+		      int strict);
+int	register_inout(struct inout_port *iop);
+int	unregister_inout(struct inout_port *iop);
+void	init_bvmcons(void);
+
+#endif	/* _INOUT_H_ */
diff --git a/bhyve/ioapic.c b/bhyve/ioapic.c
new file mode 100644
index 0000000..0ad69d9
--- /dev/null
+++ b/bhyve/ioapic.c
@@ -0,0 +1,74 @@
+/*-
+ * Copyright (c) 2014 Hudson River Trading LLC
+ * Written by: John H. Baldwin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * NOTE(review): the header names after the "#include <...>" directives below
+ * are missing in this copy of the patch and must be restored before building.
+ */
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+
+#include
+#include
+
+#include "ioapic.h"
+
+/*
+ * Assign PCI INTx interrupts to I/O APIC pins in a round-robin
+ * fashion.  Note that we have no idea what the HPET is using, but the
+ * HPET is also programmable whereas this is intended for hardwired
+ * PCI interrupts.
+ *
+ * This assumes a single I/O APIC where pins >= 16 are permitted for
+ * PCI devices.
+ */
+static int pci_pins;
+
+/*
+ * Record how many I/O APIC pins are available to PCI devices.  The count
+ * is zero when the pin count cannot be queried or when the I/O APIC has
+ * 16 pins or fewer.
+ */
+void
+ioapic_init(struct vmctx *ctx)
+{
+
+	/* A failed query and a too-small pin count are handled alike. */
+	if (vm_ioapic_pincount(ctx, &pci_pins) < 0 || pci_pins <= 16) {
+		pci_pins = 0;
+		return;
+	}
+
+	/* Ignore the first 16 pins. */
+	pci_pins -= 16;
+}
+
+/*
+ * Hand out the next pin number (16 or above) in round-robin order, or -1
+ * when no pins are available for PCI devices.
+ */
+int
+ioapic_pci_alloc_irq(void)
+{
+	static int last_pin;
+	int pin;
+
+	if (pci_pins == 0)
+		return (-1);
+
+	pin = 16 + (last_pin % pci_pins);
+	last_pin++;
+	return (pin);
+}
diff --git a/bhyve/ioapic.h b/bhyve/ioapic.h
new file mode 100644
index 0000000..efdd3c6
--- /dev/null
+++ b/bhyve/ioapic.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2014 Hudson River Trading LLC
+ * Written by: John H. Baldwin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IOAPIC_H_
+#define	_IOAPIC_H_
+
+/*
+ * Allocate a PCI IRQ from the I/O APIC.
+ */ +void ioapic_init(struct vmctx *ctx); +int ioapic_pci_alloc_irq(void); + +#endif diff --git a/bhyve/mem.c b/bhyve/mem.c new file mode 100644 index 0000000..2a9f430 --- /dev/null +++ b/bhyve/mem.c @@ -0,0 +1,291 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Memory ranges are represented with an RB tree. On insertion, the range + * is checked for overlaps. On lookup, the key has the same base and limit + * so it can be searched within the range. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mem.h" + +struct mmio_rb_range { + RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */ + struct mem_range mr_param; + uint64_t mr_base; + uint64_t mr_end; +}; + +struct mmio_rb_tree; +RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); + +RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback; + +/* + * Per-vCPU cache. Since most accesses from a vCPU will be to + * consecutive addresses in a range, it makes sense to cache the + * result of a lookup. + */ +static struct mmio_rb_range *mmio_hint[VM_MAXCPU]; + +static pthread_rwlock_t mmio_rwlock; + +static int +mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b) +{ + if (a->mr_end < b->mr_base) + return (-1); + else if (a->mr_base > b->mr_end) + return (1); + return (0); +} + +static int +mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, + struct mmio_rb_range **entry) +{ + struct mmio_rb_range find, *res; + + find.mr_base = find.mr_end = addr; + + res = RB_FIND(mmio_rb_tree, rbt, &find); + + if (res != NULL) { + *entry = res; + return (0); + } + + return (ENOENT); +} + +static int +mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) +{ + struct mmio_rb_range *overlap; + + overlap = RB_INSERT(mmio_rb_tree, rbt, new); + + if (overlap != NULL) { +#ifdef RB_DEBUG + printf("overlap detected: new %lx:%lx, tree %lx:%lx\n", + new->mr_base, new->mr_end, + overlap->mr_base, overlap->mr_end); +#endif + + return (EEXIST); + } + + return (0); +} + +#if 0 +static void +mmio_rb_dump(struct mmio_rb_tree *rbt) +{ + struct mmio_rb_range *np; + + pthread_rwlock_rdlock(&mmio_rwlock); + RB_FOREACH(np, mmio_rb_tree, rbt) { + printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, + np->mr_param.name); + } + pthread_rwlock_unlock(&mmio_rwlock); +} +#endif + +RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, 
mmio_rb_range_compare);
+
+/*
+ * Read callback handed to vmm_emulate_instruction(): forwards the access
+ * to the handler of the mem_range passed as 'arg', tagged MEM_F_READ.
+ * Returns whatever the handler returns.
+ */
+static int
+mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
+{
+	struct mem_range *mr = arg;
+
+	return ((*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
+	    rval, mr->arg1, mr->arg2));
+}
+
+/*
+ * Write counterpart of mem_read(): the value is handed to the handler by
+ * address, tagged MEM_F_WRITE.
+ */
+static int
+mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
+{
+	struct mem_range *mr = arg;
+
+	return ((*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
+	    &wval, mr->arg1, mr->arg2));
+}
+
+/*
+ * Emulate the guest instruction that accessed guest-physical address
+ * 'paddr'.  The owning range is searched for in the per-vCPU hint, then
+ * in the main tree, then in the fallback tree.
+ *
+ * Returns ESRCH if no registered range contains 'paddr', otherwise the
+ * result of vmm_emulate_instruction().
+ */
+int
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
+    struct vm_guest_paging *paging)
+{
+	struct mmio_rb_range *entry;
+	int err, immutable;
+
+	pthread_rwlock_rdlock(&mmio_rwlock);
+	/*
+	 * First check the per-vCPU cache
+	 */
+	if (mmio_hint[vcpu] &&
+	    paddr >= mmio_hint[vcpu]->mr_base &&
+	    paddr <= mmio_hint[vcpu]->mr_end) {
+		entry = mmio_hint[vcpu];
+	} else
+		entry = NULL;
+
+	if (entry == NULL) {
+		if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) {
+			/* Update the per-vCPU cache */
+			mmio_hint[vcpu] = entry;
+		} else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
+			pthread_rwlock_unlock(&mmio_rwlock);
+			return (ESRCH);
+		}
+	}
+
+	assert(entry != NULL);
+
+	/*
+	 * An 'immutable' memory range is guaranteed to be never removed
+	 * so there is no need to hold 'mmio_rwlock' while calling the
+	 * handler.
+	 *
+	 * XXX writes to the PCIR_COMMAND register can cause register_mem()
+	 * to be called. If the guest is using PCI extended config space
+	 * to modify the PCIR_COMMAND register then register_mem() can
+	 * deadlock on 'mmio_rwlock'. However by registering the extended
+	 * config space window as 'immutable' the deadlock can be avoided.
+	 */
+	immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
+	if (immutable)
+		pthread_rwlock_unlock(&mmio_rwlock);
+
+	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
+				      mem_read, mem_write, &entry->mr_param);
+
+	if (!immutable)
+		pthread_rwlock_unlock(&mmio_rwlock);
+
+	return (err);
+}
+
+/*
+ * Copy '*memp' into a freshly allocated node and insert it into 'rbt'.
+ *
+ * Returns ENOMEM if the node cannot be allocated, or the result of
+ * mmio_rb_add() when an insert is attempted.  If a range in 'rbt'
+ * already contains memp->base nothing is inserted and 0 is returned;
+ * that return value is kept as-is for existing callers.
+ */
+static int
+register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
+{
+	struct mmio_rb_range *entry, *mrp;
+	int err, inserted;
+
+	mrp = malloc(sizeof(*mrp));
+	if (mrp == NULL)
+		return (ENOMEM);
+
+	mrp->mr_param = *memp;
+	mrp->mr_base = memp->base;
+	mrp->mr_end = memp->base + memp->size - 1;
+
+	err = 0;
+	inserted = 0;
+	pthread_rwlock_wrlock(&mmio_rwlock);
+	if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) {
+		err = mmio_rb_add(rbt, mrp);
+		inserted = (err == 0);
+	}
+	pthread_rwlock_unlock(&mmio_rwlock);
+
+	/*
+	 * The tree owns 'mrp' only once it has been linked in.  Release it
+	 * on every other path, including the "base already registered"
+	 * case, which previously leaked the node.
+	 */
+	if (!inserted)
+		free(mrp);
+
+	return (err);
+}
+
+/* Register '*memp' in the main MMIO tree; see register_mem_int(). */
+int
+register_mem(struct mem_range *memp)
+{
+
+	return (register_mem_int(&mmio_rb_root, memp));
+}
+
+/* Register '*memp' in the fallback MMIO tree; see register_mem_int(). */
+int
+register_mem_fallback(struct mem_range *memp)
+{
+
+	return (register_mem_int(&mmio_rb_fallback, memp));
+}
+
+/*
+ * Remove the range containing memp->base from the main tree and drop any
+ * per-vCPU hint that points at it.  The stored range must match '*memp'
+ * (name, base, size) and must not be MEM_F_IMMUTABLE; both are asserted.
+ *
+ * Returns 0 on success or the lookup error (ENOENT) if nothing is
+ * registered at memp->base.
+ */
+int
+unregister_mem(struct mem_range *memp)
+{
+	struct mem_range *mr;
+	struct mmio_rb_range *entry = NULL;
+	int err, i;
+
+	pthread_rwlock_wrlock(&mmio_rwlock);
+	err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry);
+	if (err == 0) {
+		mr = &entry->mr_param;
+		assert(mr->name == memp->name);
+		assert(mr->base == memp->base && mr->size == memp->size);
+		assert((mr->flags & MEM_F_IMMUTABLE) == 0);
+		RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
+
+		/* flush Per-vCPU cache */
+		for (i = 0; i < VM_MAXCPU; i++) {
+			if (mmio_hint[i] == entry)
+				mmio_hint[i] = NULL;
+		}
+	}
+	pthread_rwlock_unlock(&mmio_rwlock);
+
+	/* 'entry' is still NULL when the lookup failed; free(NULL) is a no-op. */
+	free(entry);
+
+	return (err);
+}
+
+/* Initialise both MMIO trees and the lock that protects them. */
+void
+init_mem(void)
+{
+
+	RB_INIT(&mmio_rb_root);
+	RB_INIT(&mmio_rb_fallback);
+	pthread_rwlock_init(&mmio_rwlock, NULL);
+}
diff --git a/bhyve/mem.h b/bhyve/mem.h
new file mode 100644
index 0000000..f671eae
--- /dev/null
+++ b/bhyve/mem.h
@@ -0,0
+1,61 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _MEM_H_ +#define _MEM_H_ + +#include + +struct vmctx; + +typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2); + +struct mem_range { + const char *name; + int flags; + mem_func_t handler; + void *arg1; + long arg2; + uint64_t base; + uint64_t size; +}; +#define MEM_F_READ 0x1 +#define MEM_F_WRITE 0x2 +#define MEM_F_RW 0x3 +#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ + +void init_mem(void); +int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging); + +int register_mem(struct mem_range *memp); +int register_mem_fallback(struct mem_range *memp); +int unregister_mem(struct mem_range *memp); + +#endif /* _MEM_H_ */ diff --git a/bhyve/mevent.c b/bhyve/mevent.c new file mode 100644 index 0000000..07d3baf --- /dev/null +++ b/bhyve/mevent.c @@ -0,0 +1,456 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include "mevent.h" + +#define MEVENT_MAX 64 + +#define MEV_ADD 1 +#define MEV_ENABLE 2 +#define MEV_DISABLE 3 +#define MEV_DEL_PENDING 4 + +extern char *vmname; + +static pthread_t mevent_tid; +static int mevent_timid = 43; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + void (*me_func)(int, enum ev_type, void *); +#define me_msecs me_fd + int me_fd; + int me_timid; + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; + LIST_ENTRY(mevent) me_list; +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type, void *param) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. 
+	 */
+	do {
+		status = read(fd, buf, sizeof(buf));
+	} while (status == MEVENT_MAX);
+}
+
+/*
+ * Wake the i/o thread out of its blocking kevent() call so that it picks
+ * up a modified change list.  Does nothing when called on the i/o thread
+ * itself or while mevent_pipefd[1] is still 0.
+ */
+static void
+mevent_notify(void)
+{
+	char c;
+
+	/*
+	 * If calling from outside the i/o thread, write a byte on the
+	 * pipe to force the i/o thread to exit the blocking kevent call.
+	 * The reader discards the data, but give the byte a defined value
+	 * instead of writing an uninitialized one.
+	 */
+	c = '\0';
+	if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
+		/* Best effort: a failed wakeup is deliberately ignored. */
+		(void)write(mevent_pipefd[1], &c, 1);
+	}
+}
+
+/*
+ * Map the event's ev_type to the matching kqueue filter.  Returns 0 for
+ * a type that has no mapping.
+ */
+static int
+mevent_kq_filter(struct mevent *mevp)
+{
+	switch (mevp->me_type) {
+	case EVF_READ:
+		return (EVFILT_READ);
+	case EVF_WRITE:
+		return (EVFILT_WRITE);
+	case EVF_TIMER:
+		return (EVFILT_TIMER);
+	case EVF_SIGNAL:
+		return (EVFILT_SIGNAL);
+	default:
+		return (0);
+	}
+}
+
+/*
+ * Map the event's pending state (MEV_*) to the kevent flag that carries
+ * it out.  An unknown state trips the assertion.
+ */
+static int
+mevent_kq_flags(struct mevent *mevp)
+{
+	int ret;
+
+	switch (mevp->me_state) {
+	case MEV_ADD:
+		ret = EV_ADD;		/* implicitly enabled */
+		break;
+	case MEV_ENABLE:
+		ret = EV_ENABLE;
+		break;
+	case MEV_DISABLE:
+		ret = EV_DISABLE;
+		break;
+	case MEV_DEL_PENDING:
+		ret = EV_DELETE;
+		break;
+	default:
+		assert(0);
+		/* Keep the result defined when assert() is compiled out. */
+		ret = 0;
+		break;
+	}
+
+	return (ret);
+}
+
+static int
+mevent_kq_fflags(struct mevent *mevp)
+{
+	/* XXX nothing yet, perhaps EV_EOF for reads ?
*/
+	return (0);
+}
+
+/*
+ * Drain the change list into 'kev' (room for MEVENT_MAX entries) and
+ * return the number of entries filled in.  Events flagged me_closefd
+ * have their descriptor closed instead of producing a kevent.  Every
+ * processed event is freed if it is MEV_DEL_PENDING, otherwise moved to
+ * the global list.  'mfd' is not used here.
+ */
+static int
+mevent_build(int mfd, struct kevent *kev)
+{
+	struct mevent *mevp, *tmpp;
+	int i;
+
+	i = 0;
+
+	mevent_qlock();
+
+	LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
+		if (mevp->me_closefd) {
+			/*
+			 * A close of the file descriptor will remove the
+			 * event
+			 */
+			close(mevp->me_fd);
+		} else {
+			/*
+			 * Check for room before storing.  The check used to
+			 * run after the increment, which rejected a batch
+			 * that exactly filled the array.
+			 */
+			assert(i < MEVENT_MAX);
+
+			if (mevp->me_type == EVF_TIMER) {
+				kev[i].ident = mevp->me_timid;
+				kev[i].data = mevp->me_msecs;
+			} else {
+				kev[i].ident = mevp->me_fd;
+				kev[i].data = 0;
+			}
+			kev[i].filter = mevent_kq_filter(mevp);
+			kev[i].flags = mevent_kq_flags(mevp);
+			kev[i].fflags = mevent_kq_fflags(mevp);
+			kev[i].udata = mevp;
+			i++;
+		}
+
+		mevp->me_cq = 0;
+		LIST_REMOVE(mevp, me_list);
+
+		if (mevp->me_state == MEV_DEL_PENDING) {
+			free(mevp);
+		} else {
+			LIST_INSERT_HEAD(&global_head, mevp, me_list);
+		}
+	}
+
+	mevent_qunlock();
+
+	return (i);
+}
+
+/*
+ * Invoke the callback of each of the 'numev' events in 'kev'; the mevent
+ * is recovered from the kevent's udata.
+ */
+static void
+mevent_handle(struct kevent *kev, int numev)
+{
+	struct mevent *mevp;
+	int i;
+
+	for (i = 0; i < numev; i++) {
+		mevp = kev[i].udata;
+
+		/* XXX check for EV_ERROR ? */
+
+		(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
+	}
+}
+
+/*
+ * Queue a new event: 'func' is called with 'param' when 'type' fires on
+ * 'tfd'.  For EVF_TIMER, 'tfd' is stored as the timer value (me_msecs)
+ * rather than as a descriptor.
+ *
+ * Returns the new event, or NULL if 'tfd' is negative, 'func' is NULL,
+ * the same fd/type pair is already registered, or allocation fails.
+ */
+struct mevent *
+mevent_add(int tfd, enum ev_type type,
+	   void (*func)(int, enum ev_type, void *), void *param)
+{
+	struct mevent *lp, *mevp;
+
+	if (tfd < 0 || func == NULL) {
+		return (NULL);
+	}
+
+	mevp = NULL;
+
+	mevent_qlock();
+
+	/*
+	 * Verify that the fd/type tuple is not present in any list
+	 */
+	LIST_FOREACH(lp, &global_head, me_list) {
+		if (type != EVF_TIMER && lp->me_fd == tfd &&
+		    lp->me_type == type) {
+			goto exit;
+		}
+	}
+
+	LIST_FOREACH(lp, &change_head, me_list) {
+		if (type != EVF_TIMER && lp->me_fd == tfd &&
+		    lp->me_type == type) {
+			goto exit;
+		}
+	}
+
+	/*
+	 * Allocate an entry, populate it, and add it to the change list.
+	 */
+	mevp = calloc(1, sizeof(struct mevent));
+	if (mevp == NULL) {
+		goto exit;
+	}
+
+	if (type == EVF_TIMER) {
+		mevp->me_msecs = tfd;
+		mevp->me_timid = mevent_timid++;
+	} else
+		mevp->me_fd = tfd;
+	mevp->me_type = type;
+	mevp->me_func = func;
+	mevp->me_param = param;
+
+	LIST_INSERT_HEAD(&change_head, mevp, me_list);
+	mevp->me_cq = 1;
+	mevp->me_state = MEV_ADD;
+	mevent_notify();
+
+exit:
+	mevent_qunlock();
+
+	return (mevp);
+}
+
+/*
+ * Move 'evp' to 'newstate' (MEV_ENABLE or MEV_DISABLE) and queue it on
+ * the change list so the i/o thread applies it.
+ *
+ * Returns EINVAL if the event is already marked for deletion, 0
+ * otherwise (including when the state is unchanged).
+ */
+static int
+mevent_update(struct mevent *evp, int newstate)
+{
+	int error;
+
+	error = 0;
+
+	/*
+	 * Examine the state under the queue lock: me_state is written by
+	 * the other list mutators with the lock held, so testing it
+	 * beforehand was a check-then-act race.
+	 */
+	mevent_qlock();
+
+	if (evp->me_state == MEV_DEL_PENDING) {
+		/*
+		 * It's not possible to enable/disable a deleted event
+		 */
+		error = EINVAL;
+	} else if (evp->me_state != newstate) {
+		evp->me_state = newstate;
+
+		/*
+		 * Place the entry onto the changed list if not already there.
+		 */
+		if (evp->me_cq == 0) {
+			evp->me_cq = 1;
+			LIST_REMOVE(evp, me_list);
+			LIST_INSERT_HEAD(&change_head, evp, me_list);
+			mevent_notify();
+		}
+	}
+
+	mevent_qunlock();
+
+	return (error);
+}
+
+/* Re-enable delivery for 'evp'; returns 0 or EINVAL, see mevent_update(). */
+int
+mevent_enable(struct mevent *evp)
+{
+
+	return (mevent_update(evp, MEV_ENABLE));
+}
+
+/* Suspend delivery for 'evp'; returns 0 or EINVAL, see mevent_update(). */
+int
+mevent_disable(struct mevent *evp)
+{
+
+	return (mevent_update(evp, MEV_DISABLE));
+}
+
+static int
+mevent_delete_event(struct mevent *evp, int closefd)
+{
+	mevent_qlock();
+
+	/*
+	 * Place the entry onto the changed list if not already there, and
+	 * mark as to be deleted.
+	 */
+	if (evp->me_cq == 0) {
+		evp->me_cq = 1;
+		LIST_REMOVE(evp, me_list);
+		LIST_INSERT_HEAD(&change_head, evp, me_list);
+		mevent_notify();
+	}
+	evp->me_state = MEV_DEL_PENDING;
+
+	if (closefd)
+		evp->me_closefd = 1;
+
+	mevent_qunlock();
+
+	return (0);
+}
+
+/* Schedule 'evp' for removal, leaving its descriptor open.  Returns 0. */
+int
+mevent_delete(struct mevent *evp)
+{
+
+	return (mevent_delete_event(evp, 0));
+}
+
+/*
+ * Schedule 'evp' for removal and flag its descriptor to be closed when
+ * the change list is processed.  Returns 0.
+ */
+int
+mevent_delete_close(struct mevent *evp)
+{
+
+	return (mevent_delete_event(evp, 1));
+}
+
+/* Name the i/o thread "mevent". */
+static void
+mevent_set_name(void)
+{
+
+	pthread_set_name_np(mevent_tid, "mevent");
+}
+
+/*
+ * Main loop of the i/o thread: creates the kqueue and the wakeup pipe,
+ * then loops forever applying queued changes, waiting for events and
+ * running their callbacks.  Exits the process if the pipe cannot be
+ * created.
+ */
+void
+mevent_dispatch(void)
+{
+	struct kevent changelist[MEVENT_MAX];
+	struct kevent eventlist[MEVENT_MAX];
+	struct mevent *pipev;
+	int mfd;
+	int numev;
+	int ret;
+
+	mevent_tid = pthread_self();
+	mevent_set_name();
+
+	/* kqueue() fails with -1; descriptor 0 is a valid result. */
+	mfd = kqueue();
+	assert(mfd >= 0);
+
+	/*
+	 * Open the pipe that will be used for other threads to force
+	 * the blocking kqueue call to exit by writing to it.
+	 *
+	 * NOTE(review): nothing here switches the descriptors to
+	 * non-blocking mode, although the drain loop in
+	 * mevent_pipe_read() states that it relies on that -- confirm
+	 * and set O_NONBLOCK if so.
+	 */
+	ret = pipe(mevent_pipefd);
+	if (ret < 0) {
+		perror("pipe");
+		/* Fatal setup failure: do not report success. */
+		exit(1);
+	}
+
+	/*
+	 * Add internal event handler for the pipe read fd
+	 */
+	pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
+	assert(pipev != NULL);
+
+	for (;;) {
+		/*
+		 * Build changelist if required.
+		 * XXX the changelist can be put into the blocking call
+		 * to eliminate the extra syscall. Currently better for
+		 * debug.
+ */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + } +} diff --git a/bhyve/mevent.h b/bhyve/mevent.h new file mode 100644 index 0000000..d6a59c6 --- /dev/null +++ b/bhyve/mevent.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE, + EVF_TIMER, + EVF_SIGNAL +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), + void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/bhyve/mevent_test.c b/bhyve/mevent_test.c new file mode 100644 index 0000000..9c68ff7 --- /dev/null +++ b/bhyve/mevent_test.c @@ -0,0 +1,256 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. + * + * cc mevent_test.c mevent.c -lpthread + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +static struct mevent *tevp; + +char *vmname = "test vm"; + + +#define MEVENT_ECHO + +/* Number of timer events to capture */ +#define TEVSZ 4096 +uint64_t tevbuf[TEVSZ]; + +static void +timer_print(void) +{ + uint64_t min, max, diff, sum, tsc_freq; + size_t len; + int j; + + min = UINT64_MAX; + max = 0; + sum = 0; + + len = sizeof(tsc_freq); + sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); + + for (j = 1; j < TEVSZ; j++) { + /* Convert a tsc diff into microseconds */ + diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; + sum += diff; + if (min > diff) + min = diff; + if (max < diff) + max = diff; + } + + printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, + sum/(TEVSZ - 1)); +} + +static void +timer_callback(int fd, enum ev_type type, void *param) +{ + static int i; + + if (i >= TEVSZ) + abort(); + + tevbuf[i++] = rdtsc(); + + if (i == TEVSZ) { + mevent_delete(tevp); + timer_print(); + } +} + + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + 
pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(1); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); + + return (NULL); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } + + return (NULL); +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + static int first; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(1); + } + + if (listen(s, 1) < 0) { + perror("listen"); + exit(1); + } + + (void) mevent_add(s, EVF_READ, 
acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + static int first = 1; + + if (first) { + /* + * Start a timer + */ + first = 0; + tevp = mevent_add(1, EVF_TIMER, timer_callback, + NULL); + } + + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } + + return (NULL); +} + +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); +} diff --git a/bhyve/mptbl.c b/bhyve/mptbl.c new file mode 100644 index 0000000..904d103 --- /dev/null +++ b/bhyve/mptbl.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include "acpi.h" +#include "bhyverun.h" +#include "mptbl.h" +#include "pci_emul.h" + +#define MPTABLE_BASE 0xF0000 + +/* floating pointer length + maximum length of configuration table */ +#define MPTABLE_MAX_LENGTH (65536 + 16) + +#define LAPIC_PADDR 0xFEE00000 +#define LAPIC_VERSION 16 + +#define IOAPIC_PADDR 0xFEC00000 +#define IOAPIC_VERSION 0x11 + +#define MP_SPECREV 4 +#define MPFP_SIG "_MP_" + +/* Configuration header defines */ +#define MPCH_SIG "PCMP" +#define MPCH_OEMID "BHyVe " +#define MPCH_OEMID_LEN 8 +#define MPCH_PRODID "Hypervisor " +#define MPCH_PRODID_LEN 12 + +/* Processor entry defines */ +#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */ +#define MPEP_SIG_MODEL 26 +#define MPEP_SIG_STEPPING 5 +#define MPEP_SIG \ + ((MPEP_SIG_FAMILY << 8) | \ + (MPEP_SIG_MODEL << 4) | \ + (MPEP_SIG_STEPPING)) + +#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */ + +/* Number of local intr entries */ +#define MPEII_NUM_LOCAL_IRQ 2 + +/* Bus entry defines */ +#define MPE_NUM_BUSES 2 +#define MPE_BUSNAME_LEN 6 +#define MPE_BUSNAME_ISA "ISA " +#define MPE_BUSNAME_PCI "PCI " + +static void *oem_tbl_start; +static int oem_tbl_size; + +static uint8_t +mpt_compute_checksum(void *base, size_t len) +{ + uint8_t *bytes; + uint8_t sum; + + for(bytes = base, sum = 0; len > 0; len--) { + sum += *bytes++; + } + + return (256 
- sum); +} + +static void +mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa) +{ + + memset(mpfp, 0, sizeof(*mpfp)); + memcpy(mpfp->signature, MPFP_SIG, 4); + mpfp->pap = gpa + sizeof(*mpfp); + mpfp->length = 1; + mpfp->spec_rev = MP_SPECREV; + mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp)); +} + +static void +mpt_build_mpch(mpcth_t mpch) +{ + + memset(mpch, 0, sizeof(*mpch)); + memcpy(mpch->signature, MPCH_SIG, 4); + mpch->spec_rev = MP_SPECREV; + memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN); + memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN); + mpch->apic_address = LAPIC_PADDR; +} + +static void +mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu) +{ + int i; + + for (i = 0; i < ncpu; i++) { + memset(mpep, 0, sizeof(*mpep)); + mpep->type = MPCT_ENTRY_PROCESSOR; + mpep->apic_id = i; // XXX + mpep->apic_version = LAPIC_VERSION; + mpep->cpu_flags = PROCENTRY_FLAG_EN; + if (i == 0) + mpep->cpu_flags |= PROCENTRY_FLAG_BP; + mpep->cpu_signature = MPEP_SIG; + mpep->feature_flags = MPEP_FEATURES; + mpep++; + } +} + +static void +mpt_build_localint_entries(int_entry_ptr mpie) +{ + + /* Hardcode LINT0 as ExtINT on all CPUs. */ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_EXTINT; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 0; + mpie++; + + /* Hardcode LINT1 as NMI on all CPUs. 
*/ + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_LOCAL_INT; + mpie->int_type = INTENTRY_TYPE_NMI; + mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | + INTENTRY_FLAGS_TRIGGER_CONFORM; + mpie->dst_apic_id = 0xff; + mpie->dst_apic_int = 1; +} + +static void +mpt_build_bus_entries(bus_entry_ptr mpeb) +{ + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 0; + memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN); + mpeb++; + + memset(mpeb, 0, sizeof(*mpeb)); + mpeb->type = MPCT_ENTRY_BUS; + mpeb->bus_id = 1; + memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN); +} + +static void +mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id) +{ + + memset(mpei, 0, sizeof(*mpei)); + mpei->type = MPCT_ENTRY_IOAPIC; + mpei->apic_id = id; + mpei->apic_version = IOAPIC_VERSION; + mpei->apic_flags = IOAPICENTRY_FLAG_EN; + mpei->apic_address = IOAPIC_PADDR; +} + +static int +mpt_count_ioint_entries(void) +{ + int bus, count; + + count = 0; + for (bus = 0; bus <= PCI_BUSMAX; bus++) + count += pci_count_lintr(bus); + + /* + * Always include entries for the first 16 pins along with a entry + * for each active PCI INTx pin. + */ + return (16 + count); +} + +static void +mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + int_entry_ptr *mpiep, mpie; + + mpiep = arg; + mpie = *mpiep; + memset(mpie, 0, sizeof(*mpie)); + + /* + * This is always after another I/O interrupt entry, so cheat + * and fetch the I/O APIC ID from the prior entry. 
+ */ + mpie->type = MPCT_ENTRY_INT; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_id = bus; + mpie->src_bus_irq = slot << 2 | (pin - 1); + mpie->dst_apic_id = mpie[-1].dst_apic_id; + mpie->dst_apic_int = ioapic_irq; + + *mpiep = mpie + 1; +} + +static void +mpt_build_ioint_entries(int_entry_ptr mpie, int id) +{ + int pin, bus; + + /* + * The following config is taken from kernel mptable.c + * mptable_parse_default_config_ints(...), for now + * just use the default config, tweek later if needed. + */ + + /* First, generate the first 16 pins. */ + for (pin = 0; pin < 16; pin++) { + memset(mpie, 0, sizeof(*mpie)); + mpie->type = MPCT_ENTRY_INT; + mpie->src_bus_id = 1; + mpie->dst_apic_id = id; + + /* + * All default configs route IRQs from bus 0 to the first 16 + * pins of the first I/O APIC with an APIC ID of 2. + */ + mpie->dst_apic_int = pin; + switch (pin) { + case 0: + /* Pin 0 is an ExtINT pin. */ + mpie->int_type = INTENTRY_TYPE_EXTINT; + break; + case 2: + /* IRQ 0 is routed to pin 2. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = 0; + break; + case SCI_INT: + /* ACPI SCI is level triggered and active-lo. */ + mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO | + INTENTRY_FLAGS_TRIGGER_LEVEL; + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = SCI_INT; + break; + default: + /* All other pins are identity mapped. */ + mpie->int_type = INTENTRY_TYPE_INT; + mpie->src_bus_irq = pin; + break; + } + mpie++; + } + + /* Next, generate entries for any PCI INTx interrupts. 
*/ + for (bus = 0; bus <= PCI_BUSMAX; bus++) + pci_walk_lintr(bus, mpt_generate_pci_int, &mpie); +} + +void +mptable_add_oemtbl(void *tbl, int tblsz) +{ + + oem_tbl_start = tbl; + oem_tbl_size = tblsz; +} + +int +mptable_build(struct vmctx *ctx, int ncpu) +{ + mpcth_t mpch; + bus_entry_ptr mpeb; + io_apic_entry_ptr mpei; + proc_entry_ptr mpep; + mpfps_t mpfp; + int_entry_ptr mpie; + int ioints, bus; + char *curraddr; + char *startaddr; + + startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "mptable requires mapped mem\n"); + return (ENOMEM); + } + + /* + * There is no way to advertise multiple PCI hierarchies via MPtable + * so require that there is no PCI hierarchy with a non-zero bus + * number. + */ + for (bus = 1; bus <= PCI_BUSMAX; bus++) { + if (pci_bus_configured(bus)) { + fprintf(stderr, "MPtable is incompatible with " + "multiple PCI hierarchies.\r\n"); + fprintf(stderr, "MPtable generation can be disabled " + "by passing the -Y option to bhyve(8).\r\n"); + return (EINVAL); + } + } + + curraddr = startaddr; + mpfp = (mpfps_t)curraddr; + mpt_build_mpfp(mpfp, MPTABLE_BASE); + curraddr += sizeof(*mpfp); + + mpch = (mpcth_t)curraddr; + mpt_build_mpch(mpch); + curraddr += sizeof(*mpch); + + mpep = (proc_entry_ptr)curraddr; + mpt_build_proc_entries(mpep, ncpu); + curraddr += sizeof(*mpep) * ncpu; + mpch->entry_count += ncpu; + + mpeb = (bus_entry_ptr) curraddr; + mpt_build_bus_entries(mpeb); + curraddr += sizeof(*mpeb) * MPE_NUM_BUSES; + mpch->entry_count += MPE_NUM_BUSES; + + mpei = (io_apic_entry_ptr)curraddr; + mpt_build_ioapic_entries(mpei, 0); + curraddr += sizeof(*mpei); + mpch->entry_count++; + + mpie = (int_entry_ptr) curraddr; + ioints = mpt_count_ioint_entries(); + mpt_build_ioint_entries(mpie, 0); + curraddr += sizeof(*mpie) * ioints; + mpch->entry_count += ioints; + + mpie = (int_entry_ptr)curraddr; + mpt_build_localint_entries(mpie); + curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ; + 
mpch->entry_count += MPEII_NUM_LOCAL_IRQ; + + if (oem_tbl_start) { + mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE; + mpch->oem_table_size = oem_tbl_size; + memcpy(curraddr, oem_tbl_start, oem_tbl_size); + } + + mpch->base_table_length = curraddr - (char *)mpch; + mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length); + + return (0); +} diff --git a/bhyve/mptbl.h b/bhyve/mptbl.h new file mode 100644 index 0000000..e9e1c42 --- /dev/null +++ b/bhyve/mptbl.h @@ -0,0 +1,35 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _MPTBL_H_ +#define _MPTBL_H_ + +int mptable_build(struct vmctx *ctx, int ncpu); +void mptable_add_oemtbl(void *tbl, int tblsz); + +#endif /* _MPTBL_H_ */ diff --git a/bhyve/pci_ahci.c b/bhyve/pci_ahci.c new file mode 100644 index 0000000..35a0859 --- /dev/null +++ b/bhyve/pci_ahci.c @@ -0,0 +1,2346 @@ +/*- + * Copyright (c) 2013 Zhixiang Yu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "ahci.h" +#include "block_if.h" + +#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ + +#define PxSIG_ATA 0x00000101 /* ATA drive */ +#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ + +enum sata_fis_type { + FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ + FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ + FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ + FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ + FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ + FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ + FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ + FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ +}; + +/* + * SCSI opcodes + */ +#define TEST_UNIT_READY 0x00 +#define REQUEST_SENSE 0x03 +#define INQUIRY 0x12 +#define START_STOP_UNIT 0x1B +#define PREVENT_ALLOW 0x1E +#define READ_CAPACITY 0x25 +#define READ_10 0x28 +#define POSITION_TO_ELEMENT 0x2B +#define READ_TOC 0x43 +#define GET_EVENT_STATUS_NOTIFICATION 0x4A +#define MODE_SENSE_10 0x5A +#define REPORT_LUNS 0xA0 +#define READ_12 0xA8 +#define READ_CD 0xBE + +/* + * SCSI mode page codes + */ +#define MODEPAGE_RW_ERROR_RECOVERY 0x01 +#define MODEPAGE_CD_CAPABILITIES 0x2A + +/* + * ATA commands + */ +#define ATA_SF_ENAB_SATA_SF 0x10 +#define ATA_SATA_SF_AN 0x05 +#define ATA_SF_DIS_SATA_SF 0x90 + +/* + * Debug printf + */ +#ifdef AHCI_DEBUG +static FILE *dbg; +#define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) +#else +#define DPRINTF(format, arg...) +#endif +#define WPRINTF(format, arg...) 
printf(format, ##arg) + +struct ahci_ioreq { + struct blockif_req io_req; + struct ahci_port *io_pr; + STAILQ_ENTRY(ahci_ioreq) io_flist; + TAILQ_ENTRY(ahci_ioreq) io_blist; + uint8_t *cfis; + uint32_t len; + uint32_t done; + int slot; + int more; +}; + +struct ahci_port { + struct blockif_ctxt *bctx; + struct pci_ahci_softc *pr_sc; + uint8_t *cmd_lst; + uint8_t *rfis; + char ident[20 + 1]; + int atapi; + int reset; + int waitforclear; + int mult_sectors; + uint8_t xfermode; + uint8_t err_cfis[20]; + uint8_t sense_key; + uint8_t asc; + u_int ccs; + uint32_t pending; + + uint32_t clb; + uint32_t clbu; + uint32_t fb; + uint32_t fbu; + uint32_t is; + uint32_t ie; + uint32_t cmd; + uint32_t unused0; + uint32_t tfd; + uint32_t sig; + uint32_t ssts; + uint32_t sctl; + uint32_t serr; + uint32_t sact; + uint32_t ci; + uint32_t sntf; + uint32_t fbs; + + /* + * i/o request info + */ + struct ahci_ioreq *ioreq; + int ioqsz; + STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; + TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; +}; + +struct ahci_cmd_hdr { + uint16_t flags; + uint16_t prdtl; + uint32_t prdbc; + uint64_t ctba; + uint32_t reserved[4]; +}; + +struct ahci_prdt_entry { + uint64_t dba; + uint32_t reserved; +#define DBCMASK 0x3fffff + uint32_t dbc; +}; + +struct pci_ahci_softc { + struct pci_devinst *asc_pi; + pthread_mutex_t mtx; + int ports; + uint32_t cap; + uint32_t ghc; + uint32_t is; + uint32_t pi; + uint32_t vs; + uint32_t ccc_ctl; + uint32_t ccc_pts; + uint32_t em_loc; + uint32_t em_ctl; + uint32_t cap2; + uint32_t bohc; + uint32_t lintr; + struct ahci_port port[MAX_PORTS]; +}; +#define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) + +static void ahci_handle_port(struct ahci_port *p); + +static inline void lba_to_msf(uint8_t *buf, int lba) +{ + lba += 150; + buf[0] = (lba / 75) / 60; + buf[1] = (lba / 75) % 60; + buf[2] = lba % 75; +} + +/* + * generate HBA intr depending on whether or not ports within + * the controller have an interrupt pending. 
+ */ +static void +ahci_generate_intr(struct pci_ahci_softc *sc) +{ + struct pci_devinst *pi; + int i; + + pi = sc->asc_pi; + + for (i = 0; i < sc->ports; i++) { + struct ahci_port *pr; + pr = &sc->port[i]; + if (pr->is & pr->ie) + sc->is |= (1 << i); + } + + DPRINTF("%s %x\n", __func__, sc->is); + + if (sc->is && (sc->ghc & AHCI_GHC_IE)) { + if (pci_msi_enabled(pi)) { + /* + * Generate an MSI interrupt on every edge + */ + pci_generate_msi(pi, 0); + } else if (!sc->lintr) { + /* + * Only generate a pin-based interrupt if one wasn't + * in progress + */ + sc->lintr = 1; + pci_lintr_assert(pi); + } + } else if (sc->lintr) { + /* + * No interrupts: deassert pin-based signal if it had + * been asserted + */ + pci_lintr_deassert(pi); + sc->lintr = 0; + } +} + +static void +ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) +{ + int offset, len, irq; + + if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) + return; + + switch (ft) { + case FIS_TYPE_REGD2H: + offset = 0x40; + len = 20; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; + break; + case FIS_TYPE_SETDEVBITS: + offset = 0x58; + len = 8; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; + break; + case FIS_TYPE_PIOSETUP: + offset = 0x20; + len = 20; + irq = (fis[1] & (1 << 6)) ? 
AHCI_P_IX_PS : 0; + break; + default: + WPRINTF("unsupported fis type %d\n", ft); + return; + } + if (fis[2] & ATA_S_ERROR) { + p->waitforclear = 1; + irq |= AHCI_P_IX_TFE; + } + memcpy(p->rfis + offset, fis, len); + if (irq) { + p->is |= irq; + ahci_generate_intr(p->pr_sc); + } +} + +static void +ahci_write_fis_piosetup(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_PIOSETUP; + ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); +} + +static void +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[8]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + tfd &= 0x77; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_SETDEVBITS; + fis[1] = (1 << 6); + fis[2] = tfd; + fis[3] = error; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = slot; + p->err_cfis[2] = tfd; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else { + *(uint32_t *)(fis + 4) = (1 << slot); + p->sact &= ~(1 << slot); + } + p->tfd &= ~0x77; + p->tfd |= tfd; + ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); +} + +static void +ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) +{ + uint8_t fis[20]; + uint8_t error; + + error = (tfd >> 8) & 0xff; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = (1 << 6); + fis[2] = tfd & 0xff; + fis[3] = error; + fis[4] = cfis[4]; + fis[5] = cfis[5]; + fis[6] = cfis[6]; + fis[7] = cfis[7]; + fis[8] = cfis[8]; + fis[9] = cfis[9]; + fis[10] = cfis[10]; + fis[11] = cfis[11]; + fis[12] = cfis[12]; + fis[13] = cfis[13]; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = 0x80; + p->err_cfis[2] = tfd & 0xff; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else + p->ci &= ~(1 << slot); + p->tfd = tfd; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) +{ + uint8_t fis[20]; + + p->tfd = ATA_S_READY | ATA_S_DSC; + memset(fis, 0, sizeof(fis)); + 
fis[0] = FIS_TYPE_REGD2H; + fis[1] = 0; /* No interrupt */ + fis[2] = p->tfd; /* Status */ + fis[3] = 0; /* No error */ + p->ci &= ~(1 << slot); + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_write_reset_fis_d2h(struct ahci_port *p) +{ + uint8_t fis[20]; + + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[3] = 1; + fis[4] = 1; + if (p->atapi) { + fis[5] = 0x14; + fis[6] = 0xeb; + } + fis[12] = 1; + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void +ahci_check_stopped(struct ahci_port *p) +{ + /* + * If we are no longer processing the command list and nothing + * is in-flight, clear the running bit, the current command + * slot, the command issue and active bits. + */ + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (p->pending == 0) { + p->ccs = 0; + p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); + p->ci = 0; + p->sact = 0; + p->waitforclear = 0; + } + } +} + +static void +ahci_port_stop(struct ahci_port *p) +{ + struct ahci_ioreq *aior; + uint8_t *cfis; + int slot; + int ncq; + int error; + + assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); + + TAILQ_FOREACH(aior, &p->iobhd, io_blist) { + /* + * Try to cancel the outstanding blockif request. + */ + error = blockif_cancel(p->bctx, &aior->io_req); + if (error != 0) + continue; + + slot = aior->slot; + cfis = aior->cfis; + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + ncq = 1; + + if (ncq) + p->sact &= ~(1 << slot); + else + p->ci &= ~(1 << slot); + + /* + * This command is now done. 
+ */ + p->pending &= ~(1 << slot); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + } + + ahci_check_stopped(p); +} + +static void +ahci_port_reset(struct ahci_port *pr) +{ + pr->serr = 0; + pr->sact = 0; + pr->xfermode = ATA_UDMA6; + pr->mult_sectors = 128; + + if (!pr->bctx) { + pr->ssts = ATA_SS_DET_NO_DEVICE; + pr->sig = 0xFFFFFFFF; + pr->tfd = 0x7F; + return; + } + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; + if (pr->sctl & ATA_SC_SPD_MASK) + pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); + else + pr->ssts |= ATA_SS_SPD_GEN3; + pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; + if (!pr->atapi) { + pr->sig = PxSIG_ATA; + pr->tfd |= ATA_S_READY; + } else + pr->sig = PxSIG_ATAPI; + ahci_write_reset_fis_d2h(pr); +} + +static void +ahci_reset(struct pci_ahci_softc *sc) +{ + int i; + + sc->ghc = AHCI_GHC_AE; + sc->is = 0; + + if (sc->lintr) { + pci_lintr_deassert(sc->asc_pi); + sc->lintr = 0; + } + + for (i = 0; i < sc->ports; i++) { + sc->port[i].ie = 0; + sc->port[i].is = 0; + sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); + if (sc->port[i].bctx) + sc->port[i].cmd |= AHCI_P_CMD_CPS; + sc->port[i].sctl = 0; + ahci_port_reset(&sc->port[i]); + } +} + +static void +ata_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i ^ 1] = *src++; + else + dest[i ^ 1] = ' '; + } +} + +static void +atapi_string(uint8_t *dest, const char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + if (*src) + dest[i] = *src++; + else + dest[i] = ' '; + } +} + +/* + * Build up the iovec based on the PRDT, 'done' and 'len'. 
+ */ +static void +ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, + struct ahci_prdt_entry *prdt, uint16_t prdtl) +{ + struct blockif_req *breq = &aior->io_req; + int i, j, skip, todo, left, extra; + uint32_t dbcsz; + + /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ + skip = aior->done; + left = aior->len - aior->done; + todo = 0; + for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; + i++, prdt++) { + dbcsz = (prdt->dbc & DBCMASK) + 1; + /* Skip already done part of the PRDT */ + if (dbcsz <= skip) { + skip -= dbcsz; + continue; + } + dbcsz -= skip; + if (dbcsz > left) + dbcsz = left; + breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), + prdt->dba + skip, dbcsz); + breq->br_iov[j].iov_len = dbcsz; + todo += dbcsz; + left -= dbcsz; + skip = 0; + j++; + } + + /* If we got limited by IOV length, round I/O down to sector size. */ + if (j == BLOCKIF_IOV_MAX) { + extra = todo % blockif_sectsz(p->bctx); + todo -= extra; + assert(todo > 0); + while (extra > 0) { + if (breq->br_iov[j - 1].iov_len > extra) { + breq->br_iov[j - 1].iov_len -= extra; + break; + } + extra -= breq->br_iov[j - 1].iov_len; + j--; + } + } + + breq->br_iovcnt = j; + breq->br_resid = todo; + aior->done += todo; + aior->more = (aior->done < aior->len && i < prdtl); +} + +static void +ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + struct ahci_prdt_entry *prdt; + struct ahci_cmd_hdr *hdr; + uint64_t lba; + uint32_t len; + int err, first, ncq, readop; + + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + ncq = 0; + readop = 1; + first = (done == 0); + + if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) + readop = 0; + + if (cfis[2] 
== ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[11] << 8 | cfis[3]; + if (!len) + len = 65536; + ncq = 1; + } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + lba = ((uint64_t)cfis[10] << 40) | + ((uint64_t)cfis[9] << 32) | + ((uint64_t)cfis[8] << 24) | + ((uint64_t)cfis[6] << 16) | + ((uint64_t)cfis[5] << 8) | + cfis[4]; + len = cfis[13] << 8 | cfis[12]; + if (!len) + len = 65536; + } else { + lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | + (cfis[5] << 8) | cfis[4]; + len = cfis[12]; + if (!len) + len = 256; + } + lba *= blockif_sectsz(p->bctx); + len *= blockif_sectsz(p->bctx); + + /* Pull request off free list */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + breq = &aior->io_req; + breq->br_offset = lba + done; + ahci_build_iov(p, aior, prdt, hdr->prdtl); + + /* Mark this command in-flight. */ + p->pending |= 1 << slot; + + /* Stuff request onto busy list. 
*/ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + if (readop) + err = blockif_read(p->bctx, breq); + else + err = blockif_write(p->bctx, breq); + assert(err == 0); +} + +static void +ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + int err; + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = 0; + aior->done = 0; + aior->more = 0; + breq = &aior->io_req; + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + err = blockif_flush(p->bctx, breq); + assert(err == 0); +} + +static inline void +read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *to; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + to = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = len < dbcsz ? 
len : dbcsz; + memcpy(to, ptr, sublen); + len -= sublen; + to += sublen; + prdt++; + } +} + +static void +ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + uint8_t *entry; + uint64_t elba; + uint32_t len, elen; + int err, first, ncq; + uint8_t buf[512]; + + first = (done == 0); + if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { + len = (uint16_t)cfis[13] << 8 | cfis[12]; + len *= 512; + ncq = 0; + } else { /* ATA_SEND_FPDMA_QUEUED */ + len = (uint16_t)cfis[11] << 8 | cfis[3]; + len *= 512; + ncq = 1; + } + read_prdt(p, slot, cfis, buf, sizeof(buf)); + +next: + entry = &buf[done]; + elba = ((uint64_t)entry[5] << 40) | + ((uint64_t)entry[4] << 32) | + ((uint64_t)entry[3] << 24) | + ((uint64_t)entry[2] << 16) | + ((uint64_t)entry[1] << 8) | + entry[0]; + elen = (uint16_t)entry[7] << 8 | entry[6]; + done += 8; + if (elen == 0) { + if (done >= len) { + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + p->pending &= ~(1 << slot); + ahci_check_stopped(p); + if (!first) + ahci_handle_port(p); + return; + } + goto next; + } + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + aior->more = (len != done); + + breq = &aior->io_req; + breq->br_offset = elba * blockif_sectsz(p->bctx); + breq->br_resid = elen * blockif_sectsz(p->bctx); + + /* + * Mark this command in-flight. 
+ */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + err = blockif_delete(p->bctx, breq); + assert(err == 0); +} + +static inline void +write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *from; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + from = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = len < dbcsz ? len : dbcsz; + memcpy(ptr, from, sublen); + len -= sublen; + from += sublen; + prdt++; + } + hdr->prdbc = size - len; +} + +static void +ahci_checksum(uint8_t *buf, int size) +{ + int i; + uint8_t sum = 0; + + for (i = 0; i < size - 1; i++) + sum += buf[i]; + buf[size - 1] = 0x100 - sum; +} + +static void +ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + uint8_t buf[512]; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0 || cfis[4] != 0x10 || + cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + memset(buf, 0, sizeof(buf)); + memcpy(buf, p->err_cfis, sizeof(p->err_cfis)); + ahci_checksum(buf, sizeof(buf)); + + if (cfis[2] == ATA_READ_LOG_EXT) + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); +} + +static void +handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * 
AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else { + uint16_t buf[256]; + uint64_t sectors; + int sectsz, psectsz, psectoff, candelete, ro; + uint16_t cyl; + uint8_t sech, heads; + + ro = blockif_is_ro(p->bctx); + candelete = blockif_candelete(p->bctx); + sectsz = blockif_sectsz(p->bctx); + sectors = blockif_size(p->bctx) / sectsz; + blockif_chs(p->bctx, &cyl, &heads, &sech); + blockif_psectsz(p->bctx, &psectsz, &psectoff); + memset(buf, 0, sizeof(buf)); + buf[0] = 0x0040; + buf[1] = cyl; + buf[3] = heads; + buf[6] = sech; + ata_string((uint8_t *)(buf+10), p->ident, 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); + buf[47] = (0x8000 | 128); + buf[48] = 0x1; + buf[49] = (1 << 8 | 1 << 9 | 1 << 11); + buf[50] = (1 << 14); + buf[53] = (1 << 1 | 1 << 2); + if (p->mult_sectors) + buf[59] = (0x100 | p->mult_sectors); + if (sectors <= 0x0fffffff) { + buf[60] = sectors; + buf[61] = (sectors >> 16); + } else { + buf[60] = 0xffff; + buf[61] = 0x0fff; + } + buf[63] = 0x7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 0x3; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[69] = 0; + buf[75] = 31; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | + ATA_SUPPORT_NCQ); + buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | + (p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[80] = 0x3f0; + buf[81] = 0x28; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); + buf[84] = (1 << 14); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); + buf[87] = (1 << 14); + buf[88] = 
0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[100] = sectors; + buf[101] = (sectors >> 16); + buf[102] = (sectors >> 32); + buf[103] = (sectors >> 48); + if (candelete && !ro) { + buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; + buf[105] = 1; + buf[169] = ATA_SUPPORT_DSM_TRIM; + } + buf[106] = 0x4000; + buf[209] = 0x4000; + if (psectsz > sectsz) { + buf[106] |= 0x2000; + buf[106] |= ffsl(psectsz / sectsz) - 1; + buf[209] |= (psectoff / sectsz); + } + if (sectsz > 512) { + buf[106] |= 0x1000; + buf[117] = sectsz / 2; + buf[118] = ((sectsz / 2) >> 16); + } + buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); + } +} + +static void +handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) +{ + if (!p->atapi) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else { + uint16_t buf[256]; + + memset(buf, 0, sizeof(buf)); + buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); + ata_string((uint8_t *)(buf+10), p->ident, 20); + ata_string((uint8_t *)(buf+23), "001", 8); + ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); + buf[49] = (1 << 9 | 1 << 8); + buf[50] = (1 << 14 | 1); + buf[53] = (1 << 2 | 1 << 1); + buf[62] = 0x3f; + buf[63] = 7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); + buf[64] = 3; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); + buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[78] = (1 << 5); + buf[80] = 0x3f0; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); + buf[83] = (1 << 14); + buf[84] = (1 << 14); + buf[85] = 
(ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); + buf[87] = (1 << 14); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); + } +} + +static void +atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[36]; + uint8_t *acmd; + int len; + uint32_t tfd; + + acmd = cfis + 0x40; + + if (acmd[1] & 1) { /* VPD */ + if (acmd[2] == 0) { /* Supported VPD pages */ + buf[0] = 0x05; + buf[1] = 0; + buf[2] = 0; + buf[3] = 1; + buf[4] = 0; + len = 4 + buf[3]; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + } else { + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + len = sizeof(buf); + } + + if (len > acmd[4]) + len = acmd[4]; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, len); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[8]; + uint64_t sectors; + + sectors = blockif_size(p->bctx) / 2048; + be32enc(buf, sectors - 1); + be32enc(buf + 4, 2048); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + +static void +atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint8_t format; + 
int len; + + acmd = cfis + 0x40; + + len = be16dec(acmd + 7); + format = acmd[9] >> 6; + switch (format) { + case 0: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, buf[20], *bp; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + if (start_track > 1 && start_track != 0xaa) { + uint32_t tfd; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + if (start_track <= 1) { + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 1; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + } + *bp++ = 0; + *bp++ = 0x14; + *bp++ = 0xaa; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 1: + { + uint8_t buf[12]; + + memset(buf, 0, sizeof(buf)); + buf[1] = 0xa; + buf[2] = 0x1; + buf[3] = 0x1; + if (len > sizeof(buf)) + len = sizeof(buf); + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + case 2: + { + int msf, size; + uint64_t sectors; + uint8_t start_track, *bp, buf[50]; + + msf = (acmd[1] >> 1) & 1; + start_track = acmd[6]; + bp = buf + 2; + *bp++ = 1; + *bp++ = 1; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 
0xa1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 0xa2; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + sectors >>= 2; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, sectors); + bp += 3; + } else { + be32enc(bp, sectors); + bp += 4; + } + + *bp++ = 1; + *bp++ = 0x14; + *bp++ = 0; + *bp++ = 1; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + if (msf) { + *bp++ = 0; + lba_to_msf(bp, 0); + bp += 3; + } else { + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + *bp++ = 0; + } + + size = bp - buf; + be16enc(buf, size - 2); + if (len > size) + len = size; + write_prdt(p, slot, cfis, buf, len); + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + } + default: + { + uint32_t tfd; + + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + break; + } + } +}

/*
 * REPORT LUNS: answer with a zero-filled 16-byte buffer whose only non-zero
 * byte is buf[3] = 8, then complete the command with READY | DSC.
 */
static void
atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t buf[16];

	memset(buf, 0, sizeof(buf));
	buf[3] = 8;

	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	write_prdt(p, slot, cfis, buf, sizeof(buf));
	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
}

/*
 * READ(10) / READ(12): queue a read of the backing store.
 *
 * p	 port that owns the command
 * slot	 command slot being serviced
 * cfis	 command FIS; the ATAPI packet starts at cfis + 0x40 and the PRD
 *	 table at cfis + 0x80
 * done	 byte offset already covered by earlier passes over this command;
 *	 it is added to the starting offset of the new request
 *
 * The LBA is taken from packet bytes 2..5 and the block count from bytes
 * 7..8 (READ_10) or 6..9 (otherwise); both are scaled by 2048.
 * The function asserts that a free ahci_ioreq is available and that
 * blockif_read() succeeds.
 */
static void
atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
{
	struct ahci_ioreq *aior;
	struct ahci_cmd_hdr *hdr;
	struct ahci_prdt_entry *prdt;
	struct blockif_req *breq;
	uint8_t *acmd;
	uint64_t lba;
	uint32_t len;
	int err;

	acmd = cfis + 0x40;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);

	lba = be32dec(acmd + 2);
	if (acmd[0] == READ_10)
		len = be16dec(acmd + 7);
	else
		len = be32dec(acmd + 6);
	if (len == 0) {
		/*
		 * Nothing to transfer: the command is finished here.
		 * Returning is required; falling through would queue a
		 * blockif request for a slot that has already been
		 * completed and complete it a second time from the
		 * callback.
		 */
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		return;
	}
	lba *= 2048;
	len *= 2048;

	/*
	 * Pull request off free list
	 */
	aior = STAILQ_FIRST(&p->iofhd);
	assert(aior != NULL);
	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
	aior->cfis = cfis;
	aior->slot = slot;
	aior->len = len;
	aior->done = done;
	breq = &aior->io_req;
	breq->br_offset = lba + done;
	ahci_build_iov(p, aior, prdt, hdr->prdtl);

	/* Mark this command in-flight. */
	p->pending |= 1 << slot;

	/* Stuff request onto busy list. */
	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);

	err = blockif_read(p->bctx, breq);
	assert(err == 0);
}

/*
 * REQUEST SENSE: return the port's saved sense_key and asc.
 *
 * The reply is at most 64 bytes and is truncated to the allocation length
 * in packet byte 4.  Byte 0 is 0x70 with bit 7 set, byte 2 the sense key,
 * byte 7 is 10 and byte 12 the ASC.
 */
static void
atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t buf[64];
	uint8_t *acmd;
	int len;

	acmd = cfis + 0x40;
	len = acmd[4];
	if (len > sizeof(buf))
		len = sizeof(buf);
	/* Only the first len bytes are sent, so only those are cleared. */
	memset(buf, 0, len);
	buf[0] = 0x70 | (1 << 7);
	buf[2] = p->sense_key;
	buf[7] = 10;
	buf[12] = p->asc;
	write_prdt(p, slot, cfis, buf, len);
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
}

/*
 * START STOP UNIT: the low two bits of packet byte 4 select the action.
 * Values 0, 1 and 3 are acknowledged without doing anything; value 2
 * (eject, not implemented) fails with ILLEGAL REQUEST / ASC 0x53.
 */
static void
atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd = cfis + 0x40;
	uint32_t tfd;

	/* All four values of the 2-bit field are handled below. */
	switch (acmd[4] & 3) {
	case 0:
	case 1:
	case 3:
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		tfd = ATA_S_READY | ATA_S_DSC;
		break;
	case 2:
		/* TODO eject media */
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x53;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		break;
	}
	ahci_write_fis_d2h(p, slot, cfis, tfd);
}

+static void +atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + uint8_t pc, code; + int len; + + acmd = cfis + 0x40; + len = be16dec(acmd + 7); + pc = acmd[2] >> 6; + code = acmd[2] & 0x3f; + + switch 
(pc) { + case 0: + switch (code) { + case MODEPAGE_RW_ERROR_RECOVERY: + { + uint8_t buf[16]; + + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 16 - 2); + buf[2] = 0x70; + buf[8] = 0x01; + buf[9] = 16 - 10; + buf[11] = 0x05; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + case MODEPAGE_CD_CAPABILITIES: + { + uint8_t buf[30]; + + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 30 - 2); + buf[2] = 0x70; + buf[8] = 0x2A; + buf[9] = 30 - 10; + buf[10] = 0x08; + buf[12] = 0x71; + be16enc(&buf[18], 2); + be16enc(&buf[20], 512); + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + break; + } + default: + goto error; + break; + } + break; + case 3: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x39; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; +error: + case 1: + case 2: + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + break; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +atapi_get_event_status_notification(struct ahci_port *p, int slot, + uint8_t *cfis) +{ + uint8_t *acmd; + uint32_t tfd; + + acmd = cfis + 0x40; + + /* we don't support asynchronous operation */ + if (!(acmd[1] & 1)) { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + } else { + uint8_t buf[8]; + int len; + + len = be16dec(acmd + 7); + if (len > sizeof(buf)) + len = sizeof(buf); + + memset(buf, 0, sizeof(buf)); + be16enc(buf, 8 - 2); + buf[2] = 0x04; + buf[3] = 0x10; + buf[5] = 0x02; + write_prdt(p, slot, cfis, buf, len); + tfd = ATA_S_READY | ATA_S_DSC; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); +} + +static void +handle_packet_cmd(struct ahci_port *p, int slot, 
uint8_t *cfis) +{ + uint8_t *acmd; + + acmd = cfis + 0x40; + +#ifdef AHCI_DEBUG + { + int i; + DPRINTF("ACMD:"); + for (i = 0; i < 16; i++) + DPRINTF("%02x ", acmd[i]); + DPRINTF("\n"); + } +#endif + + switch (acmd[0]) { + case TEST_UNIT_READY: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case INQUIRY: + atapi_inquiry(p, slot, cfis); + break; + case READ_CAPACITY: + atapi_read_capacity(p, slot, cfis); + break; + case PREVENT_ALLOW: + /* TODO */ + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case READ_TOC: + atapi_read_toc(p, slot, cfis); + break; + case REPORT_LUNS: + atapi_report_luns(p, slot, cfis); + break; + case READ_10: + case READ_12: + atapi_read(p, slot, cfis, 0); + break; + case REQUEST_SENSE: + atapi_request_sense(p, slot, cfis); + break; + case START_STOP_UNIT: + atapi_start_stop_unit(p, slot, cfis); + break; + case MODE_SENSE_10: + atapi_mode_sense(p, slot, cfis); + break; + case GET_EVENT_STATUS_NOTIFICATION: + atapi_get_event_status_notification(p, slot, cfis); + break; + default: + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x20; + ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | + ATA_S_READY | ATA_S_ERROR); + break; + } +} + +static void +ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) +{ + + p->tfd |= ATA_S_BUSY; + switch (cfis[2]) { + case ATA_ATA_IDENTIFY: + handle_identify(p, slot, cfis); + break; + case ATA_SETFEATURES: + { + switch (cfis[3]) { + case ATA_SF_ENAB_SATA_SF: + switch (cfis[12]) { + case ATA_SATA_SF_AN: + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + break; + case ATA_SF_ENAB_WCACHE: + case ATA_SF_DIS_WCACHE: + case ATA_SF_ENAB_RCACHE: + case ATA_SF_DIS_RCACHE: + p->tfd = ATA_S_DSC | ATA_S_READY; + 
break; + case ATA_SF_SETXFER: + { + switch (cfis[12] & 0xf8) { + case ATA_PIO: + case ATA_PIO0: + break; + case ATA_WDMA0: + case ATA_UDMA0: + p->xfermode = (cfis[12] & 0x7); + break; + } + p->tfd = ATA_S_DSC | ATA_S_READY; + break; + } + default: + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + break; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + } + case ATA_SET_MULTI: + if (cfis[12] != 0 && + (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { + p->tfd = ATA_S_ERROR | ATA_S_READY; + p->tfd |= (ATA_ERROR_ABORT << 8); + } else { + p->mult_sectors = cfis[12]; + p->tfd = ATA_S_DSC | ATA_S_READY; + } + ahci_write_fis_d2h(p, slot, cfis, p->tfd); + break; + case ATA_READ: + case ATA_WRITE: + case ATA_READ48: + case ATA_WRITE48: + case ATA_READ_MUL: + case ATA_WRITE_MUL: + case ATA_READ_MUL48: + case ATA_WRITE_MUL48: + case ATA_READ_DMA: + case ATA_WRITE_DMA: + case ATA_READ_DMA48: + case ATA_WRITE_DMA48: + case ATA_READ_FPDMA_QUEUED: + case ATA_WRITE_FPDMA_QUEUED: + ahci_handle_rw(p, slot, cfis, 0); + break; + case ATA_FLUSHCACHE: + case ATA_FLUSHCACHE48: + ahci_handle_flush(p, slot, cfis); + break; + case ATA_DATA_SET_MANAGEMENT: + if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && + cfis[13] == 0 && cfis[12] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_SEND_FPDMA_QUEUED: + if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && + cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && + cfis[11] == 0 && cfis[13] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_READ_LOG_EXT: + case ATA_READ_LOG_DMA_EXT: + ahci_handle_read_log(p, slot, cfis); + break; + case ATA_NOP: + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_STANDBY_CMD: + case ATA_STANDBY_IMMEDIATE: 
+ case ATA_IDLE_CMD: + case ATA_IDLE_IMMEDIATE: + case ATA_SLEEP: + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_ATAPI_IDENTIFY: + handle_atapi_identify(p, slot, cfis); + break; + case ATA_PACKET_CMD: + if (!p->atapi) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + } else + handle_packet_cmd(p, slot, cfis); + break; + default: + WPRINTF("Unsupported cmd:%02x\n", cfis[2]); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + } +} + +static void +ahci_handle_slot(struct ahci_port *p, int slot) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + struct pci_ahci_softc *sc; + uint8_t *cfis; + int cfl; + + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + cfl = (hdr->flags & 0x1f) * 4; + cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + +#ifdef AHCI_DEBUG + DPRINTF("\ncfis:"); + for (i = 0; i < cfl; i++) { + if (i % 10 == 0) + DPRINTF("\n"); + DPRINTF("%02x ", cfis[i]); + } + DPRINTF("\n"); + + for (i = 0; i < hdr->prdtl; i++) { + DPRINTF("%d@%08"PRIx64"\n", prdt->dbc & 0x3fffff, prdt->dba); + prdt++; + } +#endif + + if (cfis[0] != FIS_TYPE_REGH2D) { + WPRINTF("Not a H2D FIS:%02x\n", cfis[0]); + return; + } + + if (cfis[1] & 0x80) { + ahci_handle_cmd(p, slot, cfis); + } else { + if (cfis[15] & (1 << 2)) + p->reset = 1; + else if (p->reset) { + p->reset = 0; + ahci_port_reset(p); + } + p->ci &= ~(1 << slot); + } +} + +static void +ahci_handle_port(struct ahci_port *p) +{ + + if (!(p->cmd & AHCI_P_CMD_ST)) + return; + + /* + * Search for any new commands to issue ignoring those that + * are already in-flight. Stop if device is busy or in error. 
+ */ + for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { + if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) + break; + if (p->waitforclear) + break; + if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { + p->cmd &= ~AHCI_P_CMD_CCS_MASK; + p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; + ahci_handle_slot(p, p->ccs); + } + } +} + +/* + * blockif callback routine - this runs in the context of the blockif + * i/o thread, so the mutex needs to be acquired. + */ +static void +ata_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint32_t tfd; + uint8_t *cfis; + int slot, ncq, dsm; + + DPRINTF("%s %d\n", __func__, err); + + ncq = dsm = 0; + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + + if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + ncq = 1; + if (cfis[2] == ATA_DATA_SET_MANAGEMENT || + (cfis[2] == ATA_SEND_FPDMA_QUEUED && + (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) + dsm = 1; + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + if (dsm) + ahci_handle_dsm_trim(p, slot, cfis, aior->done); + else + ahci_handle_rw(p, slot, cfis, aior->done); + goto out; + } + + if (!err) + tfd = ATA_S_READY | ATA_S_DSC; + else + tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; + if (ncq) + ahci_write_fis_sdb(p, slot, cfis, tfd); + else + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. 
+ */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); + ahci_handle_port(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +atapi_ioreq_cb(struct blockif_req *br, int err) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *aior; + struct ahci_port *p; + struct pci_ahci_softc *sc; + uint8_t *cfis; + uint32_t tfd; + int slot; + + DPRINTF("%s %d\n", __func__, err); + + aior = br->br_param; + p = aior->io_pr; + cfis = aior->cfis; + slot = aior->slot; + sc = p->pr_sc; + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); + + pthread_mutex_lock(&sc->mtx); + + /* + * Delete the blockif request from the busy list + */ + TAILQ_REMOVE(&p->iobhd, aior, io_blist); + + /* + * Move the blockif request back to the free list + */ + STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); + + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + atapi_read(p, slot, cfis, aior->done); + goto out; + } + + if (!err) { + tfd = ATA_S_READY | ATA_S_DSC; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x21; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + } + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + + /* + * This command is now complete. 
+ */ + p->pending &= ~(1 << slot); + + ahci_check_stopped(p); + ahci_handle_port(p); +out: + pthread_mutex_unlock(&sc->mtx); + DPRINTF("%s exit\n", __func__); +} + +static void +pci_ahci_ioreq_init(struct ahci_port *pr) +{ + struct ahci_ioreq *vr; + int i; + + pr->ioqsz = blockif_queuesz(pr->bctx); + pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); + STAILQ_INIT(&pr->iofhd); + + /* + * Add all i/o request entries to the free queue + */ + for (i = 0; i < pr->ioqsz; i++) { + vr = &pr->ioreq[i]; + vr->io_pr = pr; + if (!pr->atapi) + vr->io_req.br_callback = ata_ioreq_cb; + else + vr->io_req.br_callback = atapi_ioreq_cb; + vr->io_req.br_param = vr; + STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); + } + + TAILQ_INIT(&pr->iobhd); +} + +static void +pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + struct ahci_port *p = &sc->port[port]; + + DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + port, offset, value); + + switch (offset) { + case AHCI_P_CLB: + p->clb = value; + break; + case AHCI_P_CLBU: + p->clbu = value; + break; + case AHCI_P_FB: + p->fb = value; + break; + case AHCI_P_FBU: + p->fbu = value; + break; + case AHCI_P_IS: + p->is &= ~value; + break; + case AHCI_P_IE: + p->ie = value & 0xFDC000FF; + ahci_generate_intr(sc); + break; + case AHCI_P_CMD: + { + p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); + p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; + + if (!(value & AHCI_P_CMD_ST)) { + ahci_port_stop(p); + } else { + uint64_t clb; + + p->cmd |= AHCI_P_CMD_CR; + 
clb = (uint64_t)p->clbu << 32 | p->clb; + p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, + AHCI_CL_SIZE * AHCI_MAX_SLOTS); + } + + if (value & AHCI_P_CMD_FRE) { + uint64_t fb; + + p->cmd |= AHCI_P_CMD_FR; + fb = (uint64_t)p->fbu << 32 | p->fb; + /* we don't support FBSCP, so rfis size is 256Bytes */ + p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); + } else { + p->cmd &= ~AHCI_P_CMD_FR; + } + + if (value & AHCI_P_CMD_CLO) { + p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); + p->cmd &= ~AHCI_P_CMD_CLO; + } + + if (value & AHCI_P_CMD_ICC_MASK) { + p->cmd &= ~AHCI_P_CMD_ICC_MASK; + } + + ahci_handle_port(p); + break; + } + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_P_SCTL: + p->sctl = value; + if (!(p->cmd & AHCI_P_CMD_ST)) { + if (value & ATA_SC_DET_RESET) + ahci_port_reset(p); + } + break; + case AHCI_P_SERR: + p->serr &= ~value; + break; + case AHCI_P_SACT: + p->sact |= value; + break; + case AHCI_P_CI: + p->ci |= value; + ahci_handle_port(p); + break; + case AHCI_P_SNTF: + case AHCI_P_FBS: + default: + break; + } +} + +static void +pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) +{ + DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n", + offset, value); + + switch (offset) { + case AHCI_CAP: + case AHCI_PI: + case AHCI_VS: + case AHCI_CAP2: + DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset); + break; + case AHCI_GHC: + if (value & AHCI_GHC_HR) + ahci_reset(sc); + else if (value & AHCI_GHC_IE) { + sc->ghc |= AHCI_GHC_IE; + ahci_generate_intr(sc); + } + break; + case AHCI_IS: + sc->is &= ~value; + ahci_generate_intr(sc); + break; + default: + break; + } +} + +static void +pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + + assert(baridx == 5); + assert((offset % 4) == 0 && size == 
4); + + pthread_mutex_lock(&sc->mtx); + + if (offset < AHCI_OFFSET) + pci_ahci_host_write(sc, offset, value); + else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) + pci_ahci_port_write(sc, offset, value); + else + WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + + switch (offset) { + case AHCI_CAP: + case AHCI_GHC: + case AHCI_IS: + case AHCI_PI: + case AHCI_VS: + case AHCI_CCCC: + case AHCI_CCCP: + case AHCI_EM_LOC: + case AHCI_EM_CTL: + case AHCI_CAP2: + { + uint32_t *p = &sc->cap; + p += (offset - AHCI_CAP) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n", + offset, value); + + return (value); +} + +static uint64_t +pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) +{ + uint32_t value; + int port = (offset - AHCI_OFFSET) / AHCI_STEP; + offset = (offset - AHCI_OFFSET) % AHCI_STEP; + + switch (offset) { + case AHCI_P_CLB: + case AHCI_P_CLBU: + case AHCI_P_FB: + case AHCI_P_FBU: + case AHCI_P_IS: + case AHCI_P_IE: + case AHCI_P_CMD: + case AHCI_P_TFD: + case AHCI_P_SIG: + case AHCI_P_SSTS: + case AHCI_P_SCTL: + case AHCI_P_SERR: + case AHCI_P_SACT: + case AHCI_P_CI: + case AHCI_P_SNTF: + case AHCI_P_FBS: + { + uint32_t *p= &sc->port[port].clb; + p += (offset - AHCI_P_CLB) / sizeof(uint32_t); + value = *p; + break; + } + default: + value = 0; + break; + } + + DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n", + port, offset, value); + + return value; +} + +static uint64_t +pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t regoff, int size) +{ + struct pci_ahci_softc *sc = pi->pi_arg; + uint64_t offset; + uint32_t value; + + assert(baridx == 5); + assert(size == 1 || size == 2 || size == 4); + assert((regoff & (size - 1)) == 0); + + 
pthread_mutex_lock(&sc->mtx); + + offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ + if (offset < AHCI_OFFSET) + value = pci_ahci_host_read(sc, offset); + else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) + value = pci_ahci_port_read(sc, offset); + else { + value = 0; + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", + regoff); + } + value >>= 8 * (regoff & 0x3); + + pthread_mutex_unlock(&sc->mtx); + + return (value); +} + +static int +pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) +{ + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; + struct pci_ahci_softc *sc; + int ret, slots; + MD5_CTX mdctx; + u_char digest[16]; + + ret = 0; + + if (opts == NULL) { + fprintf(stderr, "pci_ahci: backing device required\n"); + return (1); + } + +#ifdef AHCI_DEBUG + dbg = fopen("/tmp/log", "w+"); +#endif + + sc = calloc(1, sizeof(struct pci_ahci_softc)); + pi->pi_arg = sc; + sc->asc_pi = pi; + sc->ports = MAX_PORTS; + + /* + * Only use port 0 for a backing device. All other ports will be + * marked as unused + */ + sc->port[0].atapi = atapi; + + /* + * Attempt to open the backing image. Use the PCI + * slot/func for the identifier string. + */ + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + ret = 1; + goto open_fail; + } + sc->port[0].bctx = bctxt; + sc->port[0].pr_sc = sc; + + /* + * Create an identifier for the backing file. 
Use parts of the + * md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + sprintf(sc->port[0].ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[0]); + + pthread_mutex_init(&sc->mtx, NULL); + + /* Intel ICH8 AHCI */ + slots = sc->port[0].ioqsz; + if (slots > 32) + slots = 32; + --slots; + sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | + AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | + AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| + AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | + (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); + + /* Only port 0 implemented */ + sc->pi = 1; + sc->vs = 0x10300; + sc->cap2 = AHCI_CAP2_APST; + ahci_reset(sc); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); + pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); + pci_emul_add_msicap(pi, 1); + pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, + AHCI_OFFSET + sc->ports * AHCI_STEP); + + pci_lintr_request(pi); + +open_fail: + if (ret) { + if (sc->port[0].bctx != NULL) + blockif_close(sc->port[0].bctx); + free(sc); + } + + return (ret); +} + +static int +pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 0)); +} + +static int +pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + return (pci_ahci_init(ctx, pi, opts, 1)); +} + +/* + * Use separate emulation names to distinguish drive and atapi devices + */ +struct pci_devemu pci_de_ahci_hd = { + .pe_emu = "ahci-hd", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; 
+PCI_EMUL_SET(pci_de_ahci_hd); + +struct pci_devemu pci_de_ahci_cd = { + .pe_emu = "ahci-cd", + .pe_init = pci_ahci_atapi_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/bhyve/pci_emul.c b/bhyve/pci_emul.c new file mode 100644 index 0000000..03ff0c0 --- /dev/null +++ b/bhyve/pci_emul.c @@ -0,0 +1,2108 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "acpi.h" +#include "bhyverun.h" +#include "inout.h" +#include "ioapic.h" +#include "mem.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc + +#define CONF1_ENABLE 0x80000000ul + +#define MAXBUSES (PCI_BUSMAX + 1) +#define MAXSLOTS (PCI_SLOTMAX + 1) +#define MAXFUNCS (PCI_FUNCMAX + 1) + +struct funcinfo { + char *fi_name; + char *fi_param; + struct pci_devinst *fi_devi; +}; + +struct intxinfo { + int ii_count; + int ii_pirq_pin; + int ii_ioapic_irq; +}; + +struct slotinfo { + struct intxinfo si_intpins[4]; + struct funcinfo si_funcs[MAXFUNCS]; +}; + +struct businfo { + uint16_t iobase, iolimit; /* I/O window */ + uint32_t membase32, memlimit32; /* mmio window below 4GB */ + uint64_t membase64, memlimit64; /* mmio window above 4GB */ + struct slotinfo slotinfo[MAXSLOTS]; +}; + +static struct businfo *pci_businfo[MAXBUSES]; + +SET_DECLARE(pci_devemu_set, struct pci_devemu); + +static uint64_t pci_emul_iobase; +static uint64_t pci_emul_membase32; +static uint64_t pci_emul_membase64; + +#define PCI_EMUL_IOBASE 0x2000 +#define PCI_EMUL_IOLIMIT 0x10000 + +#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ +#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ +SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); + +#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE + +#define PCI_EMUL_MEMBASE64 0xD000000000UL +#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL + +static struct pci_devemu *pci_emul_finddev(char *name); +static void pci_lintr_route(struct pci_devinst *pi); +static void pci_lintr_update(struct pci_devinst *pi); +static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, + int func, int coff, int bytes, uint32_t *val); + +static __inline void 
+CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) +{ + + if (bytes == 1) + pci_set_cfgdata8(pi, coff, val); + else if (bytes == 2) + pci_set_cfgdata16(pi, coff, val); + else + pci_set_cfgdata32(pi, coff, val); +} + +static __inline uint32_t +CFGREAD(struct pci_devinst *pi, int coff, int bytes) +{ + + if (bytes == 1) + return (pci_get_cfgdata8(pi, coff)); + else if (bytes == 2) + return (pci_get_cfgdata16(pi, coff)); + else + return (pci_get_cfgdata32(pi, coff)); +} + +/* + * I/O access + */ + +/* + * Slot options are in the form: + * + * ::,[,] + * [:],[,] + * + * slot is 0..31 + * func is 0..7 + * emul is a string describing the type of PCI device e.g. virtio-net + * config is an optional string, depending on the device, that can be + * used for configuration. + * Examples are: + * 1,virtio-net,tap0 + * 3:0,dummy + */ +static void +pci_parse_slot_usage(char *aopt) +{ + + fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt); +} + +int +pci_parse_slot(char *opt) +{ + struct businfo *bi; + struct slotinfo *si; + char *emul, *config, *str, *cp; + int error, bnum, snum, fnum; + + error = -1; + str = strdup(opt); + + emul = config = NULL; + if ((cp = strchr(str, ',')) != NULL) { + *cp = '\0'; + emul = cp + 1; + if ((cp = strchr(emul, ',')) != NULL) { + *cp = '\0'; + config = cp + 1; + } + } else { + pci_parse_slot_usage(opt); + goto done; + } + + /* :: */ + if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { + bnum = 0; + /* : */ + if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { + fnum = 0; + /* */ + if (sscanf(str, "%d", &snum) != 1) { + snum = -1; + } + } + } + + if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || + fnum < 0 || fnum >= MAXFUNCS) { + pci_parse_slot_usage(opt); + goto done; + } + + if (pci_businfo[bnum] == NULL) + pci_businfo[bnum] = calloc(1, sizeof(struct businfo)); + + bi = pci_businfo[bnum]; + si = &bi->slotinfo[snum]; + + if (si->si_funcs[fnum].fi_name != NULL) { + fprintf(stderr, "pci slot %d:%d 
already occupied!\n", + snum, fnum); + goto done; + } + + if (pci_emul_finddev(emul) == NULL) { + fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n", + snum, fnum, emul); + goto done; + } + + error = 0; + si->si_funcs[fnum].fi_name = emul; + si->si_funcs[fnum].fi_param = config; + +done: + if (error) + free(str); + + return (error); +} + +static int +pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) +{ + + if (offset < pi->pi_msix.pba_offset) + return (0); + + if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + return (0); + } + + return (1); +} + +int +pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value) +{ + int msix_entry_offset; + int tab_index; + char *dest; + + /* support only 4 or 8 byte writes */ + if (size != 4 && size != 8) + return (-1); + + /* + * Return if table index is beyond what device supports + */ + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + if (tab_index >= pi->pi_msix.table_count) + return (-1); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned writes */ + if ((msix_entry_offset % size) != 0) + return (-1); + + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 4) + *((uint32_t *)dest) = value; + else + *((uint64_t *)dest) = value; + + return (0); +} + +uint64_t +pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) +{ + char *dest; + int msix_entry_offset; + int tab_index; + uint64_t retval = ~0; + + /* + * The PCI standard only allows 4 and 8 byte accesses to the MSI-X + * table but we also allow 1 byte access to accomodate reads from + * ddb. 
+ */ + if (size != 1 && size != 4 && size != 8) + return (retval); + + msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* support only aligned reads */ + if ((msix_entry_offset % size) != 0) { + return (retval); + } + + tab_index = offset / MSIX_TABLE_ENTRY_SIZE; + + if (tab_index < pi->pi_msix.table_count) { + /* valid MSI-X Table access */ + dest = (char *)(pi->pi_msix.table + tab_index); + dest += msix_entry_offset; + + if (size == 1) + retval = *((uint8_t *)dest); + else if (size == 4) + retval = *((uint32_t *)dest); + else + retval = *((uint64_t *)dest); + } else if (pci_valid_pba_offset(pi, offset)) { + /* return 0 for PBA access */ + retval = 0; + } + + return (retval); +} + +int +pci_msix_table_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.table_bar); + else + return (-1); +} + +int +pci_msix_pba_bar(struct pci_devinst *pi) +{ + + if (pi->pi_msix.table != NULL) + return (pi->pi_msix.pba_bar); + else + return (-1); +} + +static int +pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + struct pci_devinst *pdi = arg; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int i; + + for (i = 0; i <= PCI_BARMAX; i++) { + if (pdi->pi_bar[i].type == PCIBAR_IO && + port >= pdi->pi_bar[i].addr && + port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { + offset = port - pdi->pi_bar[i].addr; + if (in) + *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i, + offset, bytes); + else + (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset, + bytes, *eax); + return (0); + } + } + return (-1); +} + +static int +pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + struct pci_devinst *pdi = arg1; + struct pci_devemu *pe = pdi->pi_d; + uint64_t offset; + int bidx = (int) arg2; + + assert(bidx <= PCI_BARMAX); + assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || + pdi->pi_bar[bidx].type == PCIBAR_MEM64); + 
assert(addr >= pdi->pi_bar[bidx].addr && + addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); + + offset = addr - pdi->pi_bar[bidx].addr; + + if (dir == MEM_F_WRITE) { + if (size == 8) { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + 4, *val & 0xffffffff); + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4, + 4, *val >> 32); + } else { + (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset, + size, *val); + } + } else { + if (size == 8) { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, 4); + *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset + 4, 4) << 32; + } else { + *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx, + offset, size); + } + } + + return (0); +} + + +static int +pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, + uint64_t *addr) +{ + uint64_t base; + + assert((size & (size - 1)) == 0); /* must be a power of 2 */ + + base = roundup2(*baseptr, size); + + if (base + size <= limit) { + *addr = base; + *baseptr = base + size; + return (0); + } else + return (-1); +} + +int +pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, + uint64_t size) +{ + + return (pci_emul_alloc_pbar(pdi, idx, 0, type, size)); +} + +/* + * Register (or unregister) the MMIO or I/O region associated with the BAR + * register 'idx' of an emulated pci device. 
+ */ +static void +modify_bar_registration(struct pci_devinst *pi, int idx, int registration) +{ + int error; + struct inout_port iop; + struct mem_range mr; + + switch (pi->pi_bar[idx].type) { + case PCIBAR_IO: + bzero(&iop, sizeof(struct inout_port)); + iop.name = pi->pi_name; + iop.port = pi->pi_bar[idx].addr; + iop.size = pi->pi_bar[idx].size; + if (registration) { + iop.flags = IOPORT_F_INOUT; + iop.handler = pci_emul_io_handler; + iop.arg = pi; + error = register_inout(&iop); + } else + error = unregister_inout(&iop); + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + bzero(&mr, sizeof(struct mem_range)); + mr.name = pi->pi_name; + mr.base = pi->pi_bar[idx].addr; + mr.size = pi->pi_bar[idx].size; + if (registration) { + mr.flags = MEM_F_RW; + mr.handler = pci_emul_mem_handler; + mr.arg1 = pi; + mr.arg2 = idx; + error = register_mem(&mr); + } else + error = unregister_mem(&mr); + break; + default: + error = EINVAL; + break; + } + assert(error == 0); +} + +static void +unregister_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 0); +} + +static void +register_bar(struct pci_devinst *pi, int idx) +{ + + modify_bar_registration(pi, idx, 1); +} + +/* Are we decoding i/o port accesses for the emulated pci device? */ +static int +porten(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_PORTEN); +} + +/* Are we decoding memory accesses for the emulated pci device? */ +static int +memen(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + + return (cmd & PCIM_CMD_MEMEN); +} + +/* + * Update the MMIO or I/O address that is decoded by the BAR register. + * + * If the pci device has enabled the address space decoding then intercept + * the address range decoded by the BAR register. 
+ */ +static void +update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) +{ + int decode; + + if (pi->pi_bar[idx].type == PCIBAR_IO) + decode = porten(pi); + else + decode = memen(pi); + + if (decode) + unregister_bar(pi, idx); + + switch (type) { + case PCIBAR_IO: + case PCIBAR_MEM32: + pi->pi_bar[idx].addr = addr; + break; + case PCIBAR_MEM64: + pi->pi_bar[idx].addr &= ~0xffffffffUL; + pi->pi_bar[idx].addr |= addr; + break; + case PCIBAR_MEMHI64: + pi->pi_bar[idx].addr &= 0xffffffff; + pi->pi_bar[idx].addr |= addr; + break; + default: + assert(0); + } + + if (decode) + register_bar(pi, idx); +} + +int +pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, + enum pcibar_type type, uint64_t size) +{ + int error; + uint64_t *baseptr, limit, addr, mask, lobits, bar; + + assert(idx >= 0 && idx <= PCI_BARMAX); + + if ((size & (size - 1)) != 0) + size = 1UL << flsl(size); /* round up to a power of 2 */ + + /* Enforce minimum BAR sizes required by the PCI standard */ + if (type == PCIBAR_IO) { + if (size < 4) + size = 4; + } else { + if (size < 16) + size = 16; + } + + switch (type) { + case PCIBAR_NONE: + baseptr = NULL; + addr = mask = lobits = 0; + break; + case PCIBAR_IO: + baseptr = &pci_emul_iobase; + limit = PCI_EMUL_IOLIMIT; + mask = PCIM_BAR_IO_BASE; + lobits = PCIM_BAR_IO_SPACE; + break; + case PCIBAR_MEM64: + /* + * XXX + * Some drivers do not work well if the 64-bit BAR is allocated + * above 4GB. Allow for this by allocating small requests under + * 4GB unless then allocation size is larger than some arbitrary + * number (32MB currently). 
+ */ + if (size > 32 * 1024 * 1024) { + /* + * XXX special case for device requiring peer-peer DMA + */ + if (size == 0x100000000UL) + baseptr = &hostbase; + else + baseptr = &pci_emul_membase64; + limit = PCI_EMUL_MEMLIMIT64; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | + PCIM_BAR_MEM_PREFETCH; + break; + } else { + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; + } + break; + case PCIBAR_MEM32: + baseptr = &pci_emul_membase32; + limit = PCI_EMUL_MEMLIMIT32; + mask = PCIM_BAR_MEM_BASE; + lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; + break; + default: + printf("pci_emul_alloc_base: invalid bar type %d\n", type); + assert(0); + } + + if (baseptr != NULL) { + error = pci_emul_alloc_resource(baseptr, limit, size, &addr); + if (error != 0) + return (error); + } + + pdi->pi_bar[idx].type = type; + pdi->pi_bar[idx].addr = addr; + pdi->pi_bar[idx].size = size; + + /* Initialize the BAR register in config space */ + bar = (addr & mask) | lobits; + pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); + + if (type == PCIBAR_MEM64) { + assert(idx + 1 <= PCI_BARMAX); + pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; + pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); + } + + register_bar(pdi, idx); + + return (0); +} + +#define CAP_START_OFFSET 0x40 +static int +pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) +{ + int i, capoff, reallen; + uint16_t sts; + + assert(caplen > 0); + + reallen = roundup2(caplen, 4); /* dword aligned */ + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) + capoff = CAP_START_OFFSET; + else + capoff = pi->pi_capend + 1; + + /* Check if we have enough space */ + if (capoff + reallen > PCI_REGMAX + 1) + return (-1); + + /* Set the previous capability pointer */ + if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { + pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); + 
pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); + } else + pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); + + /* Copy the capability */ + for (i = 0; i < caplen; i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + /* Set the next capability pointer */ + pci_set_cfgdata8(pi, capoff + 1, 0); + + pi->pi_prevcap = capoff; + pi->pi_capend = capoff + reallen - 1; + return (0); +} + +static struct pci_devemu * +pci_emul_finddev(char *name) +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + if (!strcmp(pdp->pe_emu, name)) { + return (pdp); + } + } + + return (NULL); +} + +static int +pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, + int func, struct funcinfo *fi) +{ + struct pci_devinst *pdi; + int err; + + pdi = calloc(1, sizeof(struct pci_devinst)); + + pdi->pi_vmctx = ctx; + pdi->pi_bus = bus; + pdi->pi_slot = slot; + pdi->pi_func = func; + pthread_mutex_init(&pdi->pi_lintr.lock, NULL); + pdi->pi_lintr.pin = 0; + pdi->pi_lintr.state = IDLE; + pdi->pi_lintr.pirq_pin = 0; + pdi->pi_lintr.ioapic_irq = 0; + pdi->pi_d = pde; + snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); + + /* Disable legacy interrupts */ + pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); + pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); + + pci_set_cfgdata8(pdi, PCIR_COMMAND, + PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN); + + err = (*pde->pe_init)(ctx, pdi, fi->fi_param); + if (err == 0) + fi->fi_devi = pdi; + else + free(pdi); + + return (err); +} + +void +pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) +{ + int mmc; + + CTASSERT(sizeof(struct msicap) == 14); + + /* Number of msi messages must be a power of 2 between 1 and 32 */ + assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); + mmc = ffs(msgnum) - 1; + + bzero(msicap, sizeof(struct msicap)); + msicap->capid = PCIY_MSI; + msicap->nextptr = nextptr; + msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); 
+} + +int +pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) +{ + struct msicap msicap; + + pci_populate_msicap(&msicap, msgnum, 0); + + return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); +} + +static void +pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, + uint32_t msix_tab_size) +{ + CTASSERT(sizeof(struct msixcap) == 12); + + assert(msix_tab_size % 4096 == 0); + + bzero(msixcap, sizeof(struct msixcap)); + msixcap->capid = PCIY_MSIX; + + /* + * Message Control Register, all fields set to + * zero except for the Table Size. + * Note: Table size N is encoded as N-1 + */ + msixcap->msgctrl = msgnum - 1; + + /* + * MSI-X BAR setup: + * - MSI-X table start at offset 0 + * - PBA table starts at a 4K aligned offset after the MSI-X table + */ + msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; + msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); +} + +static void +pci_msix_table_init(struct pci_devinst *pi, int table_entries) +{ + int i, table_size; + + assert(table_entries > 0); + assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); + + table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* set mask bit of vector control register */ + for (i = 0; i < table_entries; i++) + pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; +} + +int +pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) +{ + uint32_t tab_size; + struct msixcap msixcap; + + assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); + assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); + + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; + + /* Align table size to nearest 4K */ + tab_size = roundup2(tab_size, 4096); + + pi->pi_msix.table_bar = barnum; + pi->pi_msix.pba_bar = barnum; + pi->pi_msix.table_offset = 0; + pi->pi_msix.table_count = msgnum; + pi->pi_msix.pba_offset = tab_size; + pi->pi_msix.pba_size = PBA_SIZE(msgnum); + + pci_msix_table_init(pi, msgnum); + + 
pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); + + /* allocate memory for MSI-X Table and PBA */ + pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, + tab_size + pi->pi_msix.pba_size); + + return (pci_emul_add_capability(pi, (u_char *)&msixcap, + sizeof(msixcap))); +} + +void +msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask; + int off, table_bar; + + off = offset - capoff; + table_bar = pi->pi_msix.table_bar; + /* Message Control Register */ + if (off == 2 && bytes == 2) { + rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; + pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + uint16_t msgctrl, rwmask, msgdata, mme; + uint32_t addrlo; + + /* + * If guest is writing to the message control register make sure + * we do not overwrite read-only fields. + */ + if ((offset - capoff) == 2 && bytes == 2) { + rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; + msgctrl = pci_get_cfgdata16(pi, offset); + msgctrl &= ~rwmask; + msgctrl |= val & rwmask; + val = msgctrl; + + addrlo = pci_get_cfgdata32(pi, capoff + 4); + if (msgctrl & PCIM_MSICTRL_64BIT) + msgdata = pci_get_cfgdata16(pi, capoff + 12); + else + msgdata = pci_get_cfgdata16(pi, capoff + 8); + + mme = msgctrl & PCIM_MSICTRL_MME_MASK; + pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 
1 : 0; + if (pi->pi_msi.enabled) { + pi->pi_msi.addr = addrlo; + pi->pi_msi.msg_data = msgdata; + pi->pi_msi.maxmsgnum = 1 << (mme >> 4); + } else { + pi->pi_msi.maxmsgnum = 0; + } + pci_lintr_update(pi); + } + + CFGWRITE(pi, offset, val, bytes); +} + +void +pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val) +{ + + /* XXX don't write to the readonly parts */ + CFGWRITE(pi, offset, val, bytes); +} + +#define PCIECAP_VERSION 0x2 +int +pci_emul_add_pciecap(struct pci_devinst *pi, int type) +{ + int err; + struct pciecap pciecap; + + CTASSERT(sizeof(struct pciecap) == 60); + + if (type != PCIEM_TYPE_ROOT_PORT) + return (-1); + + bzero(&pciecap, sizeof(pciecap)); + + pciecap.capid = PCIY_EXPRESS; + pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT; + pciecap.link_capabilities = 0x411; /* gen1, x1 */ + pciecap.link_status = 0x11; /* gen1, x1 */ + + err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); + return (err); +} + +/* + * This function assumes that 'coff' is in the capabilities region of the + * config space. + */ +static void +pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val) +{ + int capid; + uint8_t capoff, nextoff; + + /* Do not allow un-aligned writes */ + if ((offset & (bytes - 1)) != 0) + return; + + /* Find the capability that we want to update */ + capoff = CAP_START_OFFSET; + while (1) { + nextoff = pci_get_cfgdata8(pi, capoff + 1); + if (nextoff == 0) + break; + if (offset >= capoff && offset < nextoff) + break; + + capoff = nextoff; + } + assert(offset >= capoff); + + /* + * Capability ID and Next Capability Pointer are readonly. + * However, some o/s's do 4-byte writes that include these. + * For this case, trim the write back to 2 bytes and adjust + * the data. 
+ */ + if (offset == capoff || offset == capoff + 1) { + if (offset == capoff && bytes == 4) { + bytes = 2; + offset += 2; + val >>= 16; + } else + return; + } + + capid = pci_get_cfgdata8(pi, capoff); + switch (capid) { + case PCIY_MSI: + msicap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_MSIX: + msixcap_cfgwrite(pi, capoff, offset, bytes, val); + break; + case PCIY_EXPRESS: + pciecap_cfgwrite(pi, capoff, offset, bytes, val); + break; + default: + break; + } +} + +static int +pci_emul_iscap(struct pci_devinst *pi, int offset) +{ + uint16_t sts; + + sts = pci_get_cfgdata16(pi, PCIR_STATUS); + if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { + if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) + return (1); + } + return (0); +} + +static int +pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + /* + * Ignore writes; return 0xff's for reads. The mem read code + * will take care of truncating to the correct size. 
+ */ + if (dir == MEM_F_READ) { + *val = 0xffffffffffffffff; + } + + return (0); +} + +static int +pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int bytes, uint64_t *val, void *arg1, long arg2) +{ + int bus, slot, func, coff, in; + + coff = addr & 0xfff; + func = (addr >> 12) & 0x7; + slot = (addr >> 15) & 0x1f; + bus = (addr >> 20) & 0xff; + in = (dir == MEM_F_READ); + if (in) + *val = ~0UL; + pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val); + return (0); +} + +uint64_t +pci_ecfg_base(void) +{ + + return (PCI_EMUL_ECFG_BASE); +} + +#define BUSIO_ROUNDUP 32 +#define BUSMEM_ROUNDUP (1024 * 1024) + +int +init_pci(struct vmctx *ctx) +{ + struct mem_range mr; + struct pci_devemu *pde; + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + size_t lowmem; + int bus, slot, func; + int error; + + pci_emul_iobase = PCI_EMUL_IOBASE; + pci_emul_membase32 = vm_get_lowmem_limit(ctx); + pci_emul_membase64 = PCI_EMUL_MEMBASE64; + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + /* + * Keep track of the i/o and memory resources allocated to + * this bus. + */ + bi->iobase = pci_emul_iobase; + bi->membase32 = pci_emul_membase32; + bi->membase64 = pci_emul_membase64; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + pde = pci_emul_finddev(fi->fi_name); + assert(pde != NULL); + error = pci_emul_init(ctx, pde, bus, slot, + func, fi); + if (error) + return (error); + } + } + + /* + * Add some slop to the I/O and memory resources decoded by + * this bus to give a guest some flexibility if it wants to + * reprogram the BARs. 
+ */ + pci_emul_iobase += BUSIO_ROUNDUP; + pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); + bi->iolimit = pci_emul_iobase; + + pci_emul_membase32 += BUSMEM_ROUNDUP; + pci_emul_membase32 = roundup2(pci_emul_membase32, + BUSMEM_ROUNDUP); + bi->memlimit32 = pci_emul_membase32; + + pci_emul_membase64 += BUSMEM_ROUNDUP; + pci_emul_membase64 = roundup2(pci_emul_membase64, + BUSMEM_ROUNDUP); + bi->memlimit64 = pci_emul_membase64; + } + + /* + * PCI backends are initialized before routing INTx interrupts + * so that LPC devices are able to reserve ISA IRQs before + * routing PIRQ pins. + */ + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_devi == NULL) + continue; + pci_lintr_route(fi->fi_devi); + } + } + } + lpc_pirq_routed(); + + /* + * The guest physical memory map looks like the following: + * [0, lowmem) guest system memory + * [lowmem, lowmem_limit) memory hole (may be absent) + * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation) + * [0xE0000000, 0xF0000000) PCI extended config window + * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware + * [4GB, 4GB + highmem) + */ + + /* + * Accesses to memory addresses that are not allocated to system + * memory or PCI devices return 0xff's. 
+ */ + lowmem = vm_get_lowmem_size(ctx); + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI hole"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = lowmem; + mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; + mr.handler = pci_emul_fallback_handler; + error = register_mem_fallback(&mr); + assert(error == 0); + + /* PCI extended config space */ + bzero(&mr, sizeof(struct mem_range)); + mr.name = "PCI ECFG"; + mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; + mr.base = PCI_EMUL_ECFG_BASE; + mr.size = PCI_EMUL_ECFG_SIZE; + mr.handler = pci_emul_ecfg_handler; + error = register_mem(&mr); + assert(error == 0); + + return (0); +} + +static void +pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" Zero,"); + dsdt_line(" 0x%X", ioapic_irq); + dsdt_line(" },"); +} + +static void +pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, + void *arg) +{ + char *name; + + name = lpc_pirq_name(pirq_pin); + if (name == NULL) + return; + dsdt_line(" Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x%X,", slot << 16 | 0xffff); + dsdt_line(" 0x%02X,", pin - 1); + dsdt_line(" %s,", name); + dsdt_line(" 0x00"); + dsdt_line(" },"); + free(name); +} + +/* + * A bhyve virtual machine has a flat PCI hierarchy with a root port + * corresponding to each PCI bus. + */ +static void +pci_bus_write_dsdt(int bus) +{ + struct businfo *bi; + struct slotinfo *si; + struct pci_devinst *pi; + int count, func, slot; + + /* + * If there are no devices on this 'bus' then just return. + */ + if ((bi = pci_businfo[bus]) == NULL) { + /* + * Bus 0 is special because it decodes the I/O ports used + * for PCI config space access even if there are no devices + * on it. 
+ */ + if (bus != 0) + return; + } + + dsdt_line(" Device (PC%02X)", bus); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); + dsdt_line(" Name (_ADR, Zero)"); + + dsdt_line(" Method (_BBN, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Return (0x%08X)", bus); + dsdt_line(" }"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " + "MaxFixed, PosDecode,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bus); + dsdt_line(" 0x%04X, // Range Maximum", bus); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0001, // Length"); + dsdt_line(" ,, )"); + + if (bus == 0) { + dsdt_indent(3); + dsdt_fixed_ioport(0xCF8, 8); + dsdt_unindent(3); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0000, // Range Minimum"); + dsdt_line(" 0x0CF7, // Range Maximum"); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x0CF8, // Length"); + dsdt_line(" ,, , TypeStatic)"); + + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x0D00, // Range Minimum"); + dsdt_line(" 0x%04X, // Range Maximum", + PCI_EMUL_IOBASE - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + PCI_EMUL_IOBASE - 0x0D00); + dsdt_line(" ,, , TypeStatic)"); + + if (bi == NULL) { + dsdt_line(" })"); + goto done; + } + } + assert(bi != NULL); + + /* i/o window */ + dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " + "PosDecode, EntireRange,"); + dsdt_line(" 0x0000, // Granularity"); + dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); + dsdt_line(" 0x%04X, // Range Maximum", + bi->iolimit - 1); + dsdt_line(" 0x0000, // Translation Offset"); + dsdt_line(" 0x%04X, // Length", + bi->iolimit - bi->iobase); + dsdt_line(" ,, , 
TypeStatic)"); + + /* mmio window (32-bit) */ + dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x00000000, // Granularity"); + dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); + dsdt_line(" 0x%08X, // Range Maximum\n", + bi->memlimit32 - 1); + dsdt_line(" 0x00000000, // Translation Offset"); + dsdt_line(" 0x%08X, // Length\n", + bi->memlimit32 - bi->membase32); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + + /* mmio window (64-bit) */ + dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " + "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); + dsdt_line(" 0x0000000000000000, // Granularity"); + dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); + dsdt_line(" 0x%016lX, // Range Maximum\n", + bi->memlimit64 - 1); + dsdt_line(" 0x0000000000000000, // Translation Offset"); + dsdt_line(" 0x%016lX, // Length\n", + bi->memlimit64 - bi->membase64); + dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); + dsdt_line(" })"); + + count = pci_count_lintr(bus); + if (count != 0) { + dsdt_indent(2); + dsdt_line("Name (PPRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Name (APRT, Package ()"); + dsdt_line("{"); + pci_walk_lintr(bus, pci_apic_prt_entry, NULL); + dsdt_line("})"); + dsdt_line("Method (_PRT, 0, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (PICM)"); + dsdt_line(" {"); + dsdt_line(" Return (APRT)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Return (PPRT)"); + dsdt_line(" }"); + dsdt_line("}"); + dsdt_unindent(2); + } + + dsdt_indent(2); + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + pi = si->si_funcs[func].fi_devi; + if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) + pi->pi_d->pe_write_dsdt(pi); + } + } + dsdt_unindent(2); +done: + dsdt_line(" }"); +} + +void +pci_write_dsdt(void) +{ + int bus; 
+ + dsdt_indent(1); + dsdt_line("Name (PICM, 0x00)"); + dsdt_line("Method (_PIC, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" Store (Arg0, PICM)"); + dsdt_line("}"); + dsdt_line(""); + dsdt_line("Scope (_SB)"); + dsdt_line("{"); + for (bus = 0; bus < MAXBUSES; bus++) + pci_bus_write_dsdt(bus); + dsdt_line("}"); + dsdt_unindent(1); +} + +int +pci_bus_configured(int bus) +{ + assert(bus >= 0 && bus < MAXBUSES); + return (pci_businfo[bus] != NULL); +} + +int +pci_msi_enabled(struct pci_devinst *pi) +{ + return (pi->pi_msi.enabled); +} + +int +pci_msi_maxmsgnum(struct pci_devinst *pi) +{ + if (pi->pi_msi.enabled) + return (pi->pi_msi.maxmsgnum); + else + return (0); +} + +int +pci_msix_enabled(struct pci_devinst *pi) +{ + + return (pi->pi_msix.enabled && !pi->pi_msi.enabled); +} + +void +pci_generate_msix(struct pci_devinst *pi, int index) +{ + struct msix_table_entry *mte; + + if (!pci_msix_enabled(pi)) + return; + + if (pi->pi_msix.function_mask) + return; + + if (index >= pi->pi_msix.table_count) + return; + + mte = &pi->pi_msix.table[index]; + if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* XXX Set PBA bit if interrupt is disabled */ + vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); + } +} + +void +pci_generate_msi(struct pci_devinst *pi, int index) +{ + + if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { + vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, + pi->pi_msi.msg_data + index); + } +} + +static bool +pci_lintr_permitted(struct pci_devinst *pi) +{ + uint16_t cmd; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); + return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || + (cmd & PCIM_CMD_INTxDIS))); +} + +void +pci_lintr_request(struct pci_devinst *pi) +{ + struct businfo *bi; + struct slotinfo *si; + int bestpin, bestcount, pin; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + + /* + * Just allocate a pin from our slot. The pin will be + * assigned IRQs later when interrupts are routed. 
+ */ + si = &bi->slotinfo[pi->pi_slot]; + bestpin = 0; + bestcount = si->si_intpins[0].ii_count; + for (pin = 1; pin < 4; pin++) { + if (si->si_intpins[pin].ii_count < bestcount) { + bestpin = pin; + bestcount = si->si_intpins[pin].ii_count; + } + } + + si->si_intpins[bestpin].ii_count++; + pi->pi_lintr.pin = bestpin + 1; + pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); +} + +static void +pci_lintr_route(struct pci_devinst *pi) +{ + struct businfo *bi; + struct intxinfo *ii; + + if (pi->pi_lintr.pin == 0) + return; + + bi = pci_businfo[pi->pi_bus]; + assert(bi != NULL); + ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; + + /* + * Attempt to allocate an I/O APIC pin for this intpin if one + * is not yet assigned. + */ + if (ii->ii_ioapic_irq == 0) + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); + assert(ii->ii_ioapic_irq > 0); + + /* + * Attempt to allocate a PIRQ pin for this intpin if one is + * not yet assigned. + */ + if (ii->ii_pirq_pin == 0) + ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx); + assert(ii->ii_pirq_pin > 0); + + pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; + pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; + pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); +} + +void +pci_lintr_assert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == IDLE) { + if (pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } else + pi->pi_lintr.state = PENDING; + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +void +pci_lintr_deassert(struct pci_devinst *pi) +{ + + assert(pi->pi_lintr.pin > 0); + + pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED) { + pi->pi_lintr.state = IDLE; + pci_irq_deassert(pi); + } else if (pi->pi_lintr.state == PENDING) + pi->pi_lintr.state = IDLE; + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +static void +pci_lintr_update(struct pci_devinst *pi) +{ + + 
pthread_mutex_lock(&pi->pi_lintr.lock); + if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { + pci_irq_deassert(pi); + pi->pi_lintr.state = PENDING; + } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { + pi->pi_lintr.state = ASSERTED; + pci_irq_assert(pi); + } + pthread_mutex_unlock(&pi->pi_lintr.lock); +} + +int +pci_count_lintr(int bus) +{ + int count, slot, pin; + struct slotinfo *slotinfo; + + count = 0; + if (pci_businfo[bus] != NULL) { + for (slot = 0; slot < MAXSLOTS; slot++) { + slotinfo = &pci_businfo[bus]->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + if (slotinfo->si_intpins[pin].ii_count != 0) + count++; + } + } + } + return (count); +} + +void +pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) +{ + struct businfo *bi; + struct slotinfo *si; + struct intxinfo *ii; + int slot, pin; + + if ((bi = pci_businfo[bus]) == NULL) + return; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (pin = 0; pin < 4; pin++) { + ii = &si->si_intpins[pin]; + if (ii->ii_count != 0) + cb(bus, slot, pin + 1, ii->ii_pirq_pin, + ii->ii_ioapic_irq, arg); + } + } +} + +/* + * Return 1 if the emulated device in 'slot' is a multi-function device. + * Return 0 otherwise. + */ +static int +pci_emul_is_mfdev(int bus, int slot) +{ + struct businfo *bi; + struct slotinfo *si; + int f, numfuncs; + + numfuncs = 0; + if ((bi = pci_businfo[bus]) != NULL) { + si = &bi->slotinfo[slot]; + for (f = 0; f < MAXFUNCS; f++) { + if (si->si_funcs[f].fi_devi != NULL) { + numfuncs++; + } + } + } + return (numfuncs > 1); +} + +/* + * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on + * whether or not is a multi-function being emulated in the pci 'slot'. 
+ */ +static void +pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) +{ + int mfdev; + + if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { + mfdev = pci_emul_is_mfdev(bus, slot); + switch (bytes) { + case 1: + case 2: + *rv &= ~PCIM_MFDEV; + if (mfdev) { + *rv |= PCIM_MFDEV; + } + break; + case 4: + *rv &= ~(PCIM_MFDEV << 16); + if (mfdev) { + *rv |= (PCIM_MFDEV << 16); + } + break; + } + } +} + +static void +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) +{ + int i, rshift; + uint32_t cmd, cmd2, changed, old, readonly; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ + + /* + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. + * + * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are + * 'write 1 to clear'. However these bits are not set to '1' by + * any device emulation so it is simpler to treat them as readonly. + */ + rshift = (coff & 0x3) * 8; + readonly = 0xFFFFF880 >> rshift; + + old = CFGREAD(pi, coff, bytes); + new &= ~readonly; + new |= (old & readonly); + CFGWRITE(pi, coff, new, bytes); /* update config */ + + cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + changed = cmd ^ cmd2; + + /* + * If the MMIO or I/O address space decoding has changed then + * register/unregister all BARs that decode that address space. + */ + for (i = 0; i <= PCI_BARMAX; i++) { + switch (pi->pi_bar[i].type) { + case PCIBAR_NONE: + case PCIBAR_MEMHI64: + break; + case PCIBAR_IO: + /* I/O address space decoding changed? */ + if (changed & PCIM_CMD_PORTEN) { + if (porten(pi)) + register_bar(pi, i); + else + unregister_bar(pi, i); + } + break; + case PCIBAR_MEM32: + case PCIBAR_MEM64: + /* MMIO address space decoding changed? 
/*
 * Perform one guest access to PCI configuration space.
 *
 * 'in' selects the direction: non-zero is a read whose result is stored
 * through 'eax', zero is a write of '*eax'.  'bus', 'slot' and 'func'
 * select the emulated function; 'coff' and 'bytes' give the register
 * offset and access width.
 *
 * Accesses to a function that does not exist, with a width other than
 * 1, 2 or 4, or that are not naturally aligned are dropped; such reads
 * return all ones.
 */
static void
pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
    int coff, int bytes, uint32_t *eax)
{
	struct businfo *bi;
	struct slotinfo *si;
	struct pci_devinst *pi;
	struct pci_devemu *pe;
	int idx, needcfg;
	uint64_t addr, bar, mask;

	if ((bi = pci_businfo[bus]) != NULL) {
		si = &bi->slotinfo[slot];
		pi = si->si_funcs[func].fi_devi;
	} else
		pi = NULL;

	/*
	 * Just return if there is no device at this slot:func or if the
	 * guest is doing an un-aligned access.
	 */
	if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
	    (coff & (bytes - 1)) != 0) {
		if (in)
			*eax = 0xffffffff;
		return;
	}

	/*
	 * Ignore all writes beyond the standard config space.  Reads return
	 * all ones, except for the first extended capability header, which
	 * reads as zero (see below).
	 */
	if (coff >= PCI_REGMAX + 1) {
		if (in) {
			*eax = 0xffffffff;
			/*
			 * Extended capabilities begin at offset 256 in config
			 * space. Absence of extended capabilities is signaled
			 * with all 0s in the extended capability header at
			 * offset 256.
			 */
			if (coff <= PCI_REGMAX + 4)
				*eax = 0x00000000;
		}
		return;
	}

	pe = pi->pi_d;

	/*
	 * Config read
	 */
	if (in) {
		/*
		 * Let the device emulation override the default handler.
		 * A non-zero return from pe_cfgread asks for the default
		 * read from pi_cfgdata to be performed.
		 */
		if (pe->pe_cfgread != NULL) {
			needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
			    eax);
		} else {
			needcfg = 1;
		}

		if (needcfg)
			*eax = CFGREAD(pi, coff, bytes);

		/* Applied to the result regardless of who produced it. */
		pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
	} else {
		/*
		 * Let the device emulation override the default handler.
		 * A zero return from pe_cfgwrite means the write has been
		 * consumed and no default processing is done.
		 */
		if (pe->pe_cfgwrite != NULL &&
		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
			return;

		/*
		 * Special handling for write to BAR registers
		 */
		if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
			/*
			 * Ignore writes to BAR registers that are not
			 * 4-byte aligned.
			 */
			if (bytes != 4 || (coff & 0x3) != 0)
				return;
			idx = (coff - PCIR_BAR(0)) / 4;
			/* Address bits below the BAR size are not writable. */
			mask = ~(pi->pi_bar[idx].size - 1);
			switch (pi->pi_bar[idx].type) {
			case PCIBAR_NONE:
				/* Unimplemented BAR: always reads back as 0. */
				pi->pi_bar[idx].addr = bar = 0;
				break;
			case PCIBAR_IO:
				/* I/O BARs are limited to a 16-bit address. */
				addr = *eax & mask;
				addr &= 0xffff;
				bar = addr | PCIM_BAR_IO_SPACE;
				/*
				 * Register the new BAR value for interception
				 */
				if (addr != pi->pi_bar[idx].addr) {
					update_bar_address(pi, addr, idx,
					    PCIBAR_IO);
				}
				break;
			case PCIBAR_MEM32:
				addr = bar = *eax & mask;
				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
				if (addr != pi->pi_bar[idx].addr) {
					update_bar_address(pi, addr, idx,
					    PCIBAR_MEM32);
				}
				break;
			case PCIBAR_MEM64:
				/*
				 * Low dword of a 64-bit BAR: only the low 32
				 * bits of the stored address are compared.
				 */
				addr = bar = *eax & mask;
				bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
				    PCIM_BAR_MEM_PREFETCH;
				if (addr != (uint32_t)pi->pi_bar[idx].addr) {
					update_bar_address(pi, addr, idx,
					    PCIBAR_MEM64);
				}
				break;
			case PCIBAR_MEMHI64:
				/*
				 * High dword of a 64-bit BAR: size and address
				 * are kept in the preceding (idx - 1) entry.
				 */
				mask = ~(pi->pi_bar[idx - 1].size - 1);
				addr = ((uint64_t)*eax << 32) & mask;
				bar = addr >> 32;
				if (bar != pi->pi_bar[idx - 1].addr >> 32) {
					update_bar_address(pi, addr, idx - 1,
					    PCIBAR_MEMHI64);
				}
				break;
			default:
				assert(0);
			}
			/* Store the value the guest will read back. */
			pci_set_cfgdata32(pi, coff, bar);

		} else if (pci_emul_iscap(pi, coff)) {
			pci_emul_capwrite(pi, coff, bytes, *eax);
		} else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) {
			pci_emul_cmdsts_write(pi, coff, *eax, bytes);
		} else {
			CFGWRITE(pi, coff, *eax, bytes);
		}
	}
}
PCI_EMUL_MSI_MSGS 4 +#define PCI_EMUL_MSIX_MSGS 16 + +static int +pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error; + struct pci_emul_dsoftc *sc; + + sc = calloc(1, sizeof(struct pci_emul_dsoftc)); + + pi->pi_arg = sc; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); + pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); + + error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + return (0); +} + +static void +pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + int i; + struct pci_emul_dsoftc *sc = pi->pi_arg; + + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("diow: iow too large, offset %ld size %d\n", + offset, size); + return; + } + + if (size == 1) { + sc->ioregs[offset] = value & 0xff; + } else if (size == 2) { + *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; + } else if (size == 4) { + *(uint32_t *)&sc->ioregs[offset] = value; + } else { + printf("diow: iow unknown size %d\n", size); + } + + /* + * Special magic value to generate an interrupt + */ + if (offset == 4 && size == 4 && pci_msi_enabled(pi)) + pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); + + if (value == 0xabcdef) { + for (i = 0; i < pci_msi_maxmsgnum(pi); i++) + pci_generate_msi(pi, i); + } + } + + if (baridx == 1 || baridx == 2) { + if (offset + size > DMEMSZ) { + printf("diow: memw too large, offset %ld size %d\n", + offset, size); + return; + } + + i = baridx - 1; /* 'memregs' index */ + + if (size == 1) { + sc->memregs[i][offset] = value; + } else if (size == 2) { + *(uint16_t *)&sc->memregs[i][offset] = value; + } else if (size == 4) { + *(uint32_t 
*)&sc->memregs[i][offset] = value; + } else if (size == 8) { + *(uint64_t *)&sc->memregs[i][offset] = value; + } else { + printf("diow: memw unknown size %d\n", size); + } + + /* + * magic interrupt ?? + */ + } + + if (baridx > 2) { + printf("diow: unknown bar idx %d\n", baridx); + } +} + +static uint64_t +pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_emul_dsoftc *sc = pi->pi_arg; + uint32_t value; + int i; + + if (baridx == 0) { + if (offset + size > DIOSZ) { + printf("dior: ior too large, offset %ld size %d\n", + offset, size); + return (0); + } + + if (size == 1) { + value = sc->ioregs[offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->ioregs[offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->ioregs[offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + if (baridx == 1 || baridx == 2) { + if (offset + size > DMEMSZ) { + printf("dior: memr too large, offset %ld size %d\n", + offset, size); + return (0); + } + + i = baridx - 1; /* 'memregs' index */ + + if (size == 1) { + value = sc->memregs[i][offset]; + } else if (size == 2) { + value = *(uint16_t *) &sc->memregs[i][offset]; + } else if (size == 4) { + value = *(uint32_t *) &sc->memregs[i][offset]; + } else if (size == 8) { + value = *(uint64_t *) &sc->memregs[i][offset]; + } else { + printf("dior: ior unknown size %d\n", size); + } + } + + + if (baridx > 2) { + printf("dior: unknown bar idx %d\n", baridx); + return (0); + } + + return (value); +} + +struct pci_devemu pci_dummy = { + .pe_emu = "dummy", + .pe_init = pci_emul_dinit, + .pe_barwrite = pci_emul_diow, + .pe_barread = pci_emul_dior +}; +PCI_EMUL_SET(pci_dummy); + +#endif /* PCI_EMUL_TEST */ diff --git a/bhyve/pci_emul.h b/bhyve/pci_emul.h new file mode 100644 index 0000000..6b8c4e0 --- /dev/null +++ b/bhyve/pci_emul.h @@ -0,0 +1,283 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _PCI_EMUL_H_ +#define _PCI_EMUL_H_ + +#include +#include +#include +#include + +#include + +#include + +#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ + +struct vmctx; +struct pci_devinst; +struct memory_region; + +struct pci_devemu { + char *pe_emu; /* Name of device emulation */ + + /* instance creation */ + int (*pe_init)(struct vmctx *, struct pci_devinst *, + char *opts); + + /* ACPI DSDT enumeration */ + void (*pe_write_dsdt)(struct pci_devinst *); + + /* config space read/write callbacks */ + int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t val); + int (*pe_cfgread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int offset, + int bytes, uint32_t *retval); + + /* BAR read/write callbacks */ + void (*pe_barwrite)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value); + uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, + struct pci_devinst *pi, int baridx, + uint64_t offset, int size); +}; +#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); + +enum pcibar_type { + PCIBAR_NONE, + PCIBAR_IO, + PCIBAR_MEM32, + PCIBAR_MEM64, + PCIBAR_MEMHI64 +}; + +struct pcibar { + enum pcibar_type type; /* io or memory */ + uint64_t size; + uint64_t addr; +}; + +#define PI_NAMESZ 40 + +struct msix_table_entry { + uint64_t addr; + uint32_t msg_data; + uint32_t vector_control; +} __packed; + +/* + * In case the structure is modified to hold extra information, use a define + * for the size that should be emulated. 
/*
 * Per-function state of one emulated PCI device instance.
 */
struct pci_devinst {
	struct pci_devemu *pi_d;	/* emulation callbacks for this device */
	struct vmctx *pi_vmctx;		/* VM context passed to the irq helpers */
	uint8_t	  pi_bus, pi_slot, pi_func;	/* location of the function */
	char	  pi_name[PI_NAMESZ];
	int	  pi_bar_getsize;
	int	  pi_prevcap;	/* presumably capability-list bookkeeping; verify */
	int	  pi_capend;	/* presumably capability-list bookkeeping; verify */

	/* Legacy INTx state; 'state' is changed with 'lock' held. */
	struct {
		int8_t		pin;		/* 0: no INTx pin, else 1-based pin */
		enum lintr_stat	state;		/* IDLE, ASSERTED or PENDING */
		int		pirq_pin;	/* 1-based PIRQ pin, 0 if none */
		int		ioapic_irq;	/* I/O APIC IRQ for this pin */
		pthread_mutex_t	lock;
	} pi_lintr;

	/* MSI capability state. */
	struct {
		int		enabled;
		uint64_t	addr;
		uint64_t	msg_data;
		int		maxmsgnum;
	} pi_msi;

	/* MSI-X capability state. */
	struct {
		int	enabled;
		int	table_bar;
		int	pba_bar;
		uint32_t table_offset;
		int	table_count;
		uint32_t pba_offset;
		int	pba_size;
		int	function_mask;
		struct msix_table_entry *table;	/* allocated at runtime */
	} pi_msix;

	void	  *pi_arg;		/* devemu-private data */

	/* Backing store read and written by pci_{get,set}_cfgdata*(). */
	u_char	  pi_cfgdata[PCI_REGMAX + 1];
	/* Type, size and address of each BAR, indexed by BAR number. */
	struct pcibar pi_bar[PCI_BARMAX + 1];
};
uint16_t link_status2; + + uint32_t slot_capabilities2; /* ports with slots */ + uint16_t slot_control2; + uint16_t slot_status2; +} __packed; + +typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, + int ioapic_irq, void *arg); + +int init_pci(struct vmctx *ctx); +void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, + int bytes, uint32_t val); +void pci_callback(void); +int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, + enum pcibar_type type, uint64_t size); +int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, + uint64_t hostbase, enum pcibar_type type, uint64_t size); +int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); +int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); +void pci_generate_msi(struct pci_devinst *pi, int msgnum); +void pci_generate_msix(struct pci_devinst *pi, int msgnum); +void pci_lintr_assert(struct pci_devinst *pi); +void pci_lintr_deassert(struct pci_devinst *pi); +void pci_lintr_request(struct pci_devinst *pi); +int pci_msi_enabled(struct pci_devinst *pi); +int pci_msix_enabled(struct pci_devinst *pi); +int pci_msix_table_bar(struct pci_devinst *pi); +int pci_msix_pba_bar(struct pci_devinst *pi); +int pci_msi_msgnum(struct pci_devinst *pi); +int pci_parse_slot(char *opt); +void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); +int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); +int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, + uint64_t value); +uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); +int pci_count_lintr(int bus); +void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); +void pci_write_dsdt(void); +uint64_t pci_ecfg_base(void); +int pci_bus_configured(int bus); + +static __inline void +pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) +{ + 
assert(offset <= PCI_REGMAX); + *(uint8_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + *(uint16_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline void +pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + *(uint32_t *)(pi->pi_cfgdata + offset) = val; +} + +static __inline uint8_t +pci_get_cfgdata8(struct pci_devinst *pi, int offset) +{ + assert(offset <= PCI_REGMAX); + return (*(uint8_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint16_t +pci_get_cfgdata16(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); + return (*(uint16_t *)(pi->pi_cfgdata + offset)); +} + +static __inline uint32_t +pci_get_cfgdata32(struct pci_devinst *pi, int offset) +{ + assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); + return (*(uint32_t *)(pi->pi_cfgdata + offset)); +} + +#endif /* _PCI_EMUL_H_ */ diff --git a/bhyve/pci_hostbridge.c b/bhyve/pci_hostbridge.c new file mode 100644 index 0000000..54a25ae --- /dev/null +++ b/bhyve/pci_hostbridge.c @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "pci_emul.h" + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */ + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); + + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + (void) pci_hostbridge_init(ctx, pi, opts); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */ + + return (0); +} + +struct pci_devemu pci_de_amd_hostbridge = { + .pe_emu = "amd_hostbridge", + .pe_init = pci_amd_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_amd_hostbridge); + +struct pci_devemu pci_de_hostbridge = { + .pe_emu = "hostbridge", + .pe_init = pci_hostbridge_init, +}; +PCI_EMUL_SET(pci_de_hostbridge); diff --git a/bhyve/pci_irq.c b/bhyve/pci_irq.c new file mode 100644 index 0000000..f22b15c --- 
/dev/null +++ b/bhyve/pci_irq.c @@ -0,0 +1,346 @@ +/*- + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +/* + * Implement an 8 pin PCI interrupt router compatible with the router + * present on Intel's ICH10 chip. + */ + +/* Fields in each PIRQ register. */ +#define PIRQ_DIS 0x80 +#define PIRQ_IRQ 0x0f + +/* Only IRQs 3-7, 9-12, and 14-15 are permitted. 
*/ +#define PERMITTED_IRQS 0xdef8 +#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0) + +/* IRQ count to disable an IRQ. */ +#define IRQ_DISABLED 0xff + +static struct pirq { + uint8_t reg; + int use_count; + int active_count; + pthread_mutex_t lock; +} pirqs[8]; + +static u_char irq_counts[16]; +static int pirq_cold = 1; + +/* + * Returns true if this pin is enabled with a valid IRQ. Setting the + * register to a reserved IRQ causes interrupts to not be asserted as + * if the pin was disabled. + */ +static bool +pirq_valid_irq(int reg) +{ + + if (reg & PIRQ_DIS) + return (false); + return (IRQ_PERMITTED(reg & PIRQ_IRQ)); +} + +uint8_t +pirq_read(int pin) +{ + + assert(pin > 0 && pin <= nitems(pirqs)); + return (pirqs[pin - 1].reg); +} + +void +pirq_write(struct vmctx *ctx, int pin, uint8_t val) +{ + struct pirq *pirq; + + assert(pin > 0 && pin <= nitems(pirqs)); + pirq = &pirqs[pin - 1]; + pthread_mutex_lock(&pirq->lock); + if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) { + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1); + pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ); + if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg)) + vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1); + } + pthread_mutex_unlock(&pirq->lock); +} + +void +pci_irq_reserve(int irq) +{ + + assert(irq >= 0 && irq < nitems(irq_counts)); + assert(pirq_cold); + assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); + irq_counts[irq] = IRQ_DISABLED; +} + +void +pci_irq_use(int irq) +{ + + assert(irq >= 0 && irq < nitems(irq_counts)); + assert(pirq_cold); + assert(irq_counts[irq] != IRQ_DISABLED); + irq_counts[irq]++; +} + +void +pci_irq_init(struct vmctx *ctx) +{ + int i; + + for (i = 0; i < nitems(pirqs); i++) { + pirqs[i].reg = PIRQ_DIS; + pirqs[i].use_count = 0; + pirqs[i].active_count = 0; + pthread_mutex_init(&pirqs[i].lock, NULL); + } + for (i = 0; i < nitems(irq_counts); i++) { + if (IRQ_PERMITTED(i)) + 
irq_counts[i] = 0; + else + irq_counts[i] = IRQ_DISABLED; + } +} + +void +pci_irq_assert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(pi->pi_lintr.pirq_pin <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count++; + if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) { + vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ, + pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq); +} + +void +pci_irq_deassert(struct pci_devinst *pi) +{ + struct pirq *pirq; + + if (pi->pi_lintr.pirq_pin > 0) { + assert(pi->pi_lintr.pirq_pin <= nitems(pirqs)); + pirq = &pirqs[pi->pi_lintr.pirq_pin - 1]; + pthread_mutex_lock(&pirq->lock); + pirq->active_count--; + if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) { + vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ, + pi->pi_lintr.ioapic_irq); + pthread_mutex_unlock(&pirq->lock); + return; + } + pthread_mutex_unlock(&pirq->lock); + } + vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq); +} + +int +pirq_alloc_pin(struct vmctx *ctx) +{ + int best_count, best_irq, best_pin, irq, pin; + + pirq_cold = 0; + + /* First, find the least-used PIRQ pin. */ + best_pin = 0; + best_count = pirqs[0].use_count; + for (pin = 1; pin < nitems(pirqs); pin++) { + if (pirqs[pin].use_count < best_count) { + best_pin = pin; + best_count = pirqs[pin].use_count; + } + } + pirqs[best_pin].use_count++; + + /* Second, route this pin to an IRQ. 
*/ + if (pirqs[best_pin].reg == PIRQ_DIS) { + best_irq = -1; + best_count = 0; + for (irq = 0; irq < nitems(irq_counts); irq++) { + if (irq_counts[irq] == IRQ_DISABLED) + continue; + if (best_irq == -1 || irq_counts[irq] < best_count) { + best_irq = irq; + best_count = irq_counts[irq]; + } + } + assert(best_irq >= 0); + irq_counts[best_irq]++; + pirqs[best_pin].reg = best_irq; + vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); + } + + return (best_pin + 1); +} + +int +pirq_irq(int pin) +{ + assert(pin > 0 && pin <= nitems(pirqs)); + return (pirqs[pin - 1].reg & PIRQ_IRQ); +} + +/* XXX: Generate $PIR table. */ + +static void +pirq_dsdt(void) +{ + char *irq_prs, *old; + int irq, pin; + + irq_prs = NULL; + for (irq = 0; irq < nitems(irq_counts); irq++) { + if (!IRQ_PERMITTED(irq)) + continue; + if (irq_prs == NULL) + asprintf(&irq_prs, "%d", irq); + else { + old = irq_prs; + asprintf(&irq_prs, "%s,%d", old, irq); + free(old); + } + } + + /* + * A helper method to validate a link register's value. This + * duplicates pirq_valid_irq(). 
+ */ + dsdt_line(""); + dsdt_line("Method (PIRV, 1, NotSerialized)"); + dsdt_line("{"); + dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ); + dsdt_line(" If (LLess (Local0, 0x03))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x08))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" If (LEqual (Local0, 0x0D))"); + dsdt_line(" {"); + dsdt_line(" Return (0x00)"); + dsdt_line(" }"); + dsdt_line(" Return (0x01)"); + dsdt_line("}"); + + for (pin = 0; pin < nitems(pirqs); pin++) { + dsdt_line(""); + dsdt_line("Device (LNK%c)", 'A' + pin); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))"); + dsdt_line(" Name (_UID, 0x%02X)", pin + 1); + dsdt_line(" Method (_STA, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" If (PIRV (PIR%c))", 'A' + pin); + dsdt_line(" {"); + dsdt_line(" Return (0x0B)"); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Return (0x09)"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line(" Name (_PRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {%s}", irq_prs); + dsdt_line(" })"); + dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1); + dsdt_line(" {"); + dsdt_line(" IRQ (Level, ActiveLow, Shared, )"); + dsdt_line(" {}"); + dsdt_line(" })"); + dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)", + pin + 1, 'A' + pin); + dsdt_line(" Method (_CRS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin, + PIRQ_DIS | PIRQ_IRQ); + dsdt_line(" If (PIRV (Local0))"); + dsdt_line(" {"); + dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Else"); + dsdt_line(" {"); + dsdt_line(" Store (0x00, CIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Return (CB%02X)", 
pin + 1); + dsdt_line(" }"); + dsdt_line(" Method (_DIS, 0, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" Store (0x80, PIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line(" Method (_SRS, 1, NotSerialized)"); + dsdt_line(" {"); + dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin); + dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin); + dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin); + dsdt_line(" }"); + dsdt_line("}"); + } + free(irq_prs); +} +LPC_DSDT(pirq_dsdt); diff --git a/bhyve/pci_irq.h b/bhyve/pci_irq.h new file mode 100644 index 0000000..24f9c99 --- /dev/null +++ b/bhyve/pci_irq.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2014 Hudson River Trading LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __PCI_IRQ_H__ +#define __PCI_IRQ_H__ + +struct pci_devinst; + +void pci_irq_assert(struct pci_devinst *pi); +void pci_irq_deassert(struct pci_devinst *pi); +void pci_irq_init(struct vmctx *ctx); +void pci_irq_reserve(int irq); +void pci_irq_use(int irq); +int pirq_alloc_pin(struct vmctx *ctx); +int pirq_irq(int pin); +uint8_t pirq_read(int pin); +void pirq_write(struct vmctx *ctx, int pin, uint8_t val); + +#endif diff --git a/bhyve/pci_lpc.c b/bhyve/pci_lpc.c new file mode 100644 index 0000000..e98b141 --- /dev/null +++ b/bhyve/pci_lpc.c @@ -0,0 +1,429 @@ +/*- + * Copyright (c) 2013 Neel Natu + * Copyright (c) 2013 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include + +#include "acpi.h" +#include "inout.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "uart_emul.h" + +#define IO_ICU1 0x20 +#define IO_ICU2 0xA0 + +SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt); +SET_DECLARE(lpc_sysres_set, struct lpc_sysres); + +#define ELCR_PORT 0x4d0 +SYSRES_IO(ELCR_PORT, 2); + +#define IO_TIMER1_PORT 0x40 + +#define NMISC_PORT 0x61 +SYSRES_IO(NMISC_PORT, 1); + +static struct pci_devinst *lpc_bridge; + +#define LPC_UART_NUM 2 +static struct lpc_uart_softc { + struct uart_softc *uart_softc; + const char *opts; + int iobase; + int irq; + int enabled; +} lpc_uart_softc[LPC_UART_NUM]; + +static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" }; + +/* + * LPC device configuration is in the following form: + * [,] + * For e.g. 
"com1,stdio" + */ +int +lpc_device_parse(const char *opts) +{ + int unit, error; + char *str, *cpy, *lpcdev; + + error = -1; + str = cpy = strdup(opts); + lpcdev = strsep(&str, ","); + if (lpcdev != NULL) { + for (unit = 0; unit < LPC_UART_NUM; unit++) { + if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { + lpc_uart_softc[unit].opts = str; + error = 0; + goto done; + } + } + } + +done: + if (error) + free(cpy); + + return (error); +} + +static void +lpc_uart_intr_assert(void *arg) +{ + struct lpc_uart_softc *sc = arg; + + assert(sc->irq >= 0); + + vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); +} + +static void +lpc_uart_intr_deassert(void *arg) +{ + /* + * The COM devices on the LPC bus generate edge triggered interrupts, + * so nothing more to do here. + */ +} + +static int +lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int offset; + struct lpc_uart_softc *sc = arg; + + offset = port - sc->iobase; + + switch (bytes) { + case 1: + if (in) + *eax = uart_read(sc->uart_softc, offset); + else + uart_write(sc->uart_softc, offset, *eax); + break; + case 2: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + } + break; + default: + return (-1); + } + + return (0); +} + +static int +lpc_init(void) +{ + struct lpc_uart_softc *sc; + struct inout_port iop; + const char *name; + int unit, error; + + /* COM1 and COM2 */ + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + name = lpc_uart_names[unit]; + + if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { + fprintf(stderr, "Unable to allocate resources for " + "LPC device %s\n", name); + return (-1); + } + pci_irq_reserve(sc->irq); + + sc->uart_softc = uart_init(lpc_uart_intr_assert, + lpc_uart_intr_deassert, sc); + + if 
(uart_set_backend(sc->uart_softc, sc->opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' " + "for LPC device %s\n", sc->opts, name); + return (-1); + } + + bzero(&iop, sizeof(struct inout_port)); + iop.name = name; + iop.port = sc->iobase; + iop.size = UART_IO_BAR_SIZE; + iop.flags = IOPORT_F_INOUT; + iop.handler = lpc_uart_io_handler; + iop.arg = sc; + + error = register_inout(&iop); + assert(error == 0); + sc->enabled = 1; + } + + return (0); +} + +static void +pci_lpc_write_dsdt(struct pci_devinst *pi) +{ + struct lpc_dsdt **ldpp, *ldp; + + dsdt_line(""); + dsdt_line("Device (ISA)"); + dsdt_line("{"); + dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); + dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); + dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); + dsdt_line(" {"); + dsdt_line(" Offset (0x60),"); + dsdt_line(" PIRA, 8,"); + dsdt_line(" PIRB, 8,"); + dsdt_line(" PIRC, 8,"); + dsdt_line(" PIRD, 8,"); + dsdt_line(" Offset (0x68),"); + dsdt_line(" PIRE, 8,"); + dsdt_line(" PIRF, 8,"); + dsdt_line(" PIRG, 8,"); + dsdt_line(" PIRH, 8"); + dsdt_line(" }"); + dsdt_line(""); + + dsdt_indent(1); + SET_FOREACH(ldpp, lpc_dsdt_set) { + ldp = *ldpp; + ldp->handler(); + } + + dsdt_line(""); + dsdt_line("Device (PIC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_ICU1, 2); + dsdt_fixed_ioport(IO_ICU2, 2); + dsdt_fixed_irq(2); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + + dsdt_line(""); + dsdt_line("Device (TIMR)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_TIMER1_PORT, 4); + dsdt_fixed_irq(0); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + dsdt_unindent(1); + + dsdt_line("}"); +} + +static void +pci_lpc_sysres_dsdt(void) +{ + 
struct lpc_sysres **lspp, *lsp; + + dsdt_line(""); + dsdt_line("Device (SIO)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + + dsdt_indent(2); + SET_FOREACH(lspp, lpc_sysres_set) { + lsp = *lspp; + switch (lsp->type) { + case LPC_SYSRES_IO: + dsdt_fixed_ioport(lsp->base, lsp->length); + break; + case LPC_SYSRES_MEM: + dsdt_fixed_mem32(lsp->base, lsp->length); + break; + } + } + dsdt_unindent(2); + + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(pci_lpc_sysres_dsdt); + +static void +pci_lpc_uart_dsdt(void) +{ + struct lpc_uart_softc *sc; + int unit; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = &lpc_uart_softc[unit]; + if (!sc->enabled) + continue; + dsdt_line(""); + dsdt_line("Device (%s)", lpc_uart_names[unit]); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); + dsdt_line(" Name (_UID, %d)", unit + 1); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); + dsdt_fixed_irq(sc->irq); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); + } +} +LPC_DSDT(pci_lpc_uart_dsdt); + +static int +pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int pirq_pin; + + if (bytes == 1) { + pirq_pin = 0; + if (coff >= 0x60 && coff <= 0x63) + pirq_pin = coff - 0x60 + 1; + if (coff >= 0x68 && coff <= 0x6b) + pirq_pin = coff - 0x68 + 5; + if (pirq_pin != 0) { + pirq_write(ctx, pirq_pin, val); + pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); + return (0); + } + } + return (-1); +} + +static void +pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ +} + +static uint64_t +pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + return (0); +} + +#define LPC_DEV 0x7000 +#define LPC_VENDOR 
0x8086
+
+/*
+ * pe_init callback for the "lpc" device.
+ *
+ * Refuses a second LPC bridge and any bridge that is not on bus 0, runs
+ * lpc_init(), fills in the device/vendor/class config registers and
+ * finally records 'pi' in lpc_bridge.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int
+pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+	/*
+	 * Do not allow more than one LPC bridge to be configured.
+	 */
+	if (lpc_bridge != NULL) {
+		fprintf(stderr, "Only one LPC bridge is allowed.\n");
+		return (-1);
+	}
+
+	/*
+	 * Enforce that the LPC can only be configured on bus 0. This
+	 * simplifies the ACPI DSDT because it can provide a decode for
+	 * all legacy i/o ports behind bus 0.
+	 */
+	if (pi->pi_bus != 0) {
+		fprintf(stderr, "LPC bridge can be present only on bus 0.\n");
+		return (-1);
+	}
+
+	if (lpc_init() != 0)
+		return (-1);
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA);
+
+	lpc_bridge = pi;
+
+	return (0);
+}
+
+/*
+ * Build the ACPI path of the PIRQ link device for 'pin'.
+ *
+ * 'pin' is 1-based: pin 1 yields "LNKA", pin 2 "LNKB", and so on.  The
+ * trailing ',' is part of the returned string.
+ *
+ * Returns a string allocated by asprintf() that the caller must free(),
+ * or NULL if no LPC bridge has been configured or the allocation failed.
+ */
+char *
+lpc_pirq_name(int pin)
+{
+	char *name;
+
+	if (lpc_bridge == NULL)
+		return (NULL);
+
+	/*
+	 * The contents of 'name' are not guaranteed after a failed
+	 * asprintf(), so never hand it back to the caller in that case.
+	 */
+	if (asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1) < 0)
+		return (NULL);
+
+	return (name);
+}
+
+/*
+ * Refresh the PIRQ routing registers in the LPC bridge's config space
+ * from pirq_read(): pins 1-4 go to offsets 0x60-0x63 and pins 5-8 to
+ * offsets 0x68-0x6b.  Does nothing if no LPC bridge has been configured.
+ */
+void
+lpc_pirq_routed(void)
+{
+	int pin;
+
+	if (lpc_bridge == NULL)
+		return;
+
+	for (pin = 0; pin < 4; pin++)
+		pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1));
+	for (pin = 0; pin < 4; pin++)
+		pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5));
+}
+
+/* Device emulation descriptor registered under the name "lpc". */
+struct pci_devemu pci_de_lpc = {
+	.pe_emu =	"lpc",
+	.pe_init =	pci_lpc_init,
+	.pe_write_dsdt = pci_lpc_write_dsdt,
+	.pe_cfgwrite =	pci_lpc_cfgwrite,
+	.pe_barwrite =	pci_lpc_write,
+	.pe_barread =	pci_lpc_read
+};
+PCI_EMUL_SET(pci_de_lpc);
diff --git a/bhyve/pci_lpc.h b/bhyve/pci_lpc.h
new file mode 100644
index 0000000..55a5865
--- /dev/null
+++ b/bhyve/pci_lpc.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2013 Neel Natu
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _LPC_H_ +#define _LPC_H_ + +#include + +typedef void (*lpc_write_dsdt_t)(void); + +struct lpc_dsdt { + lpc_write_dsdt_t handler; +}; + +#define LPC_DSDT(handler) \ + static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \ + (handler), \ + }; \ + DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__)) + +enum lpc_sysres_type { + LPC_SYSRES_IO, + LPC_SYSRES_MEM +}; + +struct lpc_sysres { + enum lpc_sysres_type type; + uint32_t base; + uint32_t length; +}; + +#define LPC_SYSRES(type, base, length) \ + static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \ + (type), \ + (base), \ + (length) \ + }; \ + DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__)) + +#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length) +#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) + +int lpc_device_parse(const char *opt); +char *lpc_pirq_name(int pin); +void lpc_pirq_routed(void); + +#endif diff --git a/bhyve/pci_passthru.c b/bhyve/pci_passthru.c new file mode 100644 index 0000000..04d68c4 --- /dev/null +++ b/bhyve/pci_passthru.c @@ -0,0 +1,790 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include "pci_emul.h" +#include "mem.h" + +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +#ifndef _PATH_DEVIO +#define _PATH_DEVIO "/dev/io" +#endif + +#define LEGACY_SUPPORT 1 + +#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) +#define MSIX_CAPLEN 12 + +static int pcifd = -1; +static int iofd = -1; + +struct passthru_softc { + struct pci_devinst *psc_pi; + struct pcibar psc_bar[PCI_BARMAX + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct { + int capoff; + } psc_msix; + struct pcisel psc_sel; +}; + +static int +msi_caplen(int msgctrl) +{ + int len; + + len = 10; /* minimum length of msi capability */ + + if (msgctrl & PCIM_MSICTRL_64BIT) + len += 4; + +#if 0 + /* + * Ignore the 'mask' and 'pending' bits in the MSI capability. + * We'll let the guest manipulate them directly. 
+ */ + if (msgctrl & PCIM_MSICTRL_VECTOR) + len += 10; +#endif + + return (len); +} + +static uint32_t +read_config(const struct pcisel *sel, long reg, int width) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); /* XXX */ + else + return (pi.pi_data); +} + +static void +write_config(const struct pcisel *sel, long reg, int width, uint32_t data) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + pi.pi_data = data; + + (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ +} + +#ifdef LEGACY_SUPPORT +static int +passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) +{ + int capoff, i; + struct msicap msicap; + u_char *capdata; + + pci_populate_msicap(&msicap, msgnum, nextptr); + + /* + * XXX + * Copy the msi capability structure in the last 16 bytes of the + * config space. This is wrong because it could shadow something + * useful to the device. + */ + capoff = 256 - roundup(sizeof(msicap), 4); + capdata = (u_char *)&msicap; + for (i = 0; i < sizeof(msicap); i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + return (capoff); +} +#endif /* LEGACY_SUPPORT */ + +static int +cfginitmsi(struct passthru_softc *sc) +{ + int i, ptr, capptr, cap, sts, caplen, table_size; + uint32_t u32; + struct pcisel sel; + struct pci_devinst *pi; + struct msixcap msixcap; + uint32_t *msixcap_ptr; + + pi = sc->psc_pi; + sel = sc->psc_sel; + + /* + * Parse the capabilities and cache the location of the MSI + * and MSI-X capabilities. 
+ */ + sts = read_config(&sel, PCIR_STATUS, 2); + if (sts & PCIM_STATUS_CAPPRESENT) { + ptr = read_config(&sel, PCIR_CAP_PTR, 1); + while (ptr != 0 && ptr != 0xff) { + cap = read_config(&sel, ptr + PCICAP_ID, 1); + if (cap == PCIY_MSI) { + /* + * Copy the MSI capability into the config + * space of the emulated pci device + */ + sc->psc_msi.capoff = ptr; + sc->psc_msi.msgctrl = read_config(&sel, + ptr + 2, 2); + sc->psc_msi.emulated = 0; + caplen = msi_caplen(sc->psc_msi.msgctrl); + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + } + } else if (cap == PCIY_MSIX) { + /* + * Copy the MSI-X capability + */ + sc->psc_msix.capoff = ptr; + caplen = 12; + msixcap_ptr = (uint32_t*) &msixcap; + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + *msixcap_ptr = u32; + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + msixcap_ptr++; + } + } + ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); + } + } + + if (sc->psc_msix.capoff != 0) { + pi->pi_msix.pba_bar = + msixcap.pba_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.pba_offset = + msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_bar = + msixcap.table_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_offset = + msixcap.table_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); + + /* Allocate the emulated MSI-X table array */ + table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* Mask all table entries */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + pi->pi_msix.table[i].vector_control |= + PCIM_MSIX_VCTRL_MASK; + } + } + +#ifdef LEGACY_SUPPORT + /* + * If the passthrough device does not support MSI then craft a + * MSI capability for it. We link the new MSI capability at the + * head of the list of capabilities. 
+ */ + if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { + int origptr, msiptr; + origptr = read_config(&sel, PCIR_CAP_PTR, 1); + msiptr = passthru_add_msicap(pi, 1, origptr); + sc->psc_msi.capoff = msiptr; + sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); + sc->psc_msi.emulated = 1; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); + } +#endif + + /* Make sure one of the capabilities is present */ + if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) + return (-1); + else + return (0); +} + +static uint64_t +msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *src8; + uint16_t *src16; + uint32_t *src32; + uint64_t *src64; + uint64_t data; + size_t entry_offset; + int index; + + pi = sc->psc_pi; + if (offset < pi->pi_msix.table_offset) + return (-1); + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return (-1); + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + switch(size) { + case 1: + src8 = (uint8_t *)((void *)entry + entry_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)((void *)entry + entry_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)((void *)entry + entry_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)((void *)entry + entry_offset); + data = *src64; + break; + default: + return (-1); + } + + return (data); +} + +static void +msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, + uint64_t offset, int size, uint64_t data) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint32_t *dest; + size_t entry_offset; + uint32_t vector_control; + int error, index; + + pi = sc->psc_pi; + if (offset < pi->pi_msix.table_offset) + return; + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= 
pi->pi_msix.table_count) + return; + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* Only 4 byte naturally-aligned writes are supported */ + assert(size == 4); + assert(entry_offset % 4 == 0); + + vector_control = entry->vector_control; + dest = (uint32_t *)((void *)entry + entry_offset); + *dest = data; + /* If MSI-X hasn't been enabled, do nothing */ + if (pi->pi_msix.enabled) { + /* If the entry is masked, don't set it up */ + if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || + (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + error = vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, index, entry->addr, + entry->msg_data, entry->vector_control); + } + } +} + +static int +init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) +{ + int b, s, f; + int error, idx; + size_t len, remaining; + uint32_t table_size, table_offset; + uint32_t pba_size, pba_offset; + vm_paddr_t start; + struct pci_devinst *pi = sc->psc_pi; + + assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); + + b = sc->psc_sel.pc_bus; + s = sc->psc_sel.pc_dev; + f = sc->psc_sel.pc_func; + + /* + * If the MSI-X table BAR maps memory intended for + * other uses, it is at least assured that the table + * either resides in its own page within the region, + * or it resides in a page shared with only the PBA. 
+ */ + table_offset = rounddown2(pi->pi_msix.table_offset, 4096); + + table_size = pi->pi_msix.table_offset - table_offset; + table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + table_size = roundup2(table_size, 4096); + + if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { + pba_offset = pi->pi_msix.pba_offset; + pba_size = pi->pi_msix.pba_size; + if (pba_offset >= table_offset + table_size || + table_offset >= pba_offset + pba_size) { + /* + * The PBA can reside in the same BAR as the MSI-x + * tables as long as it does not overlap with any + * naturally aligned page occupied by the tables. + */ + } else { + /* Need to also emulate the PBA, not supported yet */ + printf("Unsupported MSI-X configuration: %d/%d/%d\n", + b, s, f); + return (-1); + } + } + + idx = pi->pi_msix.table_bar; + start = pi->pi_bar[idx].addr; + remaining = pi->pi_bar[idx].size; + + /* Map everything before the MSI-X table */ + if (table_offset > 0) { + len = table_offset; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + + base += len; + start += len; + remaining -= len; + } + + /* Skip the MSI-X table */ + base += table_size; + start += table_size; + remaining -= table_size; + + /* Map everything beyond the end of the MSI-X table */ + if (remaining > 0) { + len = remaining; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + } + + return (0); +} + +static int +cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) +{ + int i, error; + struct pci_devinst *pi; + struct pci_bar_io bar; + enum pcibar_type bartype; + uint64_t base, size; + + pi = sc->psc_pi; + + /* + * Initialize BAR registers + */ + for (i = 0; i <= PCI_BARMAX; i++) { + bzero(&bar, sizeof(bar)); + bar.pbi_sel = sc->psc_sel; + bar.pbi_reg = PCIR_BAR(i); + + if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) + continue; + + if (PCI_BAR_IO(bar.pbi_base)) { + bartype = PCIBAR_IO; + base = bar.pbi_base & PCIM_BAR_IO_BASE; + } else { + switch 
(bar.pbi_base & PCIM_BAR_MEM_TYPE) { + case PCIM_BAR_MEM_64: + bartype = PCIBAR_MEM64; + break; + default: + bartype = PCIBAR_MEM32; + break; + } + base = bar.pbi_base & PCIM_BAR_MEM_BASE; + } + size = bar.pbi_length; + + if (bartype != PCIBAR_IO) { + if (((base | size) & PAGE_MASK) != 0) { + printf("passthru device %d/%d/%d BAR %d: " + "base %#lx or size %#lx not page aligned\n", + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, base, size); + return (-1); + } + } + + /* Cache information about the "real" BAR */ + sc->psc_bar[i].type = bartype; + sc->psc_bar[i].size = size; + sc->psc_bar[i].addr = base; + + /* Allocate the BAR in the guest I/O or MMIO space */ + error = pci_emul_alloc_pbar(pi, i, base, bartype, size); + if (error) + return (-1); + + /* The MSI-X table needs special handling */ + if (i == pci_msix_table_bar(pi)) { + error = init_msix_table(ctx, sc, base); + if (error) + return (-1); + } else if (bartype != PCIBAR_IO) { + /* Map the physical BAR in the guest MMIO space */ + error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + if (error) + return (-1); + } + + /* + * 64-bit BAR takes up two slots so skip the next one. 
+		 */
+		if (bartype == PCIBAR_MEM64) {
+			i++;
+			assert(i <= PCI_BARMAX);
+			sc->psc_bar[i].type = PCIBAR_MEMHI64;
+		}
+	}
+	return (0);
+}
+
+/*
+ * Record the host bus/slot/func selector in the softc hanging off
+ * pi->pi_arg, then set up the MSI/MSI-X capabilities and the BARs.
+ *
+ * Returns 0 on success and 1 if either step fails.
+ */
+static int
+cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
+{
+	int error;
+	struct passthru_softc *sc;
+
+	error = 1;
+	sc = pi->pi_arg;
+
+	bzero(&sc->psc_sel, sizeof(struct pcisel));
+	sc->psc_sel.pc_bus = bus;
+	sc->psc_sel.pc_dev = slot;
+	sc->psc_sel.pc_func = func;
+
+	if (cfginitmsi(sc) != 0)
+		goto done;
+
+	if (cfginitbar(ctx, sc) != 0)
+		goto done;
+
+	error = 0;				/* success */
+done:
+	return (error);
+}
+
+/*
+ * pe_init callback for the "passthru" device.
+ *
+ * Opens _PATH_DEVPCI and _PATH_DEVIO on first use, parses 'opts' as
+ * decimal "bus/slot/func", assigns that host device to the VM, allocates
+ * the softc and initialises the emulated config space via cfginit().
+ *
+ * Returns 0 on success and non-zero on failure.  On failure the softc is
+ * released and the device is unassigned again, but only if this call had
+ * actually assigned it.
+ */
+static int
+passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	int bus, slot, func, error, assigned;
+	struct passthru_softc *sc;
+
+	sc = NULL;
+	error = 1;
+	assigned = 0;
+
+	if (pcifd < 0) {
+		pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
+		if (pcifd < 0)
+			goto done;
+	}
+
+	if (iofd < 0) {
+		iofd = open(_PATH_DEVIO, O_RDWR, 0);
+		if (iofd < 0)
+			goto done;
+	}
+
+	if (opts == NULL ||
+	    sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
+		goto done;
+
+	if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
+		goto done;
+	assigned = 1;
+
+	sc = calloc(1, sizeof(struct passthru_softc));
+	if (sc == NULL)
+		goto done;
+
+	pi->pi_arg = sc;
+	sc->psc_pi = pi;
+
+	/* initialize config space */
+	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
+		goto done;
+
+	error = 0;		/* success */
+done:
+	if (error) {
+		if (sc != NULL) {
+			/* Do not leave a dangling softc pointer behind. */
+			pi->pi_arg = NULL;
+			free(sc);
+		}
+		/*
+		 * bus/slot/func are only valid, and the device only needs
+		 * to be handed back, once vm_assign_pptdev() has succeeded.
+		 */
+		if (assigned)
+			vm_unassign_pptdev(ctx, bus, slot, func);
+	}
+	return (error);
+}
+
+/*
+ * Return 1 if config space offset 'coff' lies within the BAR registers,
+ * i.e. in [PCIR_BAR(0), PCIR_BAR(PCI_BARMAX + 1)), and 0 otherwise.
+ */
+static int
+bar_access(int coff)
+{
+	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Return 1 if config space offset 'coff' lies within the cached MSI
+ * capability, whose length is derived from the cached message control
+ * word.  Returns 0 otherwise, or if no MSI capability offset is cached.
+ */
+static int
+msicap_access(struct passthru_softc *sc, int coff)
+{
+	int caplen;
+
+	if (sc->psc_msi.capoff == 0)
+		return (0);
+
+	caplen = msi_caplen(sc->psc_msi.msgctrl);
+
+	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
+		return (1);
+	else
+		return (0);
+}
+
+static int
+msixcap_access(struct passthru_softc *sc, int coff)
+{
+	if (sc->psc_msix.capoff == 0)
+		return (0);
+
+	return (coff >=
sc->psc_msix.capoff && + coff < sc->psc_msix.capoff + MSIX_CAPLEN); +} + +static int +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs and MSI capability is emulated. + */ + if (bar_access(coff) || msicap_access(sc, coff)) + return (-1); + +#ifdef LEGACY_SUPPORT + /* + * Emulate PCIR_CAP_PTR if this device does not support MSI capability + * natively. + */ + if (sc->psc_msi.emulated) { + if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) + return (-1); + } +#endif + + /* Everything else just read from the device's config space */ + *rv = read_config(&sc->psc_sel, coff, bytes); + + return (0); +} + +static int +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int error, msix_table_entries, i; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs are emulated + */ + if (bar_access(coff)) + return (-1); + + /* + * MSI capability is emulated + */ + if (msicap_access(sc, coff)) { + msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); + + error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_msi.addr, pi->pi_msi.msg_data, + pi->pi_msi.maxmsgnum); + if (error != 0) { + printf("vm_setup_pptdev_msi error %d\r\n", errno); + exit(1); + } + return (0); + } + + if (msixcap_access(sc, coff)) { + msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); + if (pi->pi_msix.enabled) { + msix_table_entries = pi->pi_msix.table_count; + for (i = 0; i < msix_table_entries; i++) { + error = vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, + pi->pi_msix.table[i].addr, + pi->pi_msix.table[i].msg_data, + pi->pi_msix.table[i].vector_control); + + if (error) { + printf("vm_setup_pptdev_msix error " + "%d\r\n", errno); + exit(1); + } + } + } + return (0); + } + +#ifdef 
LEGACY_SUPPORT + /* + * If this device does not support MSI natively then we cannot let + * the guest disable legacy interrupts from the device. It is the + * legacy interrupt that is triggering the virtual MSI to the guest. + */ + if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { + if (coff == PCIR_COMMAND && bytes == 2) + val &= ~PCIM_CMD_INTxDIS; + } +#endif + + write_config(&sc->psc_sel, coff, bytes, val); + + return (0); +} + +static void +passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + msix_table_write(ctx, vcpu, sc, offset, size, value); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_WRITE; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = value; + + (void)ioctl(iofd, IODEV_PIO, &pio); + } +} + +static uint64_t +passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + uint64_t val; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + val = msix_table_read(sc, offset, size); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_READ; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = 0; + + (void)ioctl(iofd, IODEV_PIO, &pio); + + val = pio.val; + } + + return (val); +} + +struct pci_devemu passthru = { + .pe_emu = "passthru", + .pe_init = passthru_init, + .pe_cfgwrite = passthru_cfgwrite, + .pe_cfgread = passthru_cfgread, + .pe_barwrite = passthru_write, + .pe_barread = passthru_read, +}; +PCI_EMUL_SET(passthru); diff --git a/bhyve/pci_uart.c b/bhyve/pci_uart.c new file mode 100644 index 0000000..21b93bf --- /dev/null +++ 
b/bhyve/pci_uart.c @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "uart_emul.h" + +/* + * Pick a PCI vid/did of a chip with a single uart at + * BAR0, that most versions of FreeBSD can understand: + * Siig CyberSerial 1-port. 
+ */
+#define COM_VENDOR	0x131f
+#define COM_DEV		0x2000
+
+/*
+ * Interrupt callback given to uart_init(): raise the legacy interrupt
+ * of the PCI device instance passed as 'arg'.
+ */
+static void
+pci_uart_intr_assert(void *arg)
+{
+
+	pci_lintr_assert((struct pci_devinst *)arg);
+}
+
+/*
+ * Interrupt callback given to uart_init(): lower the legacy interrupt
+ * of the PCI device instance passed as 'arg'.
+ */
+static void
+pci_uart_intr_deassert(void *arg)
+{
+
+	pci_lintr_deassert((struct pci_devinst *)arg);
+}
+
+/*
+ * BAR write handler.  Only single-byte accesses to BAR 0 are accepted;
+ * the value is forwarded to the uart softc stored in pi->pi_arg.
+ */
+static void
+pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+	       int baridx, uint64_t offset, int size, uint64_t value)
+{
+
+	assert(baridx == 0);
+	assert(size == 1);
+
+	uart_write(pi->pi_arg, offset, value);
+}
+
+/*
+ * BAR read handler.  Only single-byte accesses to BAR 0 are accepted;
+ * the register value is narrowed to 8 bits before being returned.
+ */
+uint64_t
+pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+	      int baridx, uint64_t offset, int size)
+{
+	uint8_t regval;
+
+	assert(baridx == 0);
+	assert(size == 1);
+
+	regval = uart_read(pi->pi_arg, offset);
+
+	return ((uint64_t)regval);
+}
+
+/*
+ * Device-model init: allocate the I/O BAR, request a legacy interrupt,
+ * fill in the PCI identity and attach the uart backend named by 'opts'.
+ * Returns 0 on success, -1 if the backend could not be set up.
+ */
+static int
+pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	struct uart_softc *uart;
+
+	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE);
+	pci_lintr_request(pi);
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
+
+	uart = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi);
+	pi->pi_arg = uart;
+
+	if (uart_set_backend(uart, opts) == 0)
+		return (0);
+
+	fprintf(stderr, "Unable to initialize backend '%s' for "
+	    "pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func);
+	return (-1);
+}
+
+struct pci_devemu pci_de_com = {
+	.pe_emu =	"uart",
+	.pe_init =	pci_uart_init,
+	.pe_barwrite =	pci_uart_write,
+	.pe_barread =	pci_uart_read
+};
+PCI_EMUL_SET(pci_de_com);
diff --git a/bhyve/pci_virtio_block.c b/bhyve/pci_virtio_block.c
new file mode 100644
index 0000000..8500be6
--- /dev/null
+++ b/bhyve/pci_virtio_block.c
@@ -0,0 +1,410 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+#include "block_if.h"
+
+#define VTBLK_RINGSZ	64
+
+#define VTBLK_S_OK	0
+#define VTBLK_S_IOERR	1
+#define	VTBLK_S_UNSUPP	2
+
+#define	VTBLK_BLK_ID_BYTES	20
+
+/* Capability bits */
+#define	VTBLK_F_SEG_MAX		(1 << 2)	/* Maximum request segments */
+#define	VTBLK_F_BLK_SIZE	(1 << 6)	/* cfg block size valid */
+#define	VTBLK_F_FLUSH		(1 << 9)	/* Cache flush support */
+#define	VTBLK_F_TOPOLOGY	(1 << 10)	/* Optimal I/O alignment */
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS      \
+  ( VTBLK_F_SEG_MAX  |						    \
+    VTBLK_F_BLK_SIZE |						    \
+    VTBLK_F_FLUSH    |						    \
+    VTBLK_F_TOPOLOGY |						    \
+    VIRTIO_RING_F_INDIRECT_DESC )	/* indirect descriptors */
+
+/*
+ * Config space "registers"
+ */
+struct vtblk_config {
+	uint64_t	vbc_capacity;
+	uint32_t	vbc_size_max;
+	uint32_t	vbc_seg_max;
+	struct {
+		uint16_t cylinders;
+		uint8_t heads;
+		uint8_t sectors;
+	} vbc_geometry;
+	uint32_t	vbc_blk_size;
+	struct {
+		uint8_t physical_block_exp;
+		uint8_t alignment_offset;
+		uint16_t min_io_size;
+		uint32_t opt_io_size;
+	} vbc_topology;
+	uint8_t		vbc_writeback;
+} __packed;
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define	VBH_OP_READ		0
+#define	VBH_OP_WRITE		1
+#define	VBH_OP_FLUSH		4
+#define	VBH_OP_FLUSH_OUT	5
+#define	VBH_OP_IDENT		8
+#define	VBH_FLAG_BARRIER	0x80000000	/* OR'ed into vbh_type */
+	uint32_t       	vbh_type;
+	uint32_t	vbh_ioprio;
+	uint64_t	vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) if (pci_vtblk_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * One in-flight request.  There is one slot per ring entry, selected
+ * by the head descriptor index of the chain being processed.
+ */
+struct pci_vtblk_ioreq {
+	struct blockif_req		io_req;
+	struct pci_vtblk_softc	       *io_sc;
+	uint8_t			       *io_status;
+	uint16_t			io_idx;
+};
+
+/*
+ * Per-device softc
+ */
+struct pci_vtblk_softc {
+	struct virtio_softc vbsc_vs;
+	pthread_mutex_t vsc_mtx;
+	struct vqueue_info vbsc_vq;
+	struct vtblk_config vbsc_cfg;
+	struct blockif_ctxt *bc;
+	/* exactly VTBLK_BLK_ID_BYTES characters, NOT NUL-terminated */
+	char vbsc_ident[VTBLK_BLK_ID_BYTES];
+	struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
+};
+
+static void pci_vtblk_reset(void *);
+static void pci_vtblk_notify(void *, struct vqueue_info *);
+static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
+static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtblk_vi_consts = {
+	"vtblk",		/* our name */
+	1,			/* we support 1 virtqueue */
+	sizeof(struct vtblk_config), /* config reg size */
+	pci_vtblk_reset,	/* reset */
+	pci_vtblk_notify,	/* device-wide qnotify */
+	pci_vtblk_cfgread,	/* read PCI config */
+	pci_vtblk_cfgwrite,	/* write PCI config */
+	NULL,			/* apply negotiated features */
+	VTBLK_S_HOSTCAPS,	/* our capabilities */
+};
+
+/*
+ * virtio reset callback: hand the reset to the common virtio code.
+ */
+static void
+pci_vtblk_reset(void *vsc)
+{
+	struct pci_vtblk_softc *sc = vsc;
+
+	DPRINTF(("vtblk: device reset requested !\n"));
+	vi_reset_dev(&sc->vbsc_vs);
+}
+
+/*
+ * Completion callback for a block request (also called directly for
+ * requests that are answered without going to the block layer).
+ * Stores the virtio status byte and releases the chain to the guest.
+ */
+static void
+pci_vtblk_done(struct blockif_req *br, int err)
+{
+	struct pci_vtblk_ioreq *io = br->br_param;
+	struct pci_vtblk_softc *sc = io->io_sc;
+
+	/* convert errno into a virtio block error return */
+	if (err == EOPNOTSUPP || err == ENOSYS)
+		*io->io_status = VTBLK_S_UNSUPP;
+	else if (err != 0)
+		*io->io_status = VTBLK_S_IOERR;
+	else
+		*io->io_status = VTBLK_S_OK;
+
+	/*
+	 * Return the descriptor chain to the guest.
+	 * We wrote 1 byte (our status) into it.
+	 */
+	pthread_mutex_lock(&sc->vsc_mtx);
+	vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
+	vq_endchains(&sc->vbsc_vq, 0);
+	pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+/*
+ * Pull one descriptor chain off the queue, decode the request header
+ * and dispatch it to the block layer (or answer it inline for IDENT
+ * and unsupported operations).
+ */
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
+{
+	struct virtio_blk_hdr *vbh;
+	struct pci_vtblk_ioreq *io;
+	int i, n;
+	int err;
+	ssize_t iolen;
+	int writeop, type;
+	struct iovec iov[BLOCKIF_IOV_MAX + 2];
+	uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
+
+	n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags);
+
+	/*
+	 * The first descriptor will be the read-only fixed header,
+	 * and the last is for status (hence +2 above and below).
+	 * The remaining iov's are the actual data I/O vectors.
+	 *
+	 * XXX - note - this fails on crash dump, which does a
+	 * VIRTIO_BLK_T_FLUSH with a zero transfer length
+	 */
+	assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
+
+	io = &sc->vbsc_ios[idx];
+	assert((flags[0] & VRING_DESC_F_WRITE) == 0);
+	assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+	vbh = iov[0].iov_base;
+	memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
+	io->io_req.br_iovcnt = n - 2;
+	io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE;
+	/* after this, n indexes the status descriptor */
+	io->io_status = iov[--n].iov_base;
+	assert(iov[n].iov_len == 1);
+	assert(flags[n] & VRING_DESC_F_WRITE);
+
+	/*
+	 * XXX
+	 * The guest should not be setting the BARRIER flag because
+	 * we don't advertise the capability.
+	 */
+	type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
+	writeop = (type == VBH_OP_WRITE);
+
+	iolen = 0;
+	for (i = 1; i < n; i++) {
+		/*
+		 * - write op implies read-only descriptor,
+		 * - read/ident op implies write-only descriptor,
+		 * therefore test the inverse of the descriptor bit
+		 * to the op.
+		 */
+		assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
+		iolen += iov[i].iov_len;
+	}
+	io->io_req.br_resid = iolen;
+
+	/*
+	 * Print the offset actually submitted; the former local
+	 * 'offset' was never assigned before being printed.
+	 */
+	DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r",
+		 writeop ? "write" : "read/ident", iolen, i - 1,
+		 (long)io->io_req.br_offset));
+
+	switch (type) {
+	case VBH_OP_READ:
+		err = blockif_read(sc->bc, &io->io_req);
+		break;
+	case VBH_OP_WRITE:
+		err = blockif_write(sc->bc, &io->io_req);
+		break;
+	case VBH_OP_FLUSH:
+	case VBH_OP_FLUSH_OUT:
+		err = blockif_flush(sc->bc, &io->io_req);
+		break;
+	case VBH_OP_IDENT:
+		/* Assume a single buffer */
+		/* S/n equal to buffer is not zero-terminated. */
+		memset(iov[1].iov_base, 0, iov[1].iov_len);
+		memcpy(iov[1].iov_base, sc->vbsc_ident,
+		    MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+		pci_vtblk_done(&io->io_req, 0);
+		return;
+	default:
+		pci_vtblk_done(&io->io_req, EOPNOTSUPP);
+		return;
+	}
+	assert(err == 0);
+}
+
+/*
+ * Queue notify callback: process every chain the guest has posted.
+ */
+static void
+pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtblk_softc *sc = vsc;
+
+	while (vq_has_descs(vq))
+		pci_vtblk_proc(sc, vq);
+}
+
+/*
+ * Device-model init.  'opts' names the backing store and is required.
+ * Returns 0 on success and 1 on any failure.
+ */
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	char bident[sizeof("XX:X:X")];
+	char ident[VTBLK_BLK_ID_BYTES + 1];
+	struct blockif_ctxt *bctxt;
+	MD5_CTX mdctx;
+	u_char digest[16];
+	struct pci_vtblk_softc *sc;
+	off_t size;
+	int i, sectsz, sts, sto;
+
+	if (opts == NULL) {
+		printf("virtio-block: backing device required\n");
+		return (1);
+	}
+
+	/*
+	 * The supplied backing file has to exist
+	 */
+	snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
+	bctxt = blockif_open(opts, bident);
+	if (bctxt == NULL) {
+		perror("Could not open backing file");
+		return (1);
+	}
+
+	size = blockif_size(bctxt);
+	sectsz = blockif_sectsz(bctxt);
+	blockif_psectsz(bctxt, &sts, &sto);
+
+	sc = calloc(1, sizeof(struct pci_vtblk_softc));
+	if (sc == NULL) {
+		perror("virtio-block: could not allocate softc");
+		blockif_close(bctxt);
+		return (1);
+	}
+	sc->bc = bctxt;
+	for (i = 0; i < VTBLK_RINGSZ; i++) {
+		struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i];
+		io->io_req.br_callback = pci_vtblk_done;
+		io->io_req.br_param = io;
+		io->io_sc = sc;
+		io->io_idx = i;
+	}
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	/* init virtio softc and virtqueues */
+	vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
+	sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
+
+	sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
+	/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
+
+	/*
+	 * Create an identifier for the backing file. Use parts of the
+	 * md5 sum of the filename
+	 *
+	 * The formatted string is exactly VTBLK_BLK_ID_BYTES characters,
+	 * so it is built in a buffer with room for the terminating NUL
+	 * and only the characters are copied into the softc.  Formatting
+	 * straight into vbsc_ident wrote one byte past its end.
+	 */
+	MD5Init(&mdctx);
+	MD5Update(&mdctx, opts, strlen(opts));
+	MD5Final(digest, &mdctx);
+	snprintf(ident, sizeof(ident), "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+	    digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
+	memcpy(sc->vbsc_ident, ident, sizeof(sc->vbsc_ident));
+
+	/* setup virtio block config space */
+	sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
+	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
+	sc->vbsc_cfg.vbc_seg_max = BLOCKIF_IOV_MAX;
+	sc->vbsc_cfg.vbc_geometry.cylinders = 0;	/* no geometry */
+	sc->vbsc_cfg.vbc_geometry.heads = 0;
+	sc->vbsc_cfg.vbc_geometry.sectors = 0;
+	sc->vbsc_cfg.vbc_blk_size = sectsz;
+	sc->vbsc_cfg.vbc_topology.physical_block_exp =
+	    (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
+	sc->vbsc_cfg.vbc_topology.alignment_offset =
+	    (sto != 0) ? ((sts - sto) / sectsz) : 0;
+	sc->vbsc_cfg.vbc_topology.min_io_size = 0;
+	sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
+	sc->vbsc_cfg.vbc_writeback = 0;
+
+	/*
+	 * Should we move some of this into virtio.c?  Could
+	 * have the device, class, and subdev_0 as fields in
+	 * the virtio constants structure.
+	 */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
+		blockif_close(sc->bc);
+		free(sc);
+		return (1);
+	}
+	vi_set_io_bar(&sc->vbsc_vs, 0);
+	return (0);
+}
+
+/*
+ * Device config space is read-only: report the write as an error.
+ */
+static int
+pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
+{
+
+	DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
+	return (1);
+}
+
+/*
+ * Copy 'size' bytes at 'offset' of the device config into *retval.
+ */
+static int
+pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+	struct pci_vtblk_softc *sc = vsc;
+	void *ptr;
+
+	/* our caller has already verified offset and size */
+	ptr = (uint8_t *)&sc->vbsc_cfg + offset;
+	memcpy(retval, ptr, size);
+	return (0);
+}
+
+struct pci_devemu pci_de_vblk = {
+	.pe_emu =	"virtio-blk",
+	.pe_init =	pci_vtblk_init,
+	.pe_barwrite =	vi_pci_write,
+	.pe_barread =	vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/bhyve/pci_virtio_net.c b/bhyve/pci_virtio_net.c
new file mode 100644
index 0000000..3781ea9
--- /dev/null
+++ b/bhyve/pci_virtio_net.c
@@ -0,0 +1,730 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "mevent.h"
+#include "virtio.h"
+
+#define VTNET_RINGSZ	1024
+
+#define VTNET_MAXSEGS	32
+
+/*
+ * Host capabilities.  Note that we only offer a few of these.
+ */
+#define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
+#define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
+#define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
+#define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
+#define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
+#define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
+#define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
+#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
+#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
+#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
+#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
+#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
+#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
+#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
+#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
+#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
+#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
+#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
+				(1 << 21) /* guest can send gratuitous pkts */
+
+#define VTNET_S_HOSTCAPS      \
+  ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
+    VIRTIO_F_NOTIFY_ON_EMPTY)
+
+/*
+ * PCI config-space "registers"
+ */
+struct virtio_net_config {
+	uint8_t  mac[6];
+	uint16_t status;
+} __packed;
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ	0
+#define VTNET_TXQ	1
+#define VTNET_CTLQ	2	/* NB: not yet supported */
+
+#define VTNET_MAXQ	3
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+	uint8_t		vrh_flags;
+	uint8_t		vrh_gso_type;
+	uint16_t	vrh_hdr_len;
+	uint16_t	vrh_gso_size;
+	uint16_t	vrh_csum_start;
+	uint16_t	vrh_csum_offset;
+	uint16_t	vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+	struct virtio_softc vsc_vs;
+	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
+	pthread_mutex_t vsc_mtx;
+	struct mevent	*vsc_mevp;
+
+	int		vsc_tapfd;
+	int		vsc_rx_ready;
+	volatile int	resetting;	/* set and checked outside lock */
+
+	uint64_t	vsc_features;	/* negotiated features */
+
+	struct virtio_net_config vsc_config;
+
+	pthread_mutex_t	rx_mtx;
+	int		rx_in_progress;
+	int		rx_vhdrlen;
+	int		rx_merge;	/* merged rx bufs in use */
+
+	pthread_t 	tx_tid;
+	pthread_mutex_t	tx_mtx;
+	pthread_cond_t	tx_cond;
+	int		tx_in_progress;
+};
+
+static void pci_vtnet_reset(void *);
+/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
+static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
+static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
+static void pci_vtnet_neg_features(void *, uint64_t);
+
+static struct virtio_consts vtnet_vi_consts = {
+	"vtnet",		/* our name */
+	VTNET_MAXQ - 1,		/* we currently support 2 virtqueues */
+	sizeof(struct virtio_net_config), /* config reg size */
+	pci_vtnet_reset,	/* reset */
+	NULL,			/* device-wide qnotify -- not used */
+	pci_vtnet_cfgread,	/* read PCI config */
+	pci_vtnet_cfgwrite,	/* write PCI config */
+	pci_vtnet_neg_features,	/* apply negotiated features */
+	VTNET_S_HOSTCAPS,	/* our capabilities */
+};
+
+/*
+ * Poll, 10ms at a time, until the transmit thread reports that it is
+ * idle.  tx_in_progress is only sampled with tx_mtx held.
+ */
+static void
+pci_vtnet_txwait(struct pci_vtnet_softc *sc)
+{
+	int busy;
+
+	for (;;) {
+		pthread_mutex_lock(&sc->tx_mtx);
+		busy = sc->tx_in_progress;
+		pthread_mutex_unlock(&sc->tx_mtx);
+		if (!busy)
+			break;
+		usleep(10000);
+	}
+}
+
+/*
+ * Poll, 10ms at a time, until the receive path reports that it is
+ * idle.  rx_in_progress is only sampled with rx_mtx held.
+ */
+static void
+pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
+{
+	int busy;
+
+	for (;;) {
+		pthread_mutex_lock(&sc->rx_mtx);
+		busy = sc->rx_in_progress;
+		pthread_mutex_unlock(&sc->rx_mtx);
+		if (!busy)
+			break;
+		usleep(10000);
+	}
+}
+
+/*
+ * virtio reset callback.  The 'resetting' flag is raised for the
+ * duration so the rx and tx paths stay away from the rings.
+ */
+static void
+pci_vtnet_reset(void *vsc)
+{
+	struct pci_vtnet_softc *net = vsc;
+
+	DPRINTF(("vtnet: device reset requested !\n"));
+
+	net->resetting = 1;
+
+	/* Let in-flight transmit and receive processing drain first. */
+	pci_vtnet_txwait(net);
+	pci_vtnet_rxwait(net);
+
+	/* Back to the pre-negotiation receive defaults. */
+	net->vsc_rx_ready = 0;
+	net->rx_merge = 1;
+	net->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
+
+	/* now reset rings, MSI-X vectors, and negotiated capabilities */
+	vi_reset_dev(&net->vsc_vs);
+
+	net->resetting = 0;
+}
+
+/*
+ * Write one packet, described by iov[0..iovcnt-1] and 'len' bytes
+ * long, to the tap device.  Does nothing when no tap device is open.
+ */
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+		 int len)
+{
+	static char pad[60]; /* all zero bytes */
+	int nseg;
+
+	if (sc->vsc_tapfd == -1)
+		return;
+
+	/*
+	 * Frames shorter than 60 bytes are padded out with zeroes by
+	 * appending one more segment.  The caller guarantees that the
+	 * iov array has a spare slot for it.
+	 */
+	nseg = iovcnt;
+	if (len < 60) {
+		iov[nseg].iov_base = pad;
+		iov[nseg].iov_len = 60 - len;
+		nseg++;
+	}
+	(void) writev(sc->vsc_tapfd, iov, nseg);
+}
+
+/*
+ *  Called when there is read activity on the tap file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ *  MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+
+/*
+ * Strip 'tlen' bytes from the front of an iovec array.  Returns the
+ * first element that still holds data and updates *niov when the
+ * first element was consumed entirely.
+ */
+static __inline struct iovec *
+rx_iov_trim(struct iovec *iov, int *niov, int tlen)
+{
+	struct iovec *riov;
+
+	/* XXX short-cut: assume first segment is >= tlen */
+	assert(iov[0].iov_len >= tlen);
+
+	iov[0].iov_len -= tlen;
+	if (iov[0].iov_len == 0) {
+		assert(*niov > 1);
+		*niov -= 1;
+		riov = &iov[1];
+	} else {
+		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
+		riov = &iov[0];
+	}
+
+	return (riov);
+}
+
+/*
+ * Move packets from the tap device into guest receive buffers until
+ * either side runs dry.  Called with rx_mtx held.
+ */
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+	struct iovec iov[VTNET_MAXSEGS], *riov;
+	struct vqueue_info *vq;
+	void *vrx;
+	int len, n;
+	uint16_t idx;
+
+	/*
+	 * Should never be called without a valid tap fd
+	 */
+	assert(sc->vsc_tapfd != -1);
+
+	/*
+	 * But, will be called when the rx ring hasn't yet
+	 * been set up or the guest is resetting the device.
+	 */
+	if (!sc->vsc_rx_ready || sc->resetting) {
+		/*
+		 * Drop the packet and try later.
+		 */
+		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+		return;
+	}
+
+	/*
+	 * Check for available rx buffers
+	 */
+	vq = &sc->vsc_queues[VTNET_RXQ];
+	if (!vq_has_descs(vq)) {
+		/*
+		 * Drop the packet and try later.  Interrupt on
+		 * empty, if that's negotiated.
+		 */
+		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+		vq_endchains(vq, 1);
+		return;
+	}
+
+	do {
+		/*
+		 * Get descriptor chain.
+		 */
+		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
+		assert(n >= 1 && n <= VTNET_MAXSEGS);
+
+		/*
+		 * Get a pointer to the rx header, and use the
+		 * data immediately following it for the packet buffer.
+		 */
+		vrx = iov[0].iov_base;
+		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
+
+		len = readv(sc->vsc_tapfd, riov, n);
+
+		if (len < 0) {
+			/*
+			 * EWOULDBLOCK means no more packets, but still
+			 * some avail ring entries.  Any other error is
+			 * handled the same way: a negative length must
+			 * never be reported to the guest, so put the
+			 * chain back.  Interrupt if needed/appropriate.
+			 */
+			vq_retchain(vq);
+			vq_endchains(vq, 0);
+			return;
+		}
+
+		/*
+		 * The only valid field in the rx packet header is the
+		 * number of buffers if merged rx bufs were negotiated.
+		 */
+		memset(vrx, 0, sc->rx_vhdrlen);
+
+		if (sc->rx_merge) {
+			struct virtio_net_rxhdr *vrxh;
+
+			vrxh = vrx;
+			vrxh->vrh_bufs = 1;
+		}
+
+		/*
+		 * Release this chain and handle more chains.
+		 */
+		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
+	} while (vq_has_descs(vq));
+
+	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+	vq_endchains(vq, 1);
+}
+
+/*
+ * mevent callback for the tap fd.  rx_in_progress is set around the
+ * receive pass so that pci_vtnet_rxwait() can tell when it is over.
+ */
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+	struct pci_vtnet_softc *sc = param;
+
+	pthread_mutex_lock(&sc->rx_mtx);
+	sc->rx_in_progress = 1;
+	pci_vtnet_tap_rx(sc);
+	sc->rx_in_progress = 0;
+	pthread_mutex_unlock(&sc->rx_mtx);
+
+}
+
+/*
+ * Notify callback for the receive queue.
+ */
+static void
+pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtnet_softc *sc = vsc;
+
+	/*
+	 * A qnotify means that the rx process can now begin
+	 */
+	if (sc->vsc_rx_ready == 0) {
+		sc->vsc_rx_ready = 1;
+		vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+	}
+}
+
+/*
+ * Transmit one descriptor chain: everything after the first
+ * descriptor is written to the tap device, then the chain is
+ * released back to the guest.
+ */
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
+{
+	/* one spare slot for the pad segment added by pci_vtnet_tap_tx() */
+	struct iovec iov[VTNET_MAXSEGS + 1];
+	int i, n;
+	int plen, tlen;
+	uint16_t idx;
+
+	/*
+	 * Obtain chain of descriptors.  The first one is
+	 * really the header descriptor, so we need to sum
+	 * up two lengths: packet length and transfer length.
+	 */
+	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
+	assert(n >= 1 && n <= VTNET_MAXSEGS);
+	plen = 0;
+	tlen = iov[0].iov_len;
+	for (i = 1; i < n; i++) {
+		plen += iov[i].iov_len;
+		tlen += iov[i].iov_len;
+	}
+
+	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
+	pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen);
+
+	/* chain is processed, release it and set tlen */
+	vq_relchain(vq, idx, tlen);
+}
+
+/*
+ * Notify callback for the transmit queue: wake the tx thread if it
+ * is idle and there is something to send.
+ */
+static void
+pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtnet_softc *sc = vsc;
+
+	/*
+	 * Any ring entries to process?
+	 */
+	if (!vq_has_descs(vq))
+		return;
+
+	/* Signal the tx thread for processing */
+	pthread_mutex_lock(&sc->tx_mtx);
+	vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+	if (sc->tx_in_progress == 0)
+		pthread_cond_signal(&sc->tx_cond);
+	pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+/*
+ * Thread which will handle processing of TX desc
+ */
+static void *
+pci_vtnet_tx_thread(void *param)
+{
+	struct pci_vtnet_softc *sc = param;
+	struct vqueue_info *vq;
+	int error;
+
+	vq = &sc->vsc_queues[VTNET_TXQ];
+
+	/*
+	 * Let us wait till the tx queue pointers get initialised &
+	 * first tx signaled
+	 */
+	pthread_mutex_lock(&sc->tx_mtx);
+	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+	assert(error == 0);
+
+	for (;;) {
+		/* note - tx mutex is locked here */
+		while (sc->resetting || !vq_has_descs(vq)) {
+			vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY;
+			mb();
+			/* re-check after re-enabling notifications */
+			if (!sc->resetting && vq_has_descs(vq))
+				break;
+
+			sc->tx_in_progress = 0;
+			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+			assert(error == 0);
+		}
+		vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
+		sc->tx_in_progress = 1;
+		pthread_mutex_unlock(&sc->tx_mtx);
+
+		do {
+			/*
+			 * Run through entries, placing them into
+			 * iovecs and sending when an end-of-packet
+			 * is found
+			 */
+			pci_vtnet_proctx(sc, vq);
+		} while (vq_has_descs(vq));
+
+		/*
+		 * Generate an interrupt if needed.
+		 */
+		vq_endchains(vq, 1);
+
+		pthread_mutex_lock(&sc->tx_mtx);
+	}
+}
+
+#ifdef notyet
+static void
+pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
+{
+
+	DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+#endif
+
+/*
+ * Parse a "mac=xx:xx:xx:xx:xx:xx" option into mac_addr.  Strings that
+ * are not of the form "mac=..." are ignored and leave mac_addr alone.
+ * Returns 0, or EINVAL when the address is unparsable, multicast or
+ * all zeroes.  mac_str is modified by strsep().
+ */
+static int
+pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
+{
+        struct ether_addr *ea;
+        char *tmpstr;
+        char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
+
+        tmpstr = strsep(&mac_str,"=");
+
+        if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) {
+                ea = ether_aton(mac_str);
+
+                if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
+                    memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
+			fprintf(stderr, "Invalid MAC %s\n", mac_str);
+                        return (EINVAL);
+                } else
+                        memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
+        }
+
+        return (0);
+}
+
+
+/*
+ * Device-model init.  'opts' is "<tapdev>[,mac=<addr>]"; with no opts
+ * the device comes up without a backend.  Returns 0 on success and
+ * non-zero on failure.
+ */
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+	MD5_CTX mdctx;
+	unsigned char digest[16];
+	char nstr[80];
+	char tname[MAXCOMLEN + 1];
+	struct pci_vtnet_softc *sc;
+	char *devname;
+	char *vtopts;
+	int mac_provided;
+
+	sc = calloc(1, sizeof(struct pci_vtnet_softc));
+	if (sc == NULL) {
+		WPRINTF(("vtnet: could not allocate softc\n"));
+		return (1);
+	}
+
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
+	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+
+	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
+	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
+#ifdef notyet
+	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
+        sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
+#endif
+
+	/*
+	 * Attempt to open the tap device and read the MAC address
+	 * if specified
+	 */
+	mac_provided = 0;
+	sc->vsc_tapfd = -1;
+	if (opts != NULL) {
+		char tbuf[80];
+		int err;
+
+		devname = vtopts = strdup(opts);
+		(void) strsep(&vtopts, ",");
+
+		if (vtopts != NULL) {
+			err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
+			if (err != 0) {
+				free(devname);
+				return (err);
+			}
+			mac_provided = 1;
+		}
+
+		strcpy(tbuf, "/dev/");
+		strlcat(tbuf, devname, sizeof(tbuf));
+
+		free(devname);
+
+		sc->vsc_tapfd = open(tbuf, O_RDWR);
+		if (sc->vsc_tapfd == -1) {
+			WPRINTF(("open of tap device %s failed\n", tbuf));
+		} else {
+			/*
+			 * Set non-blocking and register for read
+			 * notifications with the event loop
+			 */
+			int opt = 1;
+			if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+				WPRINTF(("tap device O_NONBLOCK failed\n"));
+				close(sc->vsc_tapfd);
+				sc->vsc_tapfd = -1;
+			}
+		}
+
+		/*
+		 * Only register with the event loop while the fd is
+		 * still open; a failed FIONBIO above has closed it.
+		 */
+		if (sc->vsc_tapfd != -1) {
+			sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+						  EVF_READ,
+						  pci_vtnet_tap_callback,
+						  sc);
+			if (sc->vsc_mevp == NULL) {
+				WPRINTF(("Could not register event\n"));
+				close(sc->vsc_tapfd);
+				sc->vsc_tapfd = -1;
+			}
+		}
+	}
+
+	/*
+	 * The default MAC address is the standard NetApp OUI of 00-a0-98,
+	 * followed by an MD5 of the PCI slot/func number and dev name
+	 */
+	if (!mac_provided) {
+		snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+		    pi->pi_func, vmname);
+
+		MD5Init(&mdctx);
+		MD5Update(&mdctx, nstr, strlen(nstr));
+		MD5Final(digest, &mdctx);
+
+		sc->vsc_config.mac[0] = 0x00;
+		sc->vsc_config.mac[1] = 0xa0;
+		sc->vsc_config.mac[2] = 0x98;
+		sc->vsc_config.mac[3] = digest[0];
+		sc->vsc_config.mac[4] = digest[1];
+		sc->vsc_config.mac[5] = digest[2];
+	}
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	/* Link is up if we managed to open tap device. */
+	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0);
+
+	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
+	if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
+		return (1);
+
+	/* use BAR 0 to map config regs in IO space */
+	vi_set_io_bar(&sc->vsc_vs, 0);
+
+	sc->resetting = 0;
+
+	sc->rx_merge = 1;
+	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
+	sc->rx_in_progress = 0;
+	pthread_mutex_init(&sc->rx_mtx, NULL);
+
+	/*
+	 * Initialize tx semaphore & spawn TX processing thread.
+	 * As of now, only one thread for TX desc processing is
+	 * spawned.
+	 */
+	sc->tx_in_progress = 0;
+	pthread_mutex_init(&sc->tx_mtx, NULL);
+	pthread_cond_init(&sc->tx_cond, NULL);
+	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
+	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
+	    pi->pi_func);
+        pthread_set_name_np(sc->tx_tid, tname);
+
+	return (0);
+}
+
+/*
+ * Config-space write: only the six MAC bytes are writable, any other
+ * offset is ignored.
+ */
+static int
+pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
+{
+	struct pci_vtnet_softc *sc = vsc;
+	void *ptr;
+
+	if (offset < 6) {
+		assert(offset + size <= 6);
+		/*
+		 * The driver is allowed to change the MAC address
+		 */
+		ptr = &sc->vsc_config.mac[offset];
+		memcpy(ptr, &value, size);
+	} else {
+		/* silently ignore other writes */
+		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
+	}
+
+	return (0);
+}
+
+/*
+ * Copy 'size' bytes at 'offset' of the device config into *retval.
+ */
+static int
+pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+	struct pci_vtnet_softc *sc = vsc;
+	void *ptr;
+
+	ptr = (uint8_t *)&sc->vsc_config + offset;
+	memcpy(retval, ptr, size);
+	return (0);
+}
+
+/*
+ * Record the negotiated feature set and adjust the receive header
+ * length when merged rx buffers were not accepted.
+ */
+static void
+pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
+{
+	struct pci_vtnet_softc *sc = vsc;
+
+	sc->vsc_features = negotiated_features;
+
+	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
+		sc->rx_merge = 0;
+		/* non-merge rx header is 2 bytes shorter */
+		sc->rx_vhdrlen -= 2;
+	}
+}
+
+struct pci_devemu pci_de_vnet = {
+	.pe_emu = 	"virtio-net",
+	.pe_init =	pci_vtnet_init,
+	.pe_barwrite =	vi_pci_write,
+	.pe_barread =	vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/bhyve/pci_virtio_rnd.c b/bhyve/pci_virtio_rnd.c
new file mode 100644
index 0000000..78448f5
--- /dev/null
+++ b/bhyve/pci_virtio_rnd.c
@@ -0,0 +1,189 @@
+/*-
+ * Copyright (c) 2014 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer
+ *    in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * virtio entropy device emulation.
+ * Randomness is sourced from /dev/random which does not block
+ * once it has been seeded at bootup.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; /* generic virtio device state */ + struct vqueue_info vrsc_vq; /* the device's single virtqueue */ + pthread_mutex_t vrsc_mtx; /* installed as vrsc_vs.vs_mtx at init */ + uint64_t vrsc_cfg; /* not referenced elsewhere in this file */ + int vrsc_fd; /* /dev/random descriptor, set at init */ +}; + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +/* + * Reset callback: logs the request (when debugging is enabled) and hands + * the generic virtio state to vi_reset_dev(). + */ +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !\n")); + vi_reset_dev(&sc->vrsc_vs); +} + +/* + * Queue-notify callback. For every available descriptor chain, fetch one + * iovec, fill it with a read() from the /dev/random descriptor and release + * the chain with the number of bytes read. If no descriptor is open the + * queue is simply closed out without doing any work. + */ +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, len); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate.
*/ +} + + +/* + * Device-model init hook for "virtio-rnd". Opens /dev/random, checks that + * a non-blocking one-byte read succeeds, then allocates the softc, links it + * to the PCI instance and programs config space, interrupts and the I/O BAR. + * + * Returns 0 on success and 1 on failure. The descriptor is closed again on + * the failure paths that occur before it is stored in the softc. + */ +static int +pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + + /* + * Check that device is seeded and non-blocking. + */ + len = read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + /* do not leak the descriptor when bailing out */ + close(fd); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + if (sc == NULL) { + /* out of memory: fail the device instead of dereferencing NULL */ + close(fd); + return (1); + } + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + /* + * NOTE(review): sc and fd are not released if this fails; the softc + * has presumably already been attached to pi by vi_softc_linkup(), + * so it is left in place rather than freed - confirm. + */ + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vrsc_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vrnd = { + .pe_emu = "virtio-rnd", + .pe_init = pci_vtrnd_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vrnd); diff --git a/bhyve/pm.c b/bhyve/pm.c new file mode 100644 index 0000000..f7c1c23 --- /dev/null +++ b/bhyve/pm.c @@ -0,0 +1,312 @@ +/*- + * Copyright (c) 2013 Hudson River Trading LLC + * Written by: John H. Baldwin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include + +#include "acpi.h" +#include "inout.h" +#include "mevent.h" +#include "pci_irq.h" +#include "pci_lpc.h" + +static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; +static struct mevent *power_button; +static sig_t old_power_handler; + +/* + * Reset Control register at I/O port 0xcf9. Bit 2 forces a system + * reset when it transitions from 0 to 1. Bit 1 selects the type of + * reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard" + * reset. + */ +static int +reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int error; + + static uint8_t reset_control; + + if (bytes != 1) + return (-1); + if (in) + *eax = reset_control; + else { + reset_control = *eax; + + /* Treat hard and soft resets the same. 
*/ + if (reset_control & 0x4) { + error = vm_suspend(ctx, VM_SUSPEND_RESET); + assert(error == 0 || errno == EALREADY); + } + } + return (0); +} +INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler); + +/* + * ACPI's SCI is a level-triggered interrupt. + * + * sci_active caches the line state last programmed by sci_assert() and + * sci_deassert(), so the IRQ calls are only made on an actual transition. + */ +static int sci_active; +/* + * Raise the SCI line unless it is already recorded as asserted. + */ +static void +sci_assert(struct vmctx *ctx) +{ + + if (sci_active) + return; + vm_isa_assert_irq(ctx, SCI_INT, SCI_INT); + sci_active = 1; +} +/* + * Lower the SCI line unless it is already recorded as deasserted. + */ +static void +sci_deassert(struct vmctx *ctx) +{ + + if (!sci_active) + return; + vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT); + sci_active = 0; +} + +/* + * Power Management 1 Event Registers + * + * The only power management event supported is a power button upon + * receiving SIGTERM. + */ +static uint16_t pm1_enable, pm1_status; + +#define PM1_TMR_STS 0x0001 +#define PM1_BM_STS 0x0010 +#define PM1_GBL_STS 0x0020 +#define PM1_PWRBTN_STS 0x0100 +#define PM1_SLPBTN_STS 0x0200 +#define PM1_RTC_STS 0x0400 +#define PM1_WAK_STS 0x8000 + +#define PM1_TMR_EN 0x0001 +#define PM1_GBL_EN 0x0020 +#define PM1_PWRBTN_EN 0x0100 +#define PM1_SLPBTN_EN 0x0200 +#define PM1_RTC_EN 0x0400 +/* + * Drive the SCI line from the PM1 event registers: assert it when any of + * the events tested in the body is both enabled in pm1_enable and pending + * in pm1_status, and deassert it otherwise. + */ +static void +sci_update(struct vmctx *ctx) +{ + int need_sci; + + /* See if the SCI should be active or not.
*/ + need_sci = 0; + if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS)) + need_sci = 1; + if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS)) + need_sci = 1; + if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS)) + need_sci = 1; + if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS)) + need_sci = 1; + if (need_sci) + sci_assert(ctx); + else + sci_deassert(ctx); +} +/* + * I/O handler for the PM1 status register. Only 2-byte accesses are + * accepted (-1 otherwise). Reads return pm1_status; writes clear the + * WAK/RTC/SLPBTN/PWRBTN/BM status bits that are written as 1 and then + * re-evaluate the SCI line. The register is accessed under pm_lock. + */ +static int +pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_status; + else { + /* + * Writes are only permitted to clear certain bits by + * writing 1 to those flags. + */ + pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS | + PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS)); + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +/* + * I/O handler for the PM1 enable register. Only 2-byte accesses are + * accepted (-1 otherwise). Reads return pm1_enable; writes replace it with + * the permitted subset of the written value and re-evaluate the SCI line. + * The register is accessed under pm_lock. + */ +static int +pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (bytes != 2) + return (-1); + + pthread_mutex_lock(&pm_lock); + if (in) + *eax = pm1_enable; + else { + /* + * Only permit certain bits to be set. We never use + * the global lock, but ACPI-CA whines profusely if it + * can't set GBL_EN.
+ */ + pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN); + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler); +INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler); +/* + * mevent callback for the power-button signal: latches PM1_PWRBTN_STS if it + * is not already pending and re-evaluates the SCI line, all under pm_lock. + * "arg" is the struct vmctx pointer supplied when the event was added. + */ +static void +power_button_handler(int signal, enum ev_type type, void *arg) +{ + struct vmctx *ctx; + + ctx = arg; + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(ctx); + } + pthread_mutex_unlock(&pm_lock); +} + +/* + * Power Management 1 Control Register + * + * This is mostly unimplemented except that we wish to handle writes that + * set SLP_EN to handle S5 (soft power off). + */ +static uint16_t pm1_control; + +#define PM1_SCI_EN 0x0001 +#define PM1_SLP_TYP 0x1c00 +#define PM1_SLP_EN 0x2000 +#define PM1_ALWAYS_ZERO 0xc003 +/* + * I/O handler for the PM1 control register. Only 2-byte accesses are + * accepted (-1 otherwise). Reads return pm1_control. Writes store the + * value minus SLP_EN and the PM1_ALWAYS_ZERO bits (which include bit 0, + * SCI_EN) while keeping the current SCI_EN; if the written value has SLP_EN + * set and SLP_TYP is 5, the VM is suspended with VM_SUSPEND_POWEROFF. + * + * NOTE(review): unlike the other PM1 handlers this one does not take + * pm_lock although smi_cmd_handler() updates pm1_control under it - confirm + * whether that is intentional. + */ +static int +pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int error; + + if (bytes != 2) + return (-1); + if (in) + *eax = pm1_control; + else { + /* + * Various bits are write-only or reserved, so force them + * to zero in pm1_control. Always preserve SCI_EN as OSPM + * can never change it. + */ + pm1_control = (pm1_control & PM1_SCI_EN) | + (*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO)); + + /* + * If SLP_EN is set, check for S5. Bhyve's _S5_ method + * says that '5' should be stored in SLP_TYP for S5. + */ + if (*eax & PM1_SLP_EN) { + if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + assert(error == 0 || errno == EALREADY); + } + } + } + return (0); +} +INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler); +SYSRES_IO(PM1A_EVT_ADDR, 8); + +/* + * ACPI SMI Command Register + * + * This write-only register is used to enable and disable ACPI.
+ */ +static int +smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + /* + * The port is registered output-only, so a read is a caller bug. + * ENABLE sets SCI_EN and starts treating SIGTERM as the power + * button: an mevent signal event is added and the process-level + * disposition is set to SIG_IGN, saving the previous handler. + * DISABLE undoes both. Any other command value is ignored. + * + * NOTE(review): the mevent_add() result is not checked; on failure + * power_button stays NULL while SIGTERM is already ignored. + */ + assert(!in); + if (bytes != 1) + return (-1); + + pthread_mutex_lock(&pm_lock); + switch (*eax) { + case BHYVE_ACPI_ENABLE: + pm1_control |= PM1_SCI_EN; + if (power_button == NULL) { + power_button = mevent_add(SIGTERM, EVF_SIGNAL, + power_button_handler, ctx); + old_power_handler = signal(SIGTERM, SIG_IGN); + } + break; + case BHYVE_ACPI_DISABLE: + pm1_control &= ~PM1_SCI_EN; + if (power_button != NULL) { + mevent_delete(power_button); + power_button = NULL; + signal(SIGTERM, old_power_handler); + } + break; + } + pthread_mutex_unlock(&pm_lock); + return (0); +} +INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler); +SYSRES_IO(SMI_CMD, 1); +/* + * One-time SCI setup: reserve SCI_INT through pci_irq_use() and program it + * as level-triggered. + */ +void +sci_init(struct vmctx *ctx) +{ + + /* + * Mark ACPI's SCI as level trigger and bump its use count + * in the PIRQ router. + */ + pci_irq_use(SCI_INT); + vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER); +} diff --git a/bhyve/post.c b/bhyve/post.c new file mode 100644 index 0000000..5215a0c --- /dev/null +++ b/bhyve/post.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include + +#include "inout.h" +#include "pci_lpc.h" +/* + * Read handler for I/O port 0x84, which is registered below as input-only. + * Asserts that the access is a read, rejects anything other than a 1-byte + * access with -1, and otherwise returns the constant 0xff in *eax. + */ +static int +post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + assert(in == 1); + + if (bytes != 1) + return (-1); + + *eax = 0xff; /* return some garbage */ + return (0); +} + +INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler); +SYSRES_IO(0x84, 1); diff --git a/bhyve/rtc.c b/bhyve/rtc.c new file mode 100644 index 0000000..5c70154 --- /dev/null +++ b/bhyve/rtc.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include + +#include +#include + +#include "acpi.h" +#include "pci_lpc.h" +#include "rtc.h" + +#define IO_RTC 0x70 + +#define RTC_LMEM_LSB 0x34 +#define RTC_LMEM_MSB 0x35 +#define RTC_HMEM_LSB 0x5b +#define RTC_HMEM_SB 0x5c +#define RTC_HMEM_MSB 0x5d + +#define m_64KB (64*1024) +#define m_16MB (16*1024*1024) +#define m_4GB (4ULL*1024*1024*1024) + +/* + * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 + * + * With use_localtime set, the host's broken-down local time is converted + * back with timegm(), so the returned value is the local wall-clock time + * expressed as if it were UTC. The ctx argument is not used. + */ +static time_t +rtc_time(struct vmctx *ctx, int use_localtime) +{ + struct tm tm; + time_t t; + + time(&t); + if (use_localtime) { + localtime_r(&t, &tm); + t = timegm(&tm); /* reinterpret local fields as UTC */ + } + return (t); +} + +void +rtc_init(struct vmctx *ctx, int use_localtime) +{ + size_t himem; + size_t lomem; + int err; + + /* XXX init diag/reset code/equipment/checksum ? */ + + /* + * Report guest memory size in nvram cells as required by UEFI. + * Little-endian encoding.
+ * 0x34/0x35 - 64KB chunks above 16MB, below 4GB + * 0x5b/0x5c/0x5d - 64KB chunks above 4GB + */ + lomem = vm_get_lowmem_size(ctx); + /* + * Report zero chunks when the guest has no memory above 16MB; the + * unsigned subtraction would otherwise wrap and store a bogus size. + */ + lomem = (lomem > m_16MB) ? (lomem - m_16MB) / m_64KB : 0; + err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8); + assert(err == 0); + + himem = vm_get_highmem_size(ctx) / m_64KB; + err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16); + assert(err == 0); + + /* seed the guest RTC from the host clock */ + err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime)); + assert(err == 0); +} + +/* + * Emit the DSDT description of the RTC device: _HID PNP0B00 with a 2-port + * fixed I/O range at IO_RTC and fixed IRQ 8. + */ +static void +rtc_dsdt(void) +{ + + dsdt_line(""); + dsdt_line("Device (RTC)"); + dsdt_line("{"); + dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(2); + dsdt_fixed_ioport(IO_RTC, 2); + dsdt_fixed_irq(8); + dsdt_unindent(2); + dsdt_line(" })"); + dsdt_line("}"); +} +LPC_DSDT(rtc_dsdt); + +/* + * Reserve the extended RTC I/O ports although they are not emulated at this + * time. + */ +SYSRES_IO(0x72, 6); diff --git a/bhyve/rtc.h b/bhyve/rtc.h new file mode 100644 index 0000000..5b08ca3 --- /dev/null +++ b/bhyve/rtc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RTC_H_ +#define _RTC_H_ + +void rtc_init(struct vmctx *ctx, int use_localtime); + +#endif /* _RTC_H_ */ diff --git a/bhyve/smbiostbl.c b/bhyve/smbiostbl.c new file mode 100644 index 0000000..59a1358 --- /dev/null +++ b/bhyve/smbiostbl.c @@ -0,0 +1,827 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "smbiostbl.h" + +#define MB (1024*1024) +#define GB (1024ULL*1024*1024) + +#define SMBIOS_BASE 0xF1000 + +/* (BHYVE_ACPI_BASE - SMBIOS_BASE) */ +#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000) + +#define SMBIOS_TYPE_BIOS 0 +#define SMBIOS_TYPE_SYSTEM 1 +#define SMBIOS_TYPE_CHASSIS 3 +#define SMBIOS_TYPE_PROCESSOR 4 +#define SMBIOS_TYPE_MEMARRAY 16 +#define SMBIOS_TYPE_MEMDEVICE 17 +#define SMBIOS_TYPE_MEMARRAYMAP 19 +#define SMBIOS_TYPE_BOOT 32 +#define SMBIOS_TYPE_EOT 127 +/* + * Common header; it is the first member of every smbios_table_type* below. + */ +struct smbios_structure { + uint8_t type; /* one of SMBIOS_TYPE_* */ + uint8_t length; /* templates set this to sizeof the table struct */ + uint16_t handle; /* filled in by the initializer */ +} __packed; +/* + * Signature shared by all structure initializers referenced from + * smbios_template[]. + */ +typedef int (*initializer_func_t)(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); +/* + * One row of smbios_template[]: a structure template, its optional + * NULL-terminated string list, and the initializer that emits it. + */ +struct smbios_template_entry { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; +}; + +/* + * SMBIOS Structure Table Entry Point + */ +#define SMBIOS_ENTRY_EANCHOR "_SM_" +#define SMBIOS_ENTRY_EANCHORLEN 4 +#define SMBIOS_ENTRY_IANCHOR "_DMI_" +#define SMBIOS_ENTRY_IANCHORLEN 5 + +struct smbios_entry_point { + char eanchor[4]; /* anchor tag */ + uint8_t echecksum; /* checksum of entry point structure */ + uint8_t eplen; /* length in bytes of entry point */ + uint8_t major; /* major version
of the SMBIOS spec */ + uint8_t minor; /* minor version of the SMBIOS spec */ + uint16_t maxssize; /* maximum size in bytes of a struct */ + uint8_t revision; /* entry point structure revision */ + uint8_t format[5]; /* entry point rev-specific data */ + char ianchor[5]; /* intermediate anchor tag */ + uint8_t ichecksum; /* intermediate checksum */ + uint16_t stlen; /* len in bytes of structure table */ + uint32_t staddr; /* physical addr of structure table */ + uint16_t stnum; /* number of structure table entries */ + uint8_t bcdrev; /* BCD value representing DMI ver */ +} __packed; + +/* + * BIOS Information + */ +#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */ +#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */ +#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */ +#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */ +#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */ +#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */ + +#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */ + +#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */ +#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */ + +struct smbios_table_type0 { + struct smbios_structure header; + uint8_t vendor; /* vendor string */ + uint8_t version; /* version string */ + uint16_t segment; /* address segment location */ + uint8_t rel_date; /* release date */ + uint8_t size; /* rom size */ + uint64_t cflags; /* characteristics */ + uint8_t xc_bytes[2]; /* characteristics ext bytes */ + uint8_t sb_major_rel; /* system bios version */ + uint8_t sb_minor_rele; + uint8_t ecfw_major_rel; /* embedded ctrl fw version */ + uint8_t ecfw_minor_rel; +} __packed; + +/* + * System Information + */ +#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */ + +struct smbios_table_type1 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t product; /* product name string */ + uint8_t version; 
/* version string */ + uint8_t serial; /* serial number string */ + uint8_t uuid[16]; /* uuid byte array */ + uint8_t wakeup; /* wake-up event */ + uint8_t sku; /* sku number string */ + uint8_t family; /* family name string */ +} __packed; + +/* + * System Enclosure or Chassis + */ +#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_CHST_SAFE 0x03 /* safe */ + +#define SMBIOS_CHSC_NONE 0x03 /* none */ + +struct smbios_table_type3 { + struct smbios_structure header; + uint8_t manufacturer; /* manufacturer string */ + uint8_t type; /* type */ + uint8_t version; /* version string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t bustate; /* boot-up state */ + uint8_t psstate; /* power supply state */ + uint8_t tstate; /* thermal state */ + uint8_t security; /* security status */ + uint8_t uheight; /* height in 'u's */ + uint8_t cords; /* number of power cords */ + uint8_t elems; /* number of element records */ + uint8_t elemlen; /* length of records */ + uint8_t sku; /* sku number string */ +} __packed; + +/* + * Processor Information + */ +#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */ + +#define SMBIOS_PRF_OTHER 0x01 /* other */ + +#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */ +#define SMBIOS_PRS_ENABLED 0x1 /* enabled */ + +#define SMBIOS_PRU_NONE 0x06 /* none */ + +#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */ + +struct smbios_table_type4 { + struct smbios_structure header; + uint8_t socket; /* socket designation string */ + uint8_t type; /* processor type */ + uint8_t family; /* processor family */ + uint8_t manufacturer; /* manufacturer string */ + uint64_t cpuid; /* processor cpuid */ + uint8_t version; /* version string */ + uint8_t voltage; /* voltage */ + uint16_t clkspeed; /* ext clock speed in mhz */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint16_t curspeed; /* current speed in mhz */ + uint8_t status; /* status */ + uint8_t upgrade; /* upgrade */ + uint16_t 
l1handle; /* l1 cache handle */ + uint16_t l2handle; /* l2 cache handle */ + uint16_t l3handle; /* l3 cache handle */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /* part number string */ + uint8_t cores; /* cores per socket */ + uint8_t ecores; /* enabled cores */ + uint8_t threads; /* threads per socket */ + uint16_t cflags; /* processor characteristics */ + uint16_t family2; /* processor family 2 */ +} __packed; + +/* + * Physical Memory Array + */ +#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */ + +#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */ + +#define SMBIOS_MAE_NONE 0x03 /* none */ + +struct smbios_table_type16 { + struct smbios_structure header; + uint8_t location; /* physical device location */ + uint8_t use; /* device functional purpose */ + uint8_t ecc; /* err detect/correct method */ + uint32_t size; /* max mem capacity in kb */ + uint16_t errhand; /* handle of error (if any) */ + uint16_t ndevs; /* num of slots or sockets */ + uint64_t xsize; /* max mem capacity in bytes */ +} __packed; + +/* + * Memory Device + */ +#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */ + +#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */ + +struct smbios_table_type17 { + struct smbios_structure header; + uint16_t arrayhand; /* handle of physical mem array */ + uint16_t errhand; /* handle of mem error data */ + uint16_t twidth; /* total width in bits */ + uint16_t dwidth; /* data width in bits */ + uint16_t size; /* size in mb, or kb if bit 15 is set (SMBIOS spec) */ + uint8_t form; /* form factor */ + uint8_t set; /* set */ + uint8_t dloc; /* device locator string */ + uint8_t bloc; /* phys bank locator string */ + uint8_t type; /* memory type */ + uint16_t flags; /* memory characteristics */ + uint16_t maxspeed; /* maximum speed in mhz */ + uint8_t manufacturer; /* manufacturer string */ + uint8_t serial; /* serial number string */ + uint8_t asset; /* asset tag string */ + uint8_t part; /*
part number string */ + uint8_t attributes; /* attributes */ + uint32_t xsize; /* extended size in mbs */ + uint16_t curspeed; /* current speed in mhz */ + uint16_t minvoltage; /* minimum voltage */ + uint16_t maxvoltage; /* maximum voltage */ + uint16_t curvoltage; /* configured voltage */ +} __packed; + +/* + * Memory Array Mapped Address + */ +struct smbios_table_type19 { + struct smbios_structure header; + uint32_t saddr; /* start phys addr in kb */ + uint32_t eaddr; /* end phys addr in kb */ + uint16_t arrayhand; /* physical mem array handle */ + uint8_t width; /* num of dev in row */ + uint64_t xsaddr; /* start phys addr in bytes */ + uint64_t xeaddr; /* end phys addr in bytes */ +} __packed; + +/* + * System Boot Information + */ +#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */ + +struct smbios_table_type32 { + struct smbios_structure header; + uint8_t reserved[6]; + uint8_t status; /* boot status */ +} __packed; + +/* + * End-of-Table + */ +struct smbios_table_type127 { + struct smbios_structure header; +} __packed; + +struct smbios_table_type0 smbios_type0_template = { + { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 }, + 1, /* bios vendor string */ + 2, /* bios version string */ + 0xF000, /* bios address segment location */ + 3, /* bios release date */ + 0x0, /* bios size (64k * (n + 1) is the size in bytes) */ + SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW | + SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD, + { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM }, + 0x0, /* bios major release */ + 0x0, /* bios minor release */ + 0xff, /* embedded controller firmware major release */ + 0xff /* embedded controller firmware minor release */ +}; + +const char *smbios_type0_strings[] = { + "BHYVE", /* vendor string */ + "1.00", /* bios version string */ + "03/14/2014", /* bios release date string */ + NULL +}; + +struct smbios_table_type1 smbios_type1_template = { + { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 }, + 1, /* 
manufacturer string */ + 2, /* product string */ + 3, /* version string */ + 4, /* serial number string */ + { 0 }, + SMBIOS_WAKEUP_SWITCH, + 5, /* sku string */ + 6 /* family string */ +}; + +static int smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +const char *smbios_type1_strings[] = { + " ", /* manufacturer string */ + "BHYVE", /* product name string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* sku string */ + " ", /* family name string */ + NULL +}; + +struct smbios_table_type3 smbios_type3_template = { + { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 }, + 1, /* manufacturer string */ + SMBIOS_CHT_UNKNOWN, + 2, /* version string */ + 3, /* serial number string */ + 4, /* asset tag string */ + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHST_SAFE, + SMBIOS_CHSC_NONE, + 0, /* height in 'u's (0=enclosure height unspecified) */ + 0, /* number of power cords (0=number unspecified) */ + 0, /* number of contained element records */ + 0, /* length of records */ + 5 /* sku number string */ +}; + +const char *smbios_type3_strings[] = { + " ", /* manufacturer string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* sku number string */ + NULL +}; + +struct smbios_table_type4 smbios_type4_template = { + { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 }, + 1, /* socket designation string */ + SMBIOS_PRT_CENTRAL, + SMBIOS_PRF_OTHER, + 2, /* manufacturer string */ + 0, /* cpuid */ + 3, /* version string */ + 0, /* voltage */ + 0, /* external clock frequency in mhz (0=unknown) */ + 0, /* maximum frequency in mhz (0=unknown) */ + 0, /* current frequency in mhz (0=unknown) */ + SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED, + SMBIOS_PRU_NONE, + -1, /* l1 cache handle */ + -1, /* l2 cache handle */ + -1, /* l3 cache handle */ + 4, /* 
serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* cores per socket (0=unknown) */ + 0, /* enabled cores per socket (0=unknown) */ + 0, /* threads per socket (0=unknown) */ + SMBIOS_PFL_64B, + SMBIOS_PRF_OTHER +}; + +const char *smbios_type4_strings[] = { + " ", /* socket designation string */ + " ", /* manufacturer string */ + " ", /* version string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type16 smbios_type16_template = { + { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 }, + SMBIOS_MAL_SYSMB, + SMBIOS_MAU_SYSTEM, + SMBIOS_MAE_NONE, + 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */ + -1, /* handle of error (if any) */ + 0, /* number of slots or sockets (TBD) */ + 0 /* extended maximum memory capacity in bytes (TBD) */ +}; + +static int smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type17 smbios_type17_template = { + { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 }, + -1, /* handle of physical memory array */ + -1, /* handle of memory error data */ + 64, /* total width in bits including ecc */ + 64, /* data width in bits */ + 0x7fff, /* size in mb (0x7fff=use extended size) */ + SMBIOS_MDFF_UNKNOWN, + 0, /* set (0x00=none, 0xff=unknown) */ + 1, /* device locator string */ + 2, /* physical bank locator string */ + SMBIOS_MDT_UNKNOWN, + SMBIOS_MDF_UNKNOWN, + 0, /* maximum memory speed in mhz (0=unknown) */ + 3, /* manufacturer string */ + 4, /* serial number string */ + 5, /* asset tag string */ + 6, /* part number string */ + 0, /* attributes (0=unknown rank information)
*/ + 0, /* extended size in mb (TBD) */ + 0, /* current speed in mhz (0=unknown) */ + 0, /* minimum voltage in mv (0=unknown) */ + 0, /* maximum voltage in mv (0=unknown) */ + 0 /* configured voltage in mv (0=unknown) */ +}; + +const char *smbios_type17_strings[] = { + " ", /* device locator string */ + " ", /* physical bank locator string */ + " ", /* manufacturer string */ + "None", /* serial number string */ + "None", /* asset tag string */ + "None", /* part number string */ + NULL +}; + +static int smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type19 smbios_type19_template = { + { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 }, + 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */ + 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */ + -1, /* physical memory array handle */ + 1, /* number of devices that form a row */ + 0, /* extended starting phys addr in bytes (TDB) */ + 0 /* extended ending phys addr in bytes (TDB) */ +}; + +static int smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +struct smbios_table_type32 smbios_type32_template = { + { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 }, + { 0, 0, 0, 0, 0, 0 }, + SMBIOS_BOOT_NORMAL +}; + +struct smbios_table_type127 smbios_type127_template = { + { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 } +}; + +static int smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size); + +static struct smbios_template_entry smbios_template[] = { + { (struct smbios_structure *)&smbios_type0_template, + smbios_type0_strings, + smbios_generic_initializer }, + { (struct smbios_structure 
*)&smbios_type1_template, + smbios_type1_strings, + smbios_type1_initializer }, + { (struct smbios_structure *)&smbios_type3_template, + smbios_type3_strings, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type4_template, + smbios_type4_strings, + smbios_type4_initializer }, + { (struct smbios_structure *)&smbios_type16_template, + NULL, + smbios_type16_initializer }, + { (struct smbios_structure *)&smbios_type17_template, + smbios_type17_strings, + smbios_type17_initializer }, + { (struct smbios_structure *)&smbios_type19_template, + NULL, + smbios_type19_initializer }, + { (struct smbios_structure *)&smbios_type32_template, + NULL, + smbios_generic_initializer }, + { (struct smbios_structure *)&smbios_type127_template, + NULL, + smbios_generic_initializer }, + { NULL,NULL, NULL } +}; + +static uint64_t guest_lomem, guest_himem; +static uint16_t type16_handle; + +static int +smbios_generic_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_structure *entry; + + memcpy(curaddr, template_entry, template_entry->length); + entry = (struct smbios_structure *)curaddr; + entry->handle = *n + 1; + curaddr += entry->length; + if (template_strings != NULL) { + int i; + + for (i = 0; template_strings[i] != NULL; i++) { + const char *string; + int len; + + string = template_strings[i]; + len = strlen(string) + 1; + memcpy(curaddr, string, len); + curaddr += len; + } + *curaddr = '\0'; + curaddr++; + } else { + /* Minimum string section is double nul */ + *curaddr = '\0'; + curaddr++; + *curaddr = '\0'; + curaddr++; + } + (*n)++; + *endaddr = curaddr; + + return (0); +} + +static int +smbios_type1_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type1 *type1; + + smbios_generic_initializer(template_entry, 
template_strings, + curaddr, endaddr, n, size); + type1 = (struct smbios_table_type1 *)curaddr; + + if (guest_uuid_str != NULL) { + uuid_t uuid; + uint32_t status; + + uuid_from_string(guest_uuid_str, &uuid, &status); + if (status != uuid_s_ok) + return (-1); + + uuid_enc_le(&type1->uuid, &uuid); + } else { + MD5_CTX mdctx; + u_char digest[16]; + char hostname[MAXHOSTNAMELEN]; + + /* + * Universally unique and yet reproducible are an + * oxymoron, however reproducible is desirable in + * this case. + */ + if (gethostname(hostname, sizeof(hostname))) + return (-1); + + MD5Init(&mdctx); + MD5Update(&mdctx, vmname, strlen(vmname)); + MD5Update(&mdctx, hostname, sizeof(hostname)); + MD5Final(digest, &mdctx); + + /* + * Set the variant and version number. + */ + digest[6] &= 0x0F; + digest[6] |= 0x30; /* version 3 */ + digest[8] &= 0x3F; + digest[8] |= 0x80; + + memcpy(&type1->uuid, digest, sizeof (digest)); + } + + return (0); +} + +static int +smbios_type4_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + int i; + + for (i = 0; i < guest_ncpus; i++) { + struct smbios_table_type4 *type4; + char *p; + int nstrings, len; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type4 = (struct smbios_table_type4 *)curaddr; + p = curaddr + sizeof (struct smbios_table_type4); + nstrings = 0; + while (p < *endaddr - 1) { + if (*p++ == '\0') + nstrings++; + } + len = sprintf(*endaddr - 1, "CPU #%d", i) + 1; + *endaddr += len - 1; + *(*endaddr) = '\0'; + (*endaddr)++; + type4->socket = nstrings + 1; + curaddr = *endaddr; + } + + return (0); +} + +static int +smbios_type16_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type16 *type16; + + type16_handle = *n; + smbios_generic_initializer(template_entry, 
template_strings, + curaddr, endaddr, n, size); + type16 = (struct smbios_table_type16 *)curaddr; + type16->xsize = guest_lomem + guest_himem; + type16->ndevs = guest_himem > 0 ? 2 : 1; + + return (0); +} + +static int +smbios_type17_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type17 *type17; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type17 = (struct smbios_table_type17 *)curaddr; + type17->arrayhand = type16_handle; + type17->xsize = guest_himem; + } + + return (0); +} + +static int +smbios_type19_initializer(struct smbios_structure *template_entry, + const char **template_strings, char *curaddr, char **endaddr, + uint16_t *n, uint16_t *size) +{ + struct smbios_table_type19 *type19; + + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 0; + type19->xeaddr = guest_lomem; + + if (guest_himem > 0) { + curaddr = *endaddr; + smbios_generic_initializer(template_entry, template_strings, + curaddr, endaddr, n, size); + type19 = (struct smbios_table_type19 *)curaddr; + type19->arrayhand = type16_handle; + type19->xsaddr = 4*GB; + type19->xeaddr = guest_himem; + } + + return (0); +} + +static void +smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr) +{ + memset(smbios_ep, 0, sizeof(*smbios_ep)); + memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR, + SMBIOS_ENTRY_EANCHORLEN); + smbios_ep->eplen = 0x1F; + assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen); + 
smbios_ep->major = 2; + smbios_ep->minor = 6; + smbios_ep->revision = 0; + memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR, + SMBIOS_ENTRY_IANCHORLEN); + smbios_ep->staddr = staddr; + smbios_ep->bcdrev = 0x24; +} + +static void +smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len, + uint16_t num, uint16_t maxssize) +{ + uint8_t checksum; + int i; + + smbios_ep->maxssize = maxssize; + smbios_ep->stlen = len; + smbios_ep->stnum = num; + + checksum = 0; + for (i = 0x10; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->ichecksum = checksum; + + checksum = 0; + for (i = 0; i < 0x1f; i++) { + checksum -= ((uint8_t *)smbios_ep)[i]; + } + smbios_ep->echecksum = checksum; +} + +int +smbios_build(struct vmctx *ctx) +{ + struct smbios_entry_point *smbios_ep; + uint16_t n; + uint16_t maxssize; + char *curaddr, *startaddr, *ststartaddr; + int i; + int err; + + guest_lomem = vm_get_lowmem_size(ctx); + guest_himem = vm_get_highmem_size(ctx); + + startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH); + if (startaddr == NULL) { + fprintf(stderr, "smbios table requires mapped mem\n"); + return (ENOMEM); + } + + curaddr = startaddr; + + smbios_ep = (struct smbios_entry_point *)curaddr; + smbios_ep_initializer(smbios_ep, SMBIOS_BASE + + sizeof(struct smbios_entry_point)); + curaddr += sizeof(struct smbios_entry_point); + ststartaddr = curaddr; + + n = 0; + maxssize = 0; + for (i = 0; smbios_template[i].entry != NULL; i++) { + struct smbios_structure *entry; + const char **strings; + initializer_func_t initializer; + char *endaddr; + uint16_t size; + + entry = smbios_template[i].entry; + strings = smbios_template[i].strings; + initializer = smbios_template[i].initializer; + + err = (*initializer)(entry, strings, curaddr, &endaddr, + &n, &size); + if (err != 0) + return (err); + + if (size > maxssize) + maxssize = size; + + curaddr = endaddr; + } + + assert(curaddr - startaddr < SMBIOS_MAX_LENGTH); + smbios_ep_finalizer(smbios_ep, 
curaddr - ststartaddr, n, maxssize); + + return (0); +} diff --git a/bhyve/smbiostbl.h b/bhyve/smbiostbl.h new file mode 100644 index 0000000..e8b3a4f --- /dev/null +++ b/bhyve/smbiostbl.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SMBIOSTBL_H_ +#define _SMBIOSTBL_H_ + +struct vmctx; + +int smbios_build(struct vmctx *ctx); + +#endif /* _SMBIOSTBL_H_ */ diff --git a/bhyve/spinup_ap.c b/bhyve/spinup_ap.c new file mode 100644 index 0000000..c597023 --- /dev/null +++ b/bhyve/spinup_ap.c @@ -0,0 +1,104 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include + +#include +#include +#include + +#include "bhyverun.h" +#include "spinup_ap.h" + +static void +spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip) +{ + int vector, error; + uint16_t cs; + uint64_t desc_base; + uint32_t desc_limit, desc_access; + + vector = *rip >> PAGE_SHIFT; + *rip = 0; + + /* + * Update the %cs and %rip of the guest so that it starts + * executing real mode code at at 'vector << 12'. 
+ */ + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip); + assert(error == 0); + + error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base, + &desc_limit, &desc_access); + assert(error == 0); + + desc_base = vector << PAGE_SHIFT; + error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + assert(error == 0); + + cs = (vector << PAGE_SHIFT) >> 4; + error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs); + assert(error == 0); +} + +int +spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip) +{ + int error; + + assert(newcpu != 0); + assert(newcpu < guest_ncpus); + + error = vcpu_reset(ctx, newcpu); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, newcpu); + + /* + * Enable the 'unrestricted guest' mode for 'newcpu'. + * + * Set up the processor state in power-on 16-bit mode, with the CS:IP + * init'd to the specified low-mem 4K page. + */ + error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + spinup_ap_realmode(ctx, newcpu, &rip); + + fbsdrun_addcpu(ctx, vcpu, newcpu, rip); + + return (newcpu); +} diff --git a/bhyve/spinup_ap.h b/bhyve/spinup_ap.h new file mode 100644 index 0000000..2749ee9 --- /dev/null +++ b/bhyve/spinup_ap.h @@ -0,0 +1,34 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPINUP_AP_H_ +#define _SPINUP_AP_H_ + +int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip); + +#endif diff --git a/bhyve/task_switch.c b/bhyve/task_switch.c new file mode 100644 index 0000000..69dfaae --- /dev/null +++ b/bhyve/task_switch.c @@ -0,0 +1,939 @@ +/*- + * Copyright (c) 2014 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. + */ +struct tss32 { + uint16_t tss_link; + uint16_t rsvd1; + uint32_t tss_esp0; + uint16_t tss_ss0; + uint16_t rsvd2; + uint32_t tss_esp1; + uint16_t tss_ss1; + uint16_t rsvd3; + uint32_t tss_esp2; + uint16_t tss_ss2; + uint16_t rsvd4; + uint32_t tss_cr3; + uint32_t tss_eip; + uint32_t tss_eflags; + uint32_t tss_eax; + uint32_t tss_ecx; + uint32_t tss_edx; + uint32_t tss_ebx; + uint32_t tss_esp; + uint32_t tss_ebp; + uint32_t tss_esi; + uint32_t tss_edi; + uint16_t tss_es; + uint16_t rsvd5; + uint16_t tss_cs; + uint16_t rsvd6; + uint16_t tss_ss; + uint16_t rsvd7; + uint16_t tss_ds; + uint16_t rsvd8; + uint16_t tss_fs; + uint16_t rsvd9; + uint16_t tss_gs; + uint16_t rsvd10; + uint16_t tss_ldt; + uint16_t rsvd11; + uint16_t tss_trap; + uint16_t tss_iomap; +}; +CTASSERT(sizeof(struct tss32) == 104); + +#define SEL_START(sel) (((sel) & ~0x7)) +#define SEL_LIMIT(sel) (((sel) | 0x7)) +#define TSS_BUSY(type) (((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ + uint64_t val; + int error; + + error = vm_get_register(ctx, vcpu, reg, &val); + assert(error == 
0); + return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + + error = vm_set_register(ctx, vcpu, reg, val); + assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ + struct seg_desc seg_desc; + + seg_desc.base = (u_int)USD_GETBASE(usd); + if (usd->sd_gran) + seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; + else + seg_desc.limit = (u_int)USD_GETLIMIT(usd); + seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; + seg_desc.access |= usd->sd_xx << 12; + seg_desc.access |= usd->sd_def32 << 14; + seg_desc.access |= usd->sd_gran << 15; + + return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ + /* + * Bit 2 from the selector is retained as-is in the error code. + * + * Bit 1 can be safely cleared because none of the selectors + * encountered during task switch emulation refer to a task + * gate in the IDT. + * + * Bit 0 is set depending on the value of 'ext'. + */ + sel &= ~0x3; + if (ext) + sel |= 0x1; + vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? 
VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + + if (reg == VM_REG_GUEST_LDTR) { + if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) + return (-1); + } + + if (limit < SEL_LIMIT(sel)) + return (-1); + else + return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, bool doread, + int *faultptr) +{ + struct iovec iov[2]; + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + assert(limit >= SEL_LIMIT(sel)); + + error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), + sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), + faultptr); + if (error || *faultptr) + return (error); + + if (doread) + vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); + else + vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); + return (0); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
+ */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + int error; + + assert(!ISLDT(sel)); + assert(IDXSEL(sel) != 0); + + /* Fetch the new TSS descriptor */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + if (ts->reason == TSR_IRET) + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + else + sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); + return (1); + } + + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); + return (error); +} + +static bool +code_desc(int sd_type) +{ + /* code descriptor */ + return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ + /* writable data descriptor */ + return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ + /* data descriptor or a readable code descriptor */ + return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + + return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. 
+ */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + int segment, struct seg_desc *seg_desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + struct user_segment_descriptor usd; + int error, idtvec; + int cpl, dpl, rpl; + uint16_t sel, cs; + bool ldtseg, codeseg, stackseg, dataseg, conforming; + + ldtseg = codeseg = stackseg = dataseg = false; + switch (segment) { + case VM_REG_GUEST_LDTR: + ldtseg = true; + break; + case VM_REG_GUEST_CS: + codeseg = true; + break; + case VM_REG_GUEST_SS: + stackseg = true; + break; + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + dataseg = true; + break; + default: + assert(0); + } + + /* Get the segment selector */ + sel = GETREG(ctx, vcpu, segment); + + /* LDT selector must point into the GDT */ + if (ldtseg && ISLDT(sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Descriptor table limit check */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* NULL selector */ + if (IDXSEL(sel) == 0) { + /* Code and stack segment selectors cannot be NULL */ + if (codeseg || stackseg) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + seg_desc->base = 0; + seg_desc->limit = 0; + seg_desc->access = 0x10000; /* unusable */ + return (0); + } + + /* Read the descriptor from the GDT/LDT */ + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); + if (error || *faultptr) + return (error); + + /* Verify that the descriptor type is compatible with the segment */ + if ((ldtseg && !ldt_desc(usd.sd_type)) || + (codeseg && !code_desc(usd.sd_type)) || + (dataseg && !data_desc(usd.sd_type)) || + (stackseg && !stack_desc(usd.sd_type))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Segment must be marked present 
*/ + if (!usd.sd_p) { + if (ldtseg) + idtvec = IDT_TS; + else if (stackseg) + idtvec = IDT_SS; + else + idtvec = IDT_NP; + sel_exception(ctx, vcpu, idtvec, sel, ts->ext); + return (1); + } + + cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + cpl = cs & SEL_RPL_MASK; + rpl = sel & SEL_RPL_MASK; + dpl = usd.sd_dpl; + + if (stackseg && (rpl != cpl || dpl != cpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + if (codeseg) { + conforming = (usd.sd_type & 0x4) ? true : false; + if ((conforming && (cpl < dpl)) || + (!conforming && (cpl != dpl))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + + if (dataseg) { + /* + * A data segment is always non-conforming except when it's + * descriptor is a readable, conforming code segment. + */ + if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) + conforming = true; + else + conforming = false; + + if (!conforming && (rpl > dpl || cpl > dpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + *seg_desc = usd_to_seg_desc(&usd); + return (0); +} + +static void +tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, + uint32_t eip, struct tss32 *tss, struct iovec *iov) +{ + + /* General purpose registers */ + tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); + tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); + tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); + tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); + tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); + tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); + tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + + /* Segment selectors */ + tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); + tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); + tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); + tss->tss_gs = 
GETREG(ctx, vcpu, VM_REG_GUEST_GS); + + /* eflags and eip */ + tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + if (task_switch->reason == TSR_IRET) + tss->tss_eflags &= ~PSL_NT; + tss->tss_eip = eip; + + /* Copy updated old TSS into guest memory */ + vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); +} + +static void +update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +{ + int error; + + error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); + assert(error == 0); +} + +/* + * Update the vcpu registers to reflect the state of the new task. + */ +static int +tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) +{ + struct seg_desc seg_desc, seg_desc2; + uint64_t *pdpte, maxphyaddr, reserved; + uint32_t eflags; + int error, i; + bool nested; + + nested = false; + if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { + tss->tss_link = ot_sel; + nested = true; + } + + eflags = tss->tss_eflags; + if (nested) + eflags |= PSL_NT; + + /* LDTR */ + SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + + /* PBDR */ + if (ts->paging.paging_mode != PAGING_MODE_FLAT) { + if (ts->paging.paging_mode == PAGING_MODE_PAE) { + /* + * XXX Assuming 36-bit MAXPHYADDR. + */ + maxphyaddr = (1UL << 36) - 1; + pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); + for (i = 0; i < 4; i++) { + /* Check reserved bits if the PDPTE is valid */ + if (!(pdpte[i] & 0x1)) + continue; + /* + * Bits 2:1, 8:5 and bits above the processor's + * maximum physical address are reserved. 
+ */ + reserved = ~maxphyaddr | 0x1E6; + if (pdpte[i] & reserved) { + vm_inject_gp(ctx, vcpu); + return (1); + } + } + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + } + SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + ts->paging.cr3 = tss->tss_cr3; + } + + /* eflags and eip */ + SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + + /* General purpose registers */ + SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + + /* Segment selectors */ + SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + + /* + * If this is a nested task then write out the new TSS to update + * the previous link field. + */ + if (nested) + vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + + /* Validate segment descriptors */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + + /* + * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. + * + * The SS and CS attribute checks on VM-entry are inter-dependent so + * we need to make sure that both segments are valid before updating + * either of them. 
This ensures that the VMCS state can pass the + * VM-entry checks so the guest can handle any exception injected + * during task switch emulation. + */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); + ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + + return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). 
+ */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + int task_type, uint32_t errcode, int *faultptr) +{ + struct iovec iov[2]; + struct seg_desc seg_desc; + int stacksize, bytes, error; + uint64_t gla, cr0, rflags; + uint32_t esp; + uint16_t stacksel; + + *faultptr = 0; + + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, + &seg_desc.limit, &seg_desc.access); + assert(error == 0); + + /* + * Section "Error Code" in the Intel SDM vol 3: the error code is + * pushed on the stack as a doubleword or word (depending on the + * default interrupt, trap or task gate size). + */ + if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) + bytes = 4; + else + bytes = 2; + + /* + * PUSH instruction from Intel SDM vol 2: the 'B' flag in the + * stack-segment descriptor determines the size of the stack + * pointer outside of 64-bit mode. + */ + if (SEG_DESC_DEF32(seg_desc.access)) + stacksize = 4; + else + stacksize = 2; + + esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + esp -= bytes; + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, + &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { + sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); + *faultptr = 1; + return (0); + } + + if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + vm_inject_ac(ctx, vcpu, 1); + *faultptr = 1; + return (0); + } + + error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, + iov, nitems(iov), faultptr); + if (error || *faultptr) + return (error); + + vm_copyout(ctx, vcpu, &errcode, iov, bytes); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); + return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. 
 */
/*
 * CHKERR() returns from the *enclosing* function:
 *   - VMEXIT_ABORT if 'error' is non-zero (only EFAULT is expected)
 *   - VMEXIT_CONTINUE if 'fault' is non-zero
 * and falls through when both are zero.
 */
#define	CHKERR(error,fault)						\
	do {								\
		assert((error == 0) || (error == EFAULT));		\
		if (error)						\
			return (VMEXIT_ABORT);				\
		else if (fault)						\
			return (VMEXIT_CONTINUE);			\
	} while (0)

/*
 * Handle a task switch VM exit for the vcpu identified by '*pvcpu'.
 *
 * The sequence is: validate the new TSS descriptor, read the new and old
 * TSS from guest memory, update the busy bits, save the outgoing state in
 * the old TSS, load TR and CR0.TS, restore the incoming state from the new
 * TSS and finally push an error code if the exit carried one.
 *
 * Returns VMEXIT_CONTINUE when the switch completed or an exception was
 * injected into the guest, and VMEXIT_ABORT on a helper error or when the
 * target is a 16-bit TSS.
 */
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the instruction pointer to store in the old TSS.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
	    &fault);
	CHKERR(error, fault);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	/* The type check above leaves only the four TSS types. */
	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Get the old TSS
	 *
	 * NOTE(review): the length mapped and copied here is 'minlimit + 1',
	 * which was derived from the type of the *new* TSS; 'ot_lim' is
	 * fetched above but not consulted.  Confirm this is intended.
	 */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
	CHKERR(error, fault);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc, &fault);
		CHKERR(error, fault);
	}

	/* Only 32-bit TSS images can be saved and restored below. */
	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc, &fault);
		CHKERR(error, fault);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
	assert(error == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
	    &fault);
	CHKERR(error, fault);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode, &fault);
		CHKERR(error, fault);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}
diff --git a/bhyve/uart_emul.c b/bhyve/uart_emul.c new file mode 100644 index 0000000..4242e5c --- /dev/null +++ b/bhyve/uart_emul.c @@ -0,0 +1,657 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mevent.h" +#include "uart_emul.h" + +#define COM1_BASE 0x3F8 +#define COM1_IRQ 4 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 + +#define DEFAULT_RCLK 1843200 +#define DEFAULT_BAUD 9600 + +#define FCR_RX_MASK 0xC0 + +#define MCR_OUT1 0x04 +#define MCR_OUT2 0x08 + +#define MSR_DELTA_MASK 0x0f + +#ifndef REG_SCR +#define REG_SCR com_scr +#endif + +#define FIFOSZ 16 + +static bool uart_stdio; /* stdio in use for i/o */ +static struct termios tio_stdio_orig; + +static struct { + int baseaddr; + int irq; + bool inuse; +} uart_lres[] = { + { COM1_BASE, COM1_IRQ, false}, + { COM2_BASE, COM2_IRQ, false}, +}; + +#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0])) + +struct fifo { + uint8_t buf[FIFOSZ]; + int rindex; /* index to read from */ + int windex; /* index to write to */ + int num; /* number of characters in the fifo */ + int size; /* size of the fifo */ +}; + +struct ttyfd { + bool opened; + int fd; /* tty device file descriptor */ + struct termios tio_orig, tio_new; /* I/O Terminals */ +}; + +struct uart_softc { + pthread_mutex_t mtx; /* protects all softc elements */ + uint8_t data; /* Data register (R/W) */ + uint8_t ier; /* Interrupt enable register (R/W) */ + uint8_t lcr; /* Line control register (R/W) */ + uint8_t mcr; /* Modem control register (R/W) */ + uint8_t lsr; /* Line status register (R/W) */ + uint8_t msr; /* Modem status register (R/W) */ + uint8_t fcr; /* FIFO control register (W) */ + uint8_t scr; /* Scratch register (R/W) */ + + uint8_t dll; /* Baudrate divisor latch LSB */ + uint8_t dlh; /* Baudrate divisor latch MSB */ + + struct fifo rxfifo; + struct mevent *mev; + + struct ttyfd tty; + bool thre_int_pending; /* THRE interrupt pending */ + + void *arg; + uart_intr_func_t intr_assert; + uart_intr_func_t intr_deassert; +}; + +static void uart_drain(int 
fd, enum ev_type ev, void *arg); + +static void +ttyclose(void) +{ + + tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); +} + +static void +ttyopen(struct ttyfd *tf) +{ + + tcgetattr(tf->fd, &tf->tio_orig); + + tf->tio_new = tf->tio_orig; + cfmakeraw(&tf->tio_new); + tf->tio_new.c_cflag |= CLOCAL; + tcsetattr(tf->fd, TCSANOW, &tf->tio_new); + + if (tf->fd == STDIN_FILENO) { + tio_stdio_orig = tf->tio_orig; + atexit(ttyclose); + } +} + +static int +ttyread(struct ttyfd *tf) +{ + unsigned char rb; + + if (read(tf->fd, &rb, 1) == 1) + return (rb); + else + return (-1); +} + +static void +ttywrite(struct ttyfd *tf, unsigned char wb) +{ + + (void)write(tf->fd, &wb, 1); +} + +static void +rxfifo_reset(struct uart_softc *sc, int size) +{ + char flushbuf[32]; + struct fifo *fifo; + ssize_t nread; + int error; + + fifo = &sc->rxfifo; + bzero(fifo, sizeof(struct fifo)); + fifo->size = size; + + if (sc->tty.opened) { + /* + * Flush any unread input from the tty buffer. + */ + while (1) { + nread = read(sc->tty.fd, flushbuf, sizeof(flushbuf)); + if (nread != sizeof(flushbuf)) + break; + } + + /* + * Enable mevent to trigger when new characters are available + * on the tty fd. + */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +} + +static int +rxfifo_available(struct uart_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->rxfifo; + return (fifo->num < fifo->size); +} + +static int +rxfifo_putchar(struct uart_softc *sc, uint8_t ch) +{ + struct fifo *fifo; + int error; + + fifo = &sc->rxfifo; + + if (fifo->num < fifo->size) { + fifo->buf[fifo->windex] = ch; + fifo->windex = (fifo->windex + 1) % fifo->size; + fifo->num++; + if (!rxfifo_available(sc)) { + if (sc->tty.opened) { + /* + * Disable mevent callback if the FIFO is full. 
+ */ + error = mevent_disable(sc->mev); + assert(error == 0); + } + } + return (0); + } else + return (-1); +} + +static int +rxfifo_getchar(struct uart_softc *sc) +{ + struct fifo *fifo; + int c, error, wasfull; + + wasfull = 0; + fifo = &sc->rxfifo; + if (fifo->num > 0) { + if (!rxfifo_available(sc)) + wasfull = 1; + c = fifo->buf[fifo->rindex]; + fifo->rindex = (fifo->rindex + 1) % fifo->size; + fifo->num--; + if (wasfull) { + if (sc->tty.opened) { + error = mevent_enable(sc->mev); + assert(error == 0); + } + } + return (c); + } else + return (-1); +} + +static int +rxfifo_numchars(struct uart_softc *sc) +{ + struct fifo *fifo = &sc->rxfifo; + + return (fifo->num); +} + +static void +uart_opentty(struct uart_softc *sc) +{ + + ttyopen(&sc->tty); + sc->mev = mevent_add(sc->tty.fd, EVF_READ, uart_drain, sc); + assert(sc->mev != NULL); +} + +/* + * The IIR returns a prioritized interrupt reason: + * - receive data available + * - transmit holding register empty + * - modem status change + * + * Return an interrupt reason if one is available. + */ +static int +uart_intr_reason(struct uart_softc *sc) +{ + + if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) + return (IIR_RLS); + else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) + return (IIR_RXTOUT); + else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) + return (IIR_TXRDY); + else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0) + return (IIR_MLSC); + else + return (IIR_NOPEND); +} + +static void +uart_reset(struct uart_softc *sc) +{ + uint16_t divisor; + + divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; + sc->dll = divisor; + sc->dlh = divisor >> 16; + + rxfifo_reset(sc, 1); /* no fifo until enabled by software */ +} + +/* + * Toggle the COM port's intr pin depending on whether or not we have an + * interrupt condition to report to the processor. 
+ */ +static void +uart_toggle_intr(struct uart_softc *sc) +{ + uint8_t intr_reason; + + intr_reason = uart_intr_reason(sc); + + if (intr_reason == IIR_NOPEND) + (*sc->intr_deassert)(sc->arg); + else + (*sc->intr_assert)(sc->arg); +} + +static void +uart_drain(int fd, enum ev_type ev, void *arg) +{ + struct uart_softc *sc; + int ch; + + sc = arg; + + assert(fd == sc->tty.fd); + assert(ev == EVF_READ); + + /* + * This routine is called in the context of the mevent thread + * to take out the softc lock to protect against concurrent + * access from a vCPU i/o exit + */ + pthread_mutex_lock(&sc->mtx); + + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) ttyread(&sc->tty); + } else { + while (rxfifo_available(sc) && + ((ch = ttyread(&sc->tty)) != -1)) { + rxfifo_putchar(sc, ch); + } + uart_toggle_intr(sc); + } + + pthread_mutex_unlock(&sc->mtx); +} + +void +uart_write(struct uart_softc *sc, int offset, uint8_t value) +{ + int fifosz; + uint8_t msr; + + pthread_mutex_lock(&sc->mtx); + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + sc->dll = value; + goto done; + } + + if (offset == REG_DLH) { + sc->dlh = value; + goto done; + } + } + + switch (offset) { + case REG_DATA: + if (sc->mcr & MCR_LOOPBACK) { + if (rxfifo_putchar(sc, value) != 0) + sc->lsr |= LSR_OE; + } else if (sc->tty.opened) { + ttywrite(&sc->tty, value); + } /* else drop on floor */ + sc->thre_int_pending = true; + break; + case REG_IER: + /* + * Apply mask so that bits 4-7 are 0 + * Also enables bits 0-3 only if they're 1 + */ + sc->ier = value & 0x0F; + break; + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + rxfifo_reset(sc, fifosz); + } + + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. 
+ */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + rxfifo_reset(sc, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + + msr = 0; + if (sc->mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the + * MCR are reflected back into MSR + */ + if (sc->mcr & MCR_RTS) + msr |= MSR_CTS; + if (sc->mcr & MCR_DTR) + msr |= MSR_DSR; + if (sc->mcr & MCR_OUT1) + msr |= MSR_RI; + if (sc->mcr & MCR_OUT2) + msr |= MSR_DCD; + } + + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. 
+ */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); +} + +uint8_t +uart_read(struct uart_softc *sc, int offset) +{ + uint8_t iir, intr_reason, reg; + + pthread_mutex_lock(&sc->mtx); + + /* + * Take care of the special case DLAB accesses first + */ + if ((sc->lcr & LCR_DLAB) != 0) { + if (offset == REG_DLL) { + reg = sc->dll; + goto done; + } + + if (offset == REG_DLH) { + reg = sc->dlh; + goto done; + } + } + + switch (offset) { + case REG_DATA: + reg = rxfifo_getchar(sc); + break; + case REG_IER: + reg = sc->ier; + break; + case REG_IIR: + iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0; + + intr_reason = uart_intr_reason(sc); + + /* + * Deal with side effects of reading the IIR register + */ + if (intr_reason == IIR_TXRDY) + sc->thre_int_pending = false; + + iir |= intr_reason; + + reg = iir; + break; + case REG_LCR: + reg = sc->lcr; + break; + case REG_MCR: + reg = sc->mcr; + break; + case REG_LSR: + /* Transmitter is always ready for more data */ + sc->lsr |= LSR_TEMT | LSR_THRE; + + /* Check for new receive data */ + if (rxfifo_numchars(sc) > 0) + sc->lsr |= LSR_RXRDY; + else + sc->lsr &= ~LSR_RXRDY; + + reg = sc->lsr; + + /* The LSR_OE bit is cleared on LSR read */ + sc->lsr &= ~LSR_OE; + break; + case REG_MSR: + /* + * MSR delta bits are cleared on read + */ + reg = sc->msr; + sc->msr &= ~MSR_DELTA_MASK; + break; + case REG_SCR: + reg = sc->scr; + break; + default: + reg = 0xFF; + break; + } + +done: + uart_toggle_intr(sc); + pthread_mutex_unlock(&sc->mtx); + + return (reg); +} + +int +uart_legacy_alloc(int which, int *baseaddr, int *irq) +{ + + if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse) + return (-1); + + uart_lres[which].inuse = true; + *baseaddr = uart_lres[which].baseaddr; + *irq = uart_lres[which].irq; + + return (0); +} + +struct uart_softc * +uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, + void *arg) +{ + 
struct uart_softc *sc; + + sc = calloc(1, sizeof(struct uart_softc)); + + sc->arg = arg; + sc->intr_assert = intr_assert; + sc->intr_deassert = intr_deassert; + + pthread_mutex_init(&sc->mtx, NULL); + + uart_reset(sc); + + return (sc); +} + +static int +uart_tty_backend(struct uart_softc *sc, const char *opts) +{ + int fd; + int retval; + + retval = -1; + + fd = open(opts, O_RDWR | O_NONBLOCK); + if (fd > 0 && isatty(fd)) { + sc->tty.fd = fd; + sc->tty.opened = true; + retval = 0; + } + + return (retval); +} + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ + int retval; + + retval = -1; + + if (opts == NULL) + return (0); + + if (strcmp("stdio", opts) == 0) { + if (!uart_stdio) { + sc->tty.fd = STDIN_FILENO; + sc->tty.opened = true; + uart_stdio = true; + retval = 0; + } + } else if (uart_tty_backend(sc, opts) == 0) { + retval = 0; + } + + /* Make the backend file descriptor non-blocking */ + if (retval == 0) + retval = fcntl(sc->tty.fd, F_SETFL, O_NONBLOCK); + + if (retval == 0) + uart_opentty(sc); + + return (retval); +} diff --git a/bhyve/uart_emul.h b/bhyve/uart_emul.h new file mode 100644 index 0000000..993b92e --- /dev/null +++ b/bhyve/uart_emul.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _UART_EMUL_H_ +#define _UART_EMUL_H_ + + +#define UART_IO_BAR_SIZE 8 + +struct uart_softc; + +typedef void (*uart_intr_func_t)(void *arg); +struct uart_softc *uart_init(uart_intr_func_t intr_assert, + uart_intr_func_t intr_deassert, void *arg); + +int uart_legacy_alloc(int unit, int *ioaddr, int *irq); +uint8_t uart_read(struct uart_softc *sc, int offset); +void uart_write(struct uart_softc *sc, int offset, uint8_t value); +int uart_set_backend(struct uart_softc *sc, const char *opt); +#endif diff --git a/bhyve/virtio.c b/bhyve/virtio.c new file mode 100644 index 0000000..11b1e62 --- /dev/null +++ b/bhyve/virtio.c @@ -0,0 +1,777 @@ +/*- + * Copyright (c) 2013 Chris Torek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. 
+ */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_pi = pi; + pi->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + * + * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + if (vs->vs_mtx) + assert(pthread_mutex_isowned_np(vs->vs_mtx)); + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; + vq->vq_pfn = 0; + vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + if (vs->vs_isr) + pci_lintr_deassert(vs->vs_pi); + vs->vs_isr = 0; + vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; +} + +/* + * Set I/O BAR (usually 0) to map PCI config registers. + */ +void +vi_set_io_bar(struct virtio_softc *vs, int barnum) +{ + size_t size; + + /* + * ??? should we use CFG0 if MSI-X is disabled? + * Existing code did not... + */ + size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; + pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); +} + +/* + * Initialize MSI-X vector capabilities if we're to use MSI-X, + * or MSI capabilities if not. + * + * We assume we want one MSI-X vector per queue, here, plus one + * for the config vec. 
+ */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + int nvec; + + if (use_msix) { + vs->vs_flags |= VIRTIO_USE_MSIX; + VS_LOCK(vs); + vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ + VS_UNLOCK(vs); + nvec = vs->vs_vc->vc_nvq + 1; + if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) + return (1); + } else + vs->vs_flags &= ~VIRTIO_USE_MSIX; + + /* Only 1 MSI vector for bhyve */ + pci_emul_add_msicap(vs->vs_pi, 1); + + /* Legacy interrupts are mandatory for virtio devices */ + pci_lintr_request(vs->vs_pi); + + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = (uint64_t)pfn << VRING_PFN; + size = vring_size(vq->vq_qsize); + base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); + + /* First page(s) are descriptors... */ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next page... */ + base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. */ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; + vq->vq_save_used = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. 
+ */
+static inline void
+_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
+	   struct iovec *iov, int n_iov, uint16_t *flags) {
+
+	/* The caller's arrays are full: skip the store (caller still counts). */
+	if (i >= n_iov)
+		return;
+	iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len);
+	iov[i].iov_len = vd->vd_len;
+	if (flags != NULL)
+		flags[i] = vd->vd_flags;
+}
+#define	VQ_MAX_DESCRIPTORS	512	/* see below */
+
+/*
+ * Examine the chain of descriptors starting at the "next one" to
+ * make sure that they describe a sensible request.  If so, return
+ * the number of "real" descriptors that would be needed/used in
+ * acting on this request.  This may be smaller than the number of
+ * available descriptors, e.g., if there are two available but
+ * they are two separate requests, this just returns 1.  Or, it
+ * may be larger: if there are indirect descriptors involved,
+ * there may only be one descriptor available but it may be an
+ * indirect pointing to eight more.  We return 8 in this case,
+ * i.e., we do not count the indirect descriptors, only the "real"
+ * ones.
+ *
+ * Basically, this vets the vd_flags and vd_next field of each
+ * descriptor and tells you how many are involved.  Since some may
+ * be indirect, this also needs the vmctx (in the pci_devinst
+ * at vs->vs_pi) so that it can find indirect descriptors.
+ *
+ * As we process each descriptor, we copy and adjust it (guest to
+ * host address wise, also using the vmctx) into the given iov[]
+ * array (of the given size).  If the array overflows, we stop
+ * placing values into the array but keep processing descriptors,
+ * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
+ * So you, the caller, must not assume that iov[] is as big as the
+ * return value (you can process the same thing twice to allocate
+ * a larger iov array if needed, or supply a zero length to find
+ * out how much space is needed).
+ *
+ * If you want to verify the WRITE flag on each descriptor, pass a
+ * non-NULL "flags" pointer to an array of "uint16_t" of the same size
+ * as n_iov and we'll copy each vd_flags field after unwinding any
+ * indirects.
+ *
+ * The index of the head descriptor of the chain is stored in *pidx;
+ * hand that value back to vq_relchain() when the request is done.
+ *
+ * If some descriptor(s) are invalid, this prints a diagnostic message
+ * and returns -1.  If no descriptors are ready now it simply returns 0.
+ *
+ * You are assumed to have done a vq_ring_ready() if needed (note
+ * that vq_has_descs() does one).
+ */
+int
+vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
+	    struct iovec *iov, int n_iov, uint16_t *flags)
+{
+	int i;
+	u_int ndesc, n_indir;
+	u_int idx, next;
+	volatile struct virtio_desc *vdir, *vindir, *vp;
+	struct vmctx *ctx;
+	struct virtio_softc *vs;
+	const char *name;
+
+	vs = vq->vq_vs;
+	name = vs->vs_vc->vc_name;
+
+	/*
+	 * Note: it's the responsibility of the guest not to
+	 * update vq->vq_avail->va_idx until all of the descriptors
+	 * the guest has written are valid (including all their
+	 * vd_next fields and vd_flags).
+	 *
+	 * Compute (va_idx - last_avail) in integers mod 2**16.  This is
+	 * the number of descriptor chains the guest driver has made
+	 * available since the last time we updated vq->vq_last_avail.
+	 *
+	 * We just need to do the subtraction as an unsigned int,
+	 * then trim off excess bits.
+	 */
+	idx = vq->vq_last_avail;
+	ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
+	if (ndesc == 0)
+		return (0);
+	if (ndesc > vq->vq_qsize) {
+		/* XXX need better way to diagnose issues */
+		fprintf(stderr,
+		    "%s: ndesc (%u) out of range, driver confused?\r\n",
+		    name, (u_int)ndesc);
+		return (-1);
+	}
+
+	/*
+	 * Now count/parse "involved" descriptors starting from
+	 * the head of the chain.
+	 *
+	 * To prevent loops, we could be more complicated and
+	 * check whether we're re-visiting a previously visited
+	 * index, but we just abort if the count gets excessive.
+	 */
+	ctx = vs->vs_pi->pi_vmctx;
+	/* vq_qsize is a power of 2, so the mask computes idx % vq_qsize. */
+	*pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)];
+	/* The chain is consumed from here on, even if validation fails. */
+	vq->vq_last_avail++;
+	for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
+		if (next >= vq->vq_qsize) {
+			fprintf(stderr,
+			    "%s: descriptor index %u out of range, "
+			    "driver confused?\r\n",
+			    name, next);
+			return (-1);
+		}
+		vdir = &vq->vq_desc[next];
+		if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+			_vq_record(i, vdir, ctx, iov, n_iov, flags);
+			i++;
+		} else if ((vs->vs_vc->vc_hv_caps &
+		    VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+			fprintf(stderr,
+			    "%s: descriptor has forbidden INDIRECT flag, "
+			    "driver confused?\r\n",
+			    name);
+			return (-1);
+		} else {
+			/* Each indirect descriptor is 16 bytes. */
+			n_indir = vdir->vd_len / 16;
+			if ((vdir->vd_len & 0xf) || n_indir == 0) {
+				fprintf(stderr,
+				    "%s: invalid indir len 0x%x, "
+				    "driver confused?\r\n",
+				    name, (u_int)vdir->vd_len);
+				return (-1);
+			}
+			/* NOTE(review): the mapping result is not checked. */
+			vindir = paddr_guest2host(ctx,
+			    vdir->vd_addr, vdir->vd_len);
+			/*
+			 * Indirects start at the 0th, then follow
+			 * their own embedded "next"s until those run
+			 * out.  Each one's indirect flag must be off
+			 * (we don't really have to check, could just
+			 * ignore errors...).
+			 */
+			next = 0;
+			for (;;) {
+				vp = &vindir[next];
+				if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
+					fprintf(stderr,
+					    "%s: indirect desc has INDIR flag,"
+					    " driver confused?\r\n",
+					    name);
+					return (-1);
+				}
+				_vq_record(i, vp, ctx, iov, n_iov, flags);
+				if (++i > VQ_MAX_DESCRIPTORS)
+					goto loopy;
+				if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
+					break;
+				next = vp->vd_next;
+				if (next >= n_indir) {
+					fprintf(stderr,
+					    "%s: invalid next %u > %u, "
+					    "driver confused?\r\n",
+					    name, (u_int)next, n_indir);
+					return (-1);
+				}
+			}
+		}
+		/* No NEXT flag on the direct descriptor: chain is complete. */
+		if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
+			return (i);
+	}
+loopy:
+	fprintf(stderr,
+	    "%s: descriptor loop? count > %d - driver confused?\r\n",
+	    name, i);
+	return (-1);
+}
+
+/*
+ * Return the currently-first request chain back to the available queue.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_retchain(struct vqueue_info *vq)
+{
+
+	/* Undo the vq_last_avail increment so the chain is seen again. */
+	vq->vq_last_avail--;
+}
+
+/*
+ * Return specified request chain to the guest, setting its I/O length
+ * to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
+{
+	uint16_t uidx, mask;
+	volatile struct vring_used *vuh;
+	volatile struct virtio_used *vue;
+
+	/*
+	 * Notes:
+	 *  - mask is N-1 where N is a power of 2 so computes x % N
+	 *  - vuh points to the "used" data shared with guest
+	 *  - vue points to the "used" ring entry we want to update
+	 *  - idx is the chain's head descriptor index, i.e. the value
+	 *    vq_getchain() stored through its pidx argument.
+	 *
+	 * (I apologize for the two fields named vu_idx; the
+	 * virtio spec calls the one that vue points to, "id"...)
+	 */
+	mask = vq->vq_qsize - 1;
+	vuh = vq->vq_used;
+
+	uidx = vuh->vu_idx;
+	vue = &vuh->vu_ring[uidx++ & mask];
+	/* Fill in the ring entry first, then publish the new used index. */
+	vue->vu_idx = idx;
+	vue->vu_tlen = iolen;
+	vuh->vu_idx = uidx;
+}
+
+/*
+ * Driver has finished processing "available" chains and calling
+ * vq_relchain on each one.  If driver used all the available
+ * chains, used_all_avail should be set.
+ *
+ * If the "used" index moved we may need to inform the guest, i.e.,
+ * deliver an interrupt.  Even if the used index did NOT move we
+ * may need to deliver an interrupt, if the avail ring is empty and
+ * we are supposed to interrupt on empty.
+ *
+ * Note that used_all_avail is provided by the caller because it's
+ * a snapshot of the ring state when the caller decided to finish
+ * interrupt processing -- it's possible that descriptors became
+ * available after that point.  (It's also typically a constant
+ * 1/True as well.)
+ */
+void
+vq_endchains(struct vqueue_info *vq, int used_all_avail)
+{
+	struct virtio_softc *vs;
+	uint16_t event_idx, new_idx, old_idx;
+	int intr;
+
+	/*
+	 * Interrupt generation: if we're using EVENT_IDX,
+	 * interrupt if we've crossed the event threshold.
+	 * Otherwise interrupt is generated if we added "used" entries,
+	 * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
+	 *
+	 * In any case, though, if NOTIFY_ON_EMPTY is set and the
+	 * entire avail was processed, we need to interrupt always.
+	 */
+	vs = vq->vq_vs;
+	/* Remember where "used" stood last time and snapshot it again. */
+	old_idx = vq->vq_save_used;
+	vq->vq_save_used = new_idx = vq->vq_used->vu_idx;
+	if (used_all_avail &&
+	    (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
+		intr = 1;
+	else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
+		event_idx = VQ_USED_EVENT_IDX(vq);
+		/*
+		 * This calculation is per docs and the kernel
+		 * (see src/sys/dev/virtio/virtio_ring.h).
+		 */
+		intr = (uint16_t)(new_idx - event_idx - 1) <
+			(uint16_t)(new_idx - old_idx);
+	} else {
+		intr = new_idx != old_idx &&
+		    !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT);
+	}
+	if (intr)
+		vq_interrupt(vs, vq);
+}
+
+/* Note: these are in sorted order to make for a fast search */
+static struct config_reg {
+	uint16_t	cr_offset;	/* register offset */
+	uint8_t		cr_size;	/* size (bytes) */
+	uint8_t		cr_ro;		/* true => reg is read only */
+	const char	*cr_name;	/* name of reg */
+} config_regs[] = {
+	{ VTCFG_R_HOSTCAP,	4, 1, "HOSTCAP" },
+	{ VTCFG_R_GUESTCAP,	4, 0, "GUESTCAP" },
+	{ VTCFG_R_PFN,		4, 0, "PFN" },
+	{ VTCFG_R_QNUM,		2, 1, "QNUM" },
+	{ VTCFG_R_QSEL,		2, 0, "QSEL" },
+	{ VTCFG_R_QNOTIFY,	2, 0, "QNOTIFY" },
+	{ VTCFG_R_STATUS,	1, 0, "STATUS" },
+	{ VTCFG_R_ISR,		1, 0, "ISR" },
+	{ VTCFG_R_CFGVEC,	2, 0, "CFGVEC" },
+	{ VTCFG_R_QVEC,		2, 0, "QVEC" },
+};
+
+/*
+ * Binary-search config_regs[] for the register that starts at exactly
+ * "offset".  Returns the matching entry, or NULL if no register
+ * starts there.
+ *
+ * The search indices are signed on purpose: when the probe lands on
+ * entry 0 and has to move left, "hi" becomes -1 and the loop ends.
+ * With unsigned indices that step wraps around to UINT_MAX, and the
+ * next probe would index far outside the table.
+ */
+static inline struct config_reg *
+vi_find_cr(int offset) {
+	int hi, lo, mid;
+	struct config_reg *cr;
+
+	lo = 0;
+	hi = (int)(sizeof(config_regs) / sizeof(*config_regs)) - 1;
+	while (hi >= lo) {
+		mid = lo + (hi - lo) / 2;
+		cr = &config_regs[mid];
+		if (cr->cr_offset == offset)
+			return (cr);
+		if (cr->cr_offset < offset)
+			lo = mid + 1;
+		else
+			hi = mid - 1;
+	}
+	return (NULL);
+}
+
+/*
+ * Handle pci config space reads.
+ * If it's to the MSI-X info, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ *
+ * Returns the value read.  A read that cannot be satisfied (bad
+ * size, unknown offset, driver refusal) returns all-ones for the
+ * access size after printing a diagnostic.
+ */
+uint64_t
+vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+	    int baridx, uint64_t offset, int size)
+{
+	struct virtio_softc *vs = pi->pi_arg;
+	struct virtio_consts *vc;
+	struct config_reg *cr;
+	uint64_t virtio_config_size, max;
+	const char *name;
+	uint32_t newoff;
+	uint32_t value;
+	int error;
+
+	if (vs->vs_flags & VIRTIO_USE_MSIX) {
+		if (baridx == pci_msix_table_bar(pi) ||
+		    baridx == pci_msix_pba_bar(pi)) {
+			return (pci_emul_msix_tread(pi, offset, size));
+		}
+	}
+
+	/* XXX probably should do something better than just assert() */
+	assert(baridx == 0);
+
+	if (vs->vs_mtx)
+		pthread_mutex_lock(vs->vs_mtx);
+
+	vc = vs->vs_vc;
+	name = vc->vc_name;
+	/* Default result: all-ones, as wide as the access. */
+	value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;
+
+	if (size != 1 && size != 2 && size != 4)
+		goto bad;
+
+	/* The device-specific area starts after the generic registers. */
+	if (pci_msix_enabled(pi))
+		virtio_config_size = VTCFG_R_CFG1;
+	else
+		virtio_config_size = VTCFG_R_CFG0;
+
+	if (offset >= virtio_config_size) {
+		/*
+		 * Subtract off the standard size (including MSI-X
+		 * registers if enabled) and dispatch to underlying driver.
+		 * If that fails, fall into general code.
+		 */
+		newoff = offset - virtio_config_size;
+		/* A vc_cfgsize of 0 means "no limit" here. */
+		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
+		if (newoff + size > max)
+			goto bad;
+		error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
+		if (!error)
+			goto done;
+	}
+
+bad:
+	/*
+	 * Generic register access; this is also where rejected accesses
+	 * end up so that a diagnostic gets printed for them.
+	 */
+	cr = vi_find_cr(offset);
+	if (cr == NULL || cr->cr_size != size) {
+		if (cr != NULL) {
+			/* offset must be OK, so size must be bad */
+			fprintf(stderr,
+			    "%s: read from %s: bad size %d\r\n",
+			    name, cr->cr_name, size);
+		} else {
+			fprintf(stderr,
+			    "%s: read from bad offset/size %jd/%d\r\n",
+			    name, (uintmax_t)offset, size);
+		}
+		goto done;
+	}
+
+	switch (offset) {
+	case VTCFG_R_HOSTCAP:
+		value = vc->vc_hv_caps;
+		break;
+	case VTCFG_R_GUESTCAP:
+		value = vs->vs_negotiated_caps;
+		break;
+	case VTCFG_R_PFN:
+		/* With an invalid queue selected, value stays all-ones. */
+		if (vs->vs_curq < vc->vc_nvq)
+			value = vs->vs_queues[vs->vs_curq].vq_pfn;
+		break;
+	case VTCFG_R_QNUM:
+		/* A size of 0 tells the guest the selected queue is absent. */
+		value = vs->vs_curq < vc->vc_nvq ?
+		    vs->vs_queues[vs->vs_curq].vq_qsize : 0;
+		break;
+	case VTCFG_R_QSEL:
+		value = vs->vs_curq;
+		break;
+	case VTCFG_R_QNOTIFY:
+		value = 0;	/* XXX */
+		break;
+	case VTCFG_R_STATUS:
+		value = vs->vs_status;
+		break;
+	case VTCFG_R_ISR:
+		value = vs->vs_isr;
+		vs->vs_isr = 0;		/* a read clears this flag */
+		if (value)
+			pci_lintr_deassert(pi);
+		break;
+	case VTCFG_R_CFGVEC:
+		value = vs->vs_msix_cfg_idx;
+		break;
+	case VTCFG_R_QVEC:
+		value = vs->vs_curq < vc->vc_nvq ?
+		    vs->vs_queues[vs->vs_curq].vq_msix_idx :
+		    VIRTIO_MSI_NO_VECTOR;
+		break;
+	}
+done:
+	if (vs->vs_mtx)
+		pthread_mutex_unlock(vs->vs_mtx);
+	return (value);
+}
+
+/*
+ * Handle pci config space writes.
+ * If it's to the MSI-X info, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ */
+void
+vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+	     int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct virtio_softc *vs = pi->pi_arg;
+	struct vqueue_info *vq;
+	struct virtio_consts *vc;
+	struct config_reg *cr;
+	uint64_t virtio_config_size, max;
+	const char *name;
+	uint32_t newoff;
+	int error;
+
+	if (vs->vs_flags & VIRTIO_USE_MSIX) {
+		if (baridx == pci_msix_table_bar(pi) ||
+		    baridx == pci_msix_pba_bar(pi)) {
+			pci_emul_msix_twrite(pi, offset, size, value);
+			return;
+		}
+	}
+
+	/* XXX probably should do something better than just assert() */
+	assert(baridx == 0);
+
+	if (vs->vs_mtx)
+		pthread_mutex_lock(vs->vs_mtx);
+
+	vc = vs->vs_vc;
+	name = vc->vc_name;
+
+	if (size != 1 && size != 2 && size != 4)
+		goto bad;
+
+	/* The device-specific area starts after the generic registers. */
+	if (pci_msix_enabled(pi))
+		virtio_config_size = VTCFG_R_CFG1;
+	else
+		virtio_config_size = VTCFG_R_CFG0;
+
+	if (offset >= virtio_config_size) {
+		/*
+		 * Subtract off the standard size (including MSI-X
+		 * registers if enabled) and dispatch to underlying driver.
+		 */
+		newoff = offset - virtio_config_size;
+		/* A vc_cfgsize of 0 means "no limit" here. */
+		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
+		if (newoff + size > max)
+			goto bad;
+		error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
+		if (!error)
+			goto done;
+	}
+
+bad:
+	/*
+	 * Generic register access; this is also where rejected accesses
+	 * end up so that a diagnostic gets printed for them.
+	 */
+	cr = vi_find_cr(offset);
+	if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
+		if (cr != NULL) {
+			/* offset must be OK, wrong size and/or reg is R/O */
+			if (cr->cr_size != size)
+				fprintf(stderr,
+				    "%s: write to %s: bad size %d\r\n",
+				    name, cr->cr_name, size);
+			if (cr->cr_ro)
+				fprintf(stderr,
+				    "%s: write to read-only reg %s\r\n",
+				    name, cr->cr_name);
+		} else {
+			fprintf(stderr,
+			    "%s: write to bad offset/size %jd/%d\r\n",
+			    name, (uintmax_t)offset, size);
+		}
+		goto done;
+	}
+
+	/* Registers with no case below are accepted and ignored. */
+	switch (offset) {
+	case VTCFG_R_GUESTCAP:
+		/* The guest may only enable features the host offered. */
+		vs->vs_negotiated_caps = value & vc->vc_hv_caps;
+		if (vc->vc_apply_features)
+			(*vc->vc_apply_features)(DEV_SOFTC(vs),
+			    vs->vs_negotiated_caps);
+		break;
+	case VTCFG_R_PFN:
+		if (vs->vs_curq >= vc->vc_nvq)
+			goto bad_qindex;
+		vi_vq_init(vs, value);
+		break;
+	case VTCFG_R_QSEL:
+		/*
+		 * Note that the guest is allowed to select an
+		 * invalid queue; we just need to return a QNUM
+		 * of 0 while the bad queue is selected.
+		 */
+		vs->vs_curq = value;
+		break;
+	case VTCFG_R_QNOTIFY:
+		if (value >= vc->vc_nvq) {
+			fprintf(stderr, "%s: queue %d notify out of range\r\n",
+				name, (int)value);
+			goto done;
+		}
+		vq = &vs->vs_queues[value];
+		/* A per-queue handler takes precedence over the device one. */
+		if (vq->vq_notify)
+			(*vq->vq_notify)(DEV_SOFTC(vs), vq);
+		else if (vc->vc_qnotify)
+			(*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
+		else
+			fprintf(stderr,
+			    "%s: qnotify queue %d: missing vq/vc notify\r\n",
+				name, (int)value);
+		break;
+	case VTCFG_R_STATUS:
+		vs->vs_status = value;
+		/* Writing 0 is the guest's request for a device reset. */
+		if (value == 0)
+			(*vc->vc_reset)(DEV_SOFTC(vs));
+		break;
+	case VTCFG_R_CFGVEC:
+		vs->vs_msix_cfg_idx = value;
+		break;
+	case VTCFG_R_QVEC:
+		if (vs->vs_curq >= vc->vc_nvq)
+			goto bad_qindex;
+		vq = &vs->vs_queues[vs->vs_curq];
+		vq->vq_msix_idx = value;
+		break;
+	}
+	goto done;
+
+bad_qindex:
+	fprintf(stderr,
+	    "%s: write config reg %s: curq %d >= max %d\r\n",
+	    name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
+done:
+	if (vs->vs_mtx)
+		pthread_mutex_unlock(vs->vs_mtx);
+}
diff --git a/bhyve/virtio.h b/bhyve/virtio.h
new file mode 100644
index 0000000..0e96a1d
--- /dev/null
+++ b/bhyve/virtio.h
@@ -0,0 +1,464 @@
+/*-
+ * Copyright (c) 2013  Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_VIRTIO_H_
+#define	_VIRTIO_H_
+
+/*
+ * These are derived from several virtio specifications.
+ *
+ * Some useful links:
+ *    https://github.com/rustyrussell/virtio-spec
+ *    http://people.redhat.com/pbonzini/virtio-spec.pdf
+ */
+
+/*
+ * A virtual device has zero or more "virtual queues" (virtqueue).
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ *      +-----------------------------------------------+
+ *      |    "desc":  <N> descriptors, 16 bytes each    |
+ *      |   -----------------------------------------   |
+ *      |   "avail":   2 uint16; <N> uint16; 1 uint16   |
+ *      |   -----------------------------------------   |
+ *      |              pad to 4k boundary               |
+ *      +-----------------------------------------------+
+ *      |   "used": 2 x uint16; <N> elems; 1 uint16     |
+ *      |   -----------------------------------------   |
+ *      |              pad to 4k boundary               |
+ *      +-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages.  In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set:
+ *	NEXT    descriptor is chained, so use its "next" field
+ *	WRITE   descriptor is for host to write into guest RAM
+ *		(else host is to read from guest RAM)
+ *	INDIRECT   descriptor address field is (guest physical)
+ *		address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>.  If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16).  Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially).  However, INDIRECT must not be set
+ * in the indirect descriptors.  Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval).  (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver.  These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>.  The <N> <ring> entries are simply
+ * indices into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value).  However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below).  The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation.  Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+#define VRING_ALIGN	4096
+
+#define VRING_DESC_F_NEXT	(1 << 0)
+#define VRING_DESC_F_WRITE	(1 << 1)
+#define VRING_DESC_F_INDIRECT	(1 << 2)
+
+struct virtio_desc {			/* AKA vring_desc */
+	uint64_t	vd_addr;	/* guest physical address */
+	uint32_t	vd_len;		/* length of scatter/gather seg */
+	uint16_t	vd_flags;	/* VRING_DESC_F_* */
+	uint16_t	vd_next;	/* next desc if F_NEXT */
+} __packed;
+
+struct virtio_used {			/* AKA vring_used_elem */
+	uint32_t	vu_idx;		/* head of used descriptor chain */
+	uint32_t	vu_tlen;	/* length written-to */
+} __packed;
+
+#define VRING_AVAIL_F_NO_INTERRUPT   1
+
+struct vring_avail {
+	uint16_t	va_flags;	/* VRING_AVAIL_F_* */
+	uint16_t	va_idx;		/* counts to 65535, then cycles */
+	uint16_t	va_ring[];	/* size N, reported in QNUM value */
+/*	uint16_t	va_used_event;	-- after N ring entries */
+} __packed;
+
+#define	VRING_USED_F_NO_NOTIFY		1
+struct vring_used {
+	uint16_t	vu_flags;	/* VRING_USED_F_* */
+	uint16_t	vu_idx;		/* counts to 65535, then cycles */
+	struct
virtio_used vu_ring[];	/* size N */
+/*	uint16_t	vu_avail_event;	-- after N ring entries */
+} __packed;
+
+/*
+ * The address of any given virtual queue is determined by a single
+ * Page Frame Number register.  The guest writes the PFN into the
+ * PCI config space.  However, a device that has two or more
+ * virtqueues can have a different PFN, and size, for each queue.
+ * The number of queues is determinable via the PCI config space
+ * VTCFG_R_QSEL register.  Writes to QSEL select the queue: 0 means
+ * queue #0, 1 means queue#1, etc.  Once a queue is selected, the
+ * remaining PFN and QNUM registers refer to that queue.
+ *
+ * QNUM is a read-only register containing a nonzero power of two
+ * that indicates the (hypervisor's) queue size.  Or, if reading it
+ * produces zero, the hypervisor does not have a corresponding
+ * queue.  (The number of possible queues depends on the virtual
+ * device.  The block device has just one; the network device
+ * provides either two -- 0 = receive, 1 = transmit -- or three,
+ * with 2 = control.)
+ *
+ * PFN is a read/write register giving the physical page address of
+ * the virtqueue in guest memory (the guest must allocate enough space
+ * based on the hypervisor's provided QNUM).
+ *
+ * QNOTIFY is effectively write-only: when the guest writes a queue
+ * number to the register, the hypervisor should scan the specified
+ * virtqueue. (Reading QNOTIFY currently always gets 0).
+ */
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN               12
+
+/*
+ * Virtio device types
+ *
+ * XXX Should really be merged with <dev/virtio/virtio.h> defines
+ */
+#define	VIRTIO_TYPE_NET		1
+#define	VIRTIO_TYPE_BLOCK	2
+#define	VIRTIO_TYPE_CONSOLE	3
+#define	VIRTIO_TYPE_ENTROPY	4
+#define	VIRTIO_TYPE_BALLOON	5
+#define	VIRTIO_TYPE_IOMEMORY	6
+#define	VIRTIO_TYPE_RPMSG	7
+#define	VIRTIO_TYPE_SCSI	8
+#define	VIRTIO_TYPE_9P		9
+
+/* experimental IDs start at 65535 and work down */
+
+/*
+ * PCI vendor/device IDs
+ */
+#define	VIRTIO_VENDOR		0x1AF4
+#define	VIRTIO_DEV_NET		0x1000
+#define	VIRTIO_DEV_BLOCK	0x1001
+#define	VIRTIO_DEV_RANDOM	0x1002
+
+/*
+ * PCI config space constants.
+ *
+ * If MSI-X is enabled, the ISR register is generally not used,
+ * and the configuration vector and queue vector appear at offsets
+ * 20 and 22 with the remaining configuration registers at 24.
+ * If MSI-X is not enabled, those two registers disappear and
+ * the remaining configuration registers start at offset 20.
+ */
+#define VTCFG_R_HOSTCAP		0
+#define VTCFG_R_GUESTCAP	4
+#define VTCFG_R_PFN		8
+#define VTCFG_R_QNUM		12
+#define VTCFG_R_QSEL		14
+#define VTCFG_R_QNOTIFY		16
+#define VTCFG_R_STATUS		18
+#define VTCFG_R_ISR		19
+#define VTCFG_R_CFGVEC		20
+#define VTCFG_R_QVEC		22
+#define VTCFG_R_CFG0		20	/* No MSI-X */
+#define VTCFG_R_CFG1		24	/* With MSI-X */
+#define VTCFG_R_MSIX		20
+
+/*
+ * Bits in VTCFG_R_STATUS.  Guests need not actually set any of these,
+ * but a guest writing 0 to this register means "please reset".
+ */
+#define	VTCFG_STATUS_ACK	0x01	/* guest OS has acknowledged dev */
+#define	VTCFG_STATUS_DRIVER	0x02	/* guest OS driver is loaded */
+#define	VTCFG_STATUS_DRIVER_OK	0x04	/* guest OS driver ready */
+#define	VTCFG_STATUS_FAILED	0x80	/* guest has given up on this dev */
+
+/*
+ * Bits in VTCFG_R_ISR.  These apply only if not using MSI-X.
+ *
+ * (We don't [yet?] ever use CONF_CHANGED.)
+ */
+#define	VTCFG_ISR_QUEUES	0x01	/* re-scan queues */
+#define	VTCFG_ISR_CONF_CHANGED	0x80	/* configuration changed */
+
+#define VIRTIO_MSI_NO_VECTOR	0xFFFF
+
+/*
+ * Feature flags.
+ * Note: bits 0 through 23 are reserved to each device type.
+ */
+#define	VIRTIO_F_NOTIFY_ON_EMPTY	(1 << 24)
+#define	VIRTIO_RING_F_INDIRECT_DESC	(1 << 28)
+#define	VIRTIO_RING_F_EVENT_IDX		(1 << 29)
+
+/*
+ * From section 2.3, "Virtqueue Configuration", of the virtio specification.
+ *
+ * Returns the number of bytes of guest memory occupied by a virtqueue
+ * of qsz entries: descriptors plus "avail" ring, padded to VRING_ALIGN,
+ * followed by the "used" ring, again padded to VRING_ALIGN.
+ */
+static inline size_t
+vring_size(u_int qsz)
+{
+	size_t size;
+
+	/* constant 3 below = va_flags, va_idx, va_used_event */
+	size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz);
+	size = roundup2(size, VRING_ALIGN);
+
+	/* constant 3 below = vu_flags, vu_idx, vu_avail_event */
+	size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
+	size = roundup2(size, VRING_ALIGN);
+
+	return (size);
+}
+
+struct vmctx;
+struct pci_devinst;
+struct vqueue_info;
+
+/*
+ * A virtual device, with some number (possibly 0) of virtual
+ * queues and some size (possibly 0) of configuration-space
+ * registers private to the device.  The virtio_softc should come
+ * at the front of each "derived class", so that a pointer to the
+ * virtio_softc is also a pointer to the more specific, derived-
+ * from-virtio driver's softc.
+ *
+ * Note: inside each hypervisor virtio driver, changes to these
+ * data structures must be locked against other threads, if any.
+ * Except for PCI config space register read/write, we assume each
+ * driver does the required locking, but we need a pointer to the
+ * lock (if there is one) for PCI config space read/write ops.
+ *
+ * When the guest reads or writes the device's config space, the
+ * generic layer checks for operations on the special registers
+ * described above.  If the offset of the register(s) being read
+ * or written is past the CFG area (CFG0 or CFG1), the request is
+ * passed on to the virtual device, after subtracting off the
+ * generic-layer size.  (So, drivers can just use the offset as
+ * an offset into "struct config", for instance.)
+ *
+ * (The virtio layer also makes sure that the read or write is to/
+ * from a "good" config offset, hence vc_cfgsize, and on BAR #0.
+ * However, the driver must verify the read or write size and offset
+ * and that no one is writing a readonly register.)
+ *
+ * The BROKED flag ("this thing done gone and broked") is for future
+ * use.
+ */
+#define	VIRTIO_USE_MSIX		0x01
+#define	VIRTIO_EVENT_IDX	0x02	/* use the event-index values */
+#define	VIRTIO_BROKED		0x08	/* ??? */
+
+struct virtio_softc {
+	struct virtio_consts *vs_vc;	/* constants (see below) */
+	int	vs_flags;		/* VIRTIO_* flags from above */
+	pthread_mutex_t *vs_mtx;	/* POSIX mutex, if any */
+	struct pci_devinst *vs_pi;	/* PCI device instance */
+	uint32_t vs_negotiated_caps;	/* negotiated capabilities */
+	struct vqueue_info *vs_queues;	/* one per vc_nvq */
+	int	vs_curq;		/* current queue */
+	uint8_t	vs_status;		/* value from last status write */
+	uint8_t	vs_isr;			/* ISR flags, if not MSI-X */
+	uint16_t vs_msix_cfg_idx;	/* MSI-X vector for config event */
+};
+
+/*
+ * Take/release the device mutex, if the device has one.  The macro
+ * argument is parenthesized so that any pointer-valued expression
+ * (not just a plain identifier) can be passed; note that it is
+ * evaluated more than once.
+ */
+#define	VS_LOCK(vs)							\
+do {									\
+	if ((vs)->vs_mtx)						\
+		pthread_mutex_lock((vs)->vs_mtx);			\
+} while (0)
+
+#define	VS_UNLOCK(vs)							\
+do {									\
+	if ((vs)->vs_mtx)						\
+		pthread_mutex_unlock((vs)->vs_mtx);			\
+} while (0)
+
+struct virtio_consts {
+	const char *vc_name;		/* name of driver (for diagnostics) */
+	int	vc_nvq;			/* number of virtual queues */
+	size_t	vc_cfgsize;		/* size of dev-specific config regs */
+	void	(*vc_reset)(void *);	/* called on virtual device reset */
+	void	(*vc_qnotify)(void *, struct vqueue_info *);
+					/* called on QNOTIFY if no VQ notify */
+	int	(*vc_cfgread)(void *, int, int, uint32_t *);
+					/* called to read config regs */
+	int	(*vc_cfgwrite)(void *, int, int, uint32_t);
+					/* called to write config regs */
+	void    (*vc_apply_features)(void *, uint64_t);
+				/* called to apply negotiated features */
+	uint64_t vc_hv_caps;		/* hypervisor-provided capabilities */
+};
+
+/*
+ * Data structure allocated (statically) per virtual queue.
+ *
+ * Drivers may change vq_qsize after a reset.  When the guest OS
+ * requests a device reset, the hypervisor first calls
+ * vs->vs_vc->vc_reset(); then the data structure below is
+ * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
+ *
+ * The remaining fields should only be fussed-with by the generic
+ * code.
+ *
+ * Note: the addresses of vq_desc, vq_avail, and vq_used are all
+ * computable from each other, but it's a lot simpler if we just
+ * keep a pointer to each one.  The event indices are similarly
+ * (but more easily) computable, and this time we'll compute them:
+ * they're just XX_ring[N].
+ */
+#define	VQ_ALLOC	0x01	/* set once we have a pfn */
+#define	VQ_BROKED	0x02	/* ??? */
+struct vqueue_info {
+	uint16_t vq_qsize;	/* size of this queue (a power of 2) */
+	void	(*vq_notify)(void *, struct vqueue_info *);
+				/* called instead of vc_qnotify, if not NULL */
+
+	struct virtio_softc *vq_vs;	/* backpointer to softc */
+	uint16_t vq_num;	/* we're the num'th queue in the softc */
+
+	uint16_t vq_flags;	/* flags (see above) */
+	uint16_t vq_last_avail;	/* a recent value of vq_avail->va_idx */
+	uint16_t vq_save_used;	/* saved vq_used->vu_idx; see vq_endchains */
+	uint16_t vq_msix_idx;	/* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
+
+	uint32_t vq_pfn;	/* PFN of virt queue (not shifted!) */
+
+	volatile struct virtio_desc *vq_desc;	/* descriptor array */
+	volatile struct vring_avail *vq_avail;	/* the "avail" ring */
+	volatile struct vring_used *vq_used;	/* the "used" ring */
+
+};
+/* as noted above, these are sort of backwards, name-wise */
+#define VQ_AVAIL_EVENT_IDX(vq) \
+	(*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize])
+#define VQ_USED_EVENT_IDX(vq) \
+	((vq)->vq_avail->va_ring[(vq)->vq_qsize])
+
+/*
+ * Is this ring ready for I/O?
+ */
+static inline int
+vq_ring_ready(struct vqueue_info *vq)
+{
+
+	return (vq->vq_flags & VQ_ALLOC);
+}
+
+/*
+ * Are there "available" descriptors?  (This does not count
+ * how many, just returns True if there are some.)
+ */
+static inline int
+vq_has_descs(struct vqueue_info *vq)
+{
+
+	return (vq_ring_ready(vq) && vq->vq_last_avail !=
+	    vq->vq_avail->va_idx);
+}
+
+/*
+ * Deliver an interrupt to guest on the given virtual queue
+ * (if possible, or a generic MSI interrupt if not using MSI-X).
+ *
+ * In the non-MSI-X case this takes the device mutex itself (via
+ * VS_LOCK) while it sets the ISR bit and raises the interrupt.
+ */
+static inline void
+vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
+{
+
+	if (pci_msix_enabled(vs->vs_pi))
+		pci_generate_msix(vs->vs_pi, vq->vq_msix_idx);
+	else {
+		VS_LOCK(vs);
+		vs->vs_isr |= VTCFG_ISR_QUEUES;
+		pci_generate_msi(vs->vs_pi, 0);
+		pci_lintr_assert(vs->vs_pi);
+		VS_UNLOCK(vs);
+	}
+}
+
+struct iovec;
+void	vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+			void *dev_softc, struct pci_devinst *pi,
+			struct vqueue_info *queues);
+int	vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
+void	vi_reset_dev(struct virtio_softc *);
+void	vi_set_io_bar(struct virtio_softc *, int);
+
+int	vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
+		    struct iovec *iov, int n_iov, uint16_t *flags);
+void	vq_retchain(struct vqueue_info *vq);
+void	vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
+void	vq_endchains(struct vqueue_info *vq, int used_all_avail);
+
+uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+		     int baridx, uint64_t offset, int size);
+void	vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+		     int baridx, uint64_t offset, int size, uint64_t value);
+#endif	/* _VIRTIO_H_ */
diff --git a/bhyve/xmsr.c b/bhyve/xmsr.c
new file mode 100644
index 0000000..5b7bfbb
--- /dev/null
+++ b/bhyve/xmsr.c
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +#include "xmsr.h" + +static int cpu_vendor_intel, cpu_vendor_amd; + +int +emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val) +{ + + if (cpu_vendor_intel) { + switch (num) { + case 0xd04: /* Sandy Bridge uncore PMCs */ + case 0xc24: + return (0); + case MSR_BIOS_UPDT_TRIG: + return (0); + case MSR_BIOS_SIGN: + return (0); + default: + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_HWCR: + /* + * Ignore writes to hardware configuration MSR. + */ + return (0); + + case MSR_NB_CFG1: + case MSR_IC_CFG: + return (0); /* Ignore writes */ + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* Ignore writes to the PerfEvtSel MSRs */ + return (0); + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* Ignore writes to the PerfCtr MSRs */ + return (0); + + case MSR_P_STATE_CONTROL: + /* Ignore write to change the P-state */ + return (0); + + default: + break; + } + } + return (-1); +} + +int +emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) +{ + int error = 0; + + if (cpu_vendor_intel) { + switch (num) { + case MSR_BIOS_SIGN: + case MSR_IA32_PLATFORM_ID: + case MSR_PKG_ENERGY_STATUS: + case MSR_PP0_ENERGY_STATUS: + case MSR_PP1_ENERGY_STATUS: + case MSR_DRAM_ENERGY_STATUS: + *val = 0; + break; + case MSR_RAPL_POWER_UNIT: + /* + * Use the default value documented in section + * "RAPL Interfaces" in Intel SDM vol3. + */ + *val = 0x000a1003; + break; + default: + error = -1; + break; + } + } else if (cpu_vendor_amd) { + switch (num) { + case MSR_BIOS_SIGN: + *val = 0; + break; + case MSR_HWCR: + /* + * Bios and Kernel Developer's Guides for AMD Families + * 12H, 14H, 15H and 16H. 
+ */ + *val = 0x01000010; /* Reset value */ + *val |= 1 << 9; /* MONITOR/MWAIT disable */ + break; + + case MSR_NB_CFG1: + case MSR_IC_CFG: + /* + * The reset value is processor family dependent so + * just return 0. + */ + *val = 0; + break; + + case MSR_PERFEVSEL0: + case MSR_PERFEVSEL1: + case MSR_PERFEVSEL2: + case MSR_PERFEVSEL3: + /* + * PerfEvtSel MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + /* + * PerfCtr MSRs are not properly virtualized so just + * return zero. + */ + *val = 0; + break; + + case MSR_SMM_ADDR: + case MSR_SMM_MASK: + /* + * Return the reset value defined in the AMD Bios and + * Kernel Developer's Guide. + */ + *val = 0; + break; + + case MSR_P_STATE_LIMIT: + case MSR_P_STATE_CONTROL: + case MSR_P_STATE_STATUS: + case MSR_P_STATE_CONFIG(0): /* P0 configuration */ + *val = 0; + break; + + /* + * OpenBSD guests test bit 0 of this MSR to detect if the + * workaround for erratum 721 is already applied. + * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + */ + case 0xC0011029: + *val = 1; + break; + + default: + error = -1; + break; + } + } else { + error = -1; + } + return (error); +} + +int +init_msr(void) +{ + int error; + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + error = 0; + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + cpu_vendor_amd = 1; + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + cpu_vendor_intel = 1; + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + error = -1; + } + return (error); +} diff --git a/bhyve/xmsr.h b/bhyve/xmsr.h new file mode 100644 index 0000000..bcf65b7 --- /dev/null +++ b/bhyve/xmsr.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _XMSR_H_ +#define _XMSR_H_ + +int init_msr(void); +int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val); +int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val); + +#endif diff --git a/bhyvectl/Makefile b/bhyvectl/Makefile new file mode 100644 index 0000000..dba3f12 --- /dev/null +++ b/bhyvectl/Makefile @@ -0,0 +1,16 @@ +# +# $FreeBSD$ +# + +PROG= bhyvectl +SRCS= bhyvectl.c + +MAN= + +LIBADD= vmmapi + +WARNS?= 3 + +CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm + +.include diff --git a/bhyvectl/bhyvectl.c b/bhyvectl/bhyvectl.c new file mode 100644 index 0000000..223ee25 --- /dev/null +++ b/bhyvectl/bhyvectl.c @@ -0,0 +1,2142 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "amd/vmcb.h" +#include "intel/vmcs.h" + +#define MB (1UL << 20) +#define GB (1UL << 30) + +#define REQ_ARG required_argument +#define NO_ARG no_argument +#define OPT_ARG optional_argument + +static const char *progname; + +static void +usage(bool cpu_intel) +{ + + (void)fprintf(stderr, + "Usage: %s --vm=\n" + " [--cpu=]\n" + " [--create]\n" + " [--destroy]\n" + " [--get-all]\n" + " [--get-stats]\n" + " [--set-desc-ds]\n" + " [--get-desc-ds]\n" + " [--set-desc-es]\n" + " [--get-desc-es]\n" + " [--set-desc-gs]\n" + " [--get-desc-gs]\n" + " [--set-desc-fs]\n" + " [--get-desc-fs]\n" + " [--set-desc-cs]\n" + " [--get-desc-cs]\n" + " [--set-desc-ss]\n" + " [--get-desc-ss]\n" + " [--set-desc-tr]\n" + " [--get-desc-tr]\n" + " [--set-desc-ldtr]\n" + " [--get-desc-ldtr]\n" + " [--set-desc-gdtr]\n" + " [--get-desc-gdtr]\n" + " [--set-desc-idtr]\n" + " [--get-desc-idtr]\n" + " [--run]\n" + " [--capname=]\n" + " [--getcap]\n" + " [--setcap=<0|1>]\n" + " [--desc-base=]\n" + " [--desc-limit=]\n" + " [--desc-access=]\n" + " [--set-cr0=]\n" + " [--get-cr0]\n" + " [--set-cr3=]\n" + " [--get-cr3]\n" + " [--set-cr4=]\n" + " [--get-cr4]\n" + " [--set-dr7=]\n" + " [--get-dr7]\n" +
" [--set-rsp=]\n" + " [--get-rsp]\n" + " [--set-rip=]\n" + " [--get-rip]\n" + " [--get-rax]\n" + " [--set-rax=]\n" + " [--get-rbx]\n" + " [--get-rcx]\n" + " [--get-rdx]\n" + " [--get-rsi]\n" + " [--get-rdi]\n" + " [--get-rbp]\n" + " [--get-r8]\n" + " [--get-r9]\n" + " [--get-r10]\n" + " [--get-r11]\n" + " [--get-r12]\n" + " [--get-r13]\n" + " [--get-r14]\n" + " [--get-r15]\n" + " [--set-rflags=]\n" + " [--get-rflags]\n" + " [--set-cs]\n" + " [--get-cs]\n" + " [--set-ds]\n" + " [--get-ds]\n" + " [--set-es]\n" + " [--get-es]\n" + " [--set-fs]\n" + " [--get-fs]\n" + " [--set-gs]\n" + " [--get-gs]\n" + " [--set-ss]\n" + " [--get-ss]\n" + " [--get-tr]\n" + " [--get-ldtr]\n" + " [--set-x2apic-state=]\n" + " [--get-x2apic-state]\n" + " [--unassign-pptdev=]\n" + " [--set-mem=]\n" + " [--get-lowmem]\n" + " [--get-highmem]\n" + " [--get-gpa-pmap]\n" + " [--assert-lapic-lvt=]\n" + " [--inject-nmi]\n" + " [--force-reset]\n" + " [--force-poweroff]\n" + " [--get-rtc-time]\n" + " [--set-rtc-time=]\n" + " [--get-rtc-nvram]\n" + " [--set-rtc-nvram=]\n" + " [--rtc-nvram-offset=]\n" + " [--get-active-cpus]\n" + " [--get-suspended-cpus]\n" + " [--get-intinfo]\n" + " [--get-eptp]\n" + " [--set-exception-bitmap]\n" + " [--get-exception-bitmap]\n" + " [--get-tsc-offset]\n" + " [--get-guest-pat]\n" + " [--get-io-bitmap-address]\n" + " [--get-msr-bitmap]\n" + " [--get-msr-bitmap-address]\n" + " [--get-guest-sysenter]\n" + " [--get-exit-reason]\n", + progname); + + if (cpu_intel) { + (void)fprintf(stderr, + " [--get-vmcs-pinbased-ctls]\n" + " [--get-vmcs-procbased-ctls]\n" + " [--get-vmcs-procbased-ctls2]\n" + " [--get-vmcs-entry-interruption-info]\n" + " [--set-vmcs-entry-interruption-info=]\n" + " [--get-vmcs-guest-physical-address]\n" + " [--get-vmcs-guest-linear-address]\n" + " [--get-vmcs-host-pat]\n" + " [--get-vmcs-host-cr0]\n" + " [--get-vmcs-host-cr3]\n" + " [--get-vmcs-host-cr4]\n" + " [--get-vmcs-host-rip]\n" + " [--get-vmcs-host-rsp]\n" + " [--get-vmcs-cr0-mask]\n" + "
[--get-vmcs-cr0-shadow]\n" + " [--get-vmcs-cr4-mask]\n" + " [--get-vmcs-cr4-shadow]\n" + " [--get-vmcs-cr3-targets]\n" + " [--get-vmcs-apic-access-address]\n" + " [--get-vmcs-virtual-apic-address]\n" + " [--get-vmcs-tpr-threshold]\n" + " [--get-vmcs-vpid]\n" + " [--get-vmcs-instruction-error]\n" + " [--get-vmcs-exit-ctls]\n" + " [--get-vmcs-entry-ctls]\n" + " [--get-vmcs-link]\n" + " [--get-vmcs-exit-qualification]\n" + " [--get-vmcs-exit-interruption-info]\n" + " [--get-vmcs-exit-interruption-error]\n" + " [--get-vmcs-interruptibility]\n" + ); + } else { + (void)fprintf(stderr, + " [--get-vmcb-intercepts]\n" + " [--get-vmcb-asid]\n" + " [--get-vmcb-exit-details]\n" + " [--get-vmcb-tlb-ctrl]\n" + " [--get-vmcb-virq]\n" + " [--get-avic-apic-bar]\n" + " [--get-avic-backing-page]\n" + " [--get-avic-table]\n" + ); + } + exit(1); +} + +static int get_rtc_time, set_rtc_time; +static int get_rtc_nvram, set_rtc_nvram; +static int rtc_nvram_offset; +static uint8_t rtc_nvram_value; +static time_t rtc_secs; + +static int get_stats, getcap, setcap, capval, get_gpa_pmap; +static int inject_nmi, assert_lapic_lvt; +static int force_reset, force_poweroff; +static const char *capname; +static int create, destroy, get_lowmem, get_highmem; +static int get_intinfo; +static int get_active_cpus, get_suspended_cpus; +static uint64_t memsize; +static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; +static int set_efer, get_efer; +static int set_dr7, get_dr7; +static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; +static int set_rax, get_rax; +static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp; +static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15; +static int set_desc_ds, get_desc_ds; +static int set_desc_es, get_desc_es; +static int set_desc_fs, get_desc_fs; +static int set_desc_gs, get_desc_gs; +static int set_desc_cs, get_desc_cs; +static int set_desc_ss, get_desc_ss; +static int set_desc_gdtr, get_desc_gdtr; +static 
int set_desc_idtr, get_desc_idtr; +static int set_desc_tr, get_desc_tr; +static int set_desc_ldtr, get_desc_ldtr; +static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr; +static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr; +static int set_x2apic_state, get_x2apic_state; +enum x2apic_state x2apic_state; +static int unassign_pptdev, bus, slot, func; +static int run; + +/* + * VMCB specific. + */ +static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; +static int get_vmcb_virq, get_avic_table; + +/* + * VMCS-specific fields + */ +static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2; +static int get_eptp, get_io_bitmap, get_tsc_offset; +static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info; +static int get_vmcs_interruptibility; +uint32_t vmcs_entry_interruption_info; +static int get_vmcs_gpa, get_vmcs_gla; +static int get_exception_bitmap, set_exception_bitmap, exception_bitmap; +static int get_cr0_mask, get_cr0_shadow; +static int get_cr4_mask, get_cr4_shadow; +static int get_cr3_targets; +static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; +static int get_msr_bitmap, get_msr_bitmap_address; +static int get_vpid_asid; +static int get_inst_err, get_exit_ctls, get_entry_ctls; +static int get_host_cr0, get_host_cr3, get_host_cr4; +static int get_host_rip, get_host_rsp; +static int get_guest_pat, get_host_pat; +static int get_guest_sysenter, get_vmcs_link; +static int get_exit_reason, get_vmcs_exit_qualification; +static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; + +static uint64_t desc_base; +static uint32_t desc_limit, desc_access; + +static int get_all; + +static void +dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) +{ + printf("vm exit[%d]\n", vcpu); + printf("\trip\t\t0x%016lx\n", vmexit->rip); + printf("\tinst_length\t%d\n", vmexit->inst_length); + switch (vmexit->exitcode) { + case VM_EXITCODE_INOUT: + 
printf("\treason\t\tINOUT\n"); + printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); + printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); + printf("\tflags\t\t%s%s\n", + vmexit->u.inout.string ? "STRING " : "", + vmexit->u.inout.rep ? "REP " : ""); + printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); + printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); + break; + case VM_EXITCODE_VMX: + printf("\treason\t\tVMX\n"); + printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); + printf("\texit_reason\t0x%08x (%u)\n", + vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason); + printf("\tqualification\t0x%016lx\n", + vmexit->u.vmx.exit_qualification); + printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); + printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); + break; + case VM_EXITCODE_SVM: + printf("\treason\t\tSVM\n"); + printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); + printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); + printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); + break; + default: + printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); + break; + } +} + +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000 +#define MSR_AMD6TH_END 0xC0001FFF +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000 +#define MSR_AMD7TH_END 0xC0011FFF + +static const char * +msr_name(uint32_t msr) +{ + static char buf[32]; + + switch(msr) { + case MSR_TSC: + return ("MSR_TSC"); + case MSR_EFER: + return ("MSR_EFER"); + case MSR_STAR: + return ("MSR_STAR"); + case MSR_LSTAR: + return ("MSR_LSTAR"); + case MSR_CSTAR: + return ("MSR_CSTAR"); + case MSR_SF_MASK: + return ("MSR_SF_MASK"); + case MSR_FSBASE: + return ("MSR_FSBASE"); + case MSR_GSBASE: + return ("MSR_GSBASE"); + case MSR_KGSBASE: + return ("MSR_KGSBASE"); + case MSR_SYSENTER_CS_MSR: + return ("MSR_SYSENTER_CS_MSR"); + case MSR_SYSENTER_ESP_MSR: + return ("MSR_SYSENTER_ESP_MSR"); + case MSR_SYSENTER_EIP_MSR: + 
return ("MSR_SYSENTER_EIP_MSR"); + case MSR_PAT: + return ("MSR_PAT"); + } + snprintf(buf, sizeof(buf), "MSR %#08x", msr); + + return (buf); +} + +static inline void +print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) +{ + + if (readable || writeable) { + printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, + readable ? 'R' : '-', writeable ? 'W' : '-'); + } +} + +/* + * Reference APM vol2, section 15.11 MSR Intercepts. + */ +static void +dump_amd_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 4; + bit = (msr % 4) * 2; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 2048; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + + /* AMD-only MSRs 0xC0010000-0xC0011FFF: MSRPM offset 0x1000 */ + byte += 2048; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, + writeable); + } +} + +/* + * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address + */ +static void +dump_intel_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 8; + bit = msr & 0x7; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 1024; + readable = (bitmap[byte] & (1 << bit)) ?
0 : 1; + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + } +} + +static int +dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) +{ + int error, fd, map_size; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) { + perror("Couldn't open /dev/mem"); + goto done; + } + + if (cpu_intel) + map_size = PAGE_SIZE; + else + map_size = 2 * PAGE_SIZE; + + bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); + if (bitmap == MAP_FAILED) { + perror("mmap failed"); + goto done; + } + + if (cpu_intel) + dump_intel_msr_pm(bitmap, vcpu); + else + dump_amd_msr_pm(bitmap, vcpu); + + error = 0; +done: + if (bitmap != MAP_FAILED) + munmap((void *)bitmap, map_size); + if (fd >= 0) + close(fd); + + return (error); +} + +static int +vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val)); +} + +static int +vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); +} + +static int +vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); +} + +static int +vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); +} + +enum { + VMNAME = 1000, /* avoid collision with return values from getopt */ + VCPU, + SET_MEM, + SET_EFER, + SET_CR0, + SET_CR3, + SET_CR4, + SET_DR7, + SET_RSP, + SET_RIP, + SET_RAX, + SET_RFLAGS, + DESC_BASE, + DESC_LIMIT, + DESC_ACCESS, + SET_CS, + SET_DS, + SET_ES, + SET_FS, + SET_GS, + SET_SS, + SET_TR, + SET_LDTR, + SET_X2APIC_STATE, + SET_EXCEPTION_BITMAP, + SET_VMCS_ENTRY_INTERRUPTION_INFO, + SET_CAP, + CAPNAME, + UNASSIGN_PPTDEV, + 
GET_GPA_PMAP, + ASSERT_LAPIC_LVT, + SET_RTC_TIME, + SET_RTC_NVRAM, + RTC_NVRAM_OFFSET, +}; + +static void +print_cpus(const char *banner, const cpuset_t *cpus) +{ + int i, first; + + first = 1; + printf("%s:\t", banner); + if (!CPU_EMPTY(cpus)) { + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpus)) { + printf("%s%d", first ? " " : ", ", i); + first = 0; + } + } + } else + printf(" (none)"); + printf("\n"); +} + +static void +print_intinfo(const char *banner, uint64_t info) +{ + int type; + + printf("%s:\t", banner); + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + switch (type) { + case VM_INTINFO_HWINTR: + printf("extint"); + break; + case VM_INTINFO_NMI: + printf("nmi"); + break; + case VM_INTINFO_SWINTR: + printf("swint"); + break; + default: + printf("exception"); + break; + } + printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); + if (info & VM_INTINFO_DEL_ERRCODE) + printf(" errcode %#x", (u_int)(info >> 32)); + } else { + printf("n/a"); + } + printf("\n"); +} + +static bool +cpu_vendor_intel(void) +{ + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + return (false); + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + return (true); + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + exit(1); + } +} + +static int +get_all_registers(struct vmctx *ctx, int vcpu) +{ + uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer; + uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; + uint64_t r8, r9, r10, r11, r12, r13, r14, r15; + int error = 0; + + if (!error && (get_efer || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + } + + if (!error && (get_cr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, 
&cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && (get_cr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && (get_cr4 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (error == 0) + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && (get_dr7 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7); + if (error == 0) + printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7); + } + + if (!error && (get_rsp || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp); + if (error == 0) + printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && (get_rip || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + if (error == 0) + printf("rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_rax || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax); + if (error == 0) + printf("rax[%d]\t\t0x%016lx\n", vcpu, rax); + } + + if (!error && (get_rbx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx); + if (error == 0) + printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx); + } + + if (!error && (get_rcx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx); + if (error == 0) + printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx); + } + + if (!error && (get_rdx || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx); + if (error == 0) + printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx); + } + + if (!error && (get_rsi || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi); + if (error == 0) + printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi); + } + + if (!error && (get_rdi || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi); + if (error == 0) + printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi); + } + + if (!error && (get_rbp || 
get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp); + if (error == 0) + printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp); + } + + if (!error && (get_r8 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8); + if (error == 0) + printf("r8[%d]\t\t0x%016lx\n", vcpu, r8); + } + + if (!error && (get_r9 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9); + if (error == 0) + printf("r9[%d]\t\t0x%016lx\n", vcpu, r9); + } + + if (!error && (get_r10 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10); + if (error == 0) + printf("r10[%d]\t\t0x%016lx\n", vcpu, r10); + } + + if (!error && (get_r11 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11); + if (error == 0) + printf("r11[%d]\t\t0x%016lx\n", vcpu, r11); + } + + if (!error && (get_r12 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12); + if (error == 0) + printf("r12[%d]\t\t0x%016lx\n", vcpu, r12); + } + + if (!error && (get_r13 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13); + if (error == 0) + printf("r13[%d]\t\t0x%016lx\n", vcpu, r13); + } + + if (!error && (get_r14 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14); + if (error == 0) + printf("r14[%d]\t\t0x%016lx\n", vcpu, r14); + } + + if (!error && (get_r15 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15); + if (error == 0) + printf("r15[%d]\t\t0x%016lx\n", vcpu, r15); + } + + if (!error && (get_rflags || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + &rflags); + if (error == 0) + printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); + } + + return (error); +} + +static int +get_all_segments(struct vmctx *ctx, int vcpu) +{ + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + int error = 0; + + if (!error && (get_desc_ds || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) 
{ + printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_es || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_fs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_gs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_ss || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_cs || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_tr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_ldtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", + vcpu, desc_base, desc_limit, desc_access); + } + } + + if (!error && (get_desc_gdtr || get_all)) { + error = 
vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && (get_desc_idtr || get_all)) { + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + &desc_base, &desc_limit, &desc_access); + if (error == 0) { + printf("idtr[%d]\t\t0x%016lx/0x%08x\n", + vcpu, desc_base, desc_limit); + } + } + + if (!error && (get_cs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs); + if (error == 0) + printf("cs[%d]\t\t0x%04lx\n", vcpu, cs); + } + + if (!error && (get_ds || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds); + if (error == 0) + printf("ds[%d]\t\t0x%04lx\n", vcpu, ds); + } + + if (!error && (get_es || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es); + if (error == 0) + printf("es[%d]\t\t0x%04lx\n", vcpu, es); + } + + if (!error && (get_fs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs); + if (error == 0) + printf("fs[%d]\t\t0x%04lx\n", vcpu, fs); + } + + if (!error && (get_gs || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs); + if (error == 0) + printf("gs[%d]\t\t0x%04lx\n", vcpu, gs); + } + + if (!error && (get_ss || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss); + if (error == 0) + printf("ss[%d]\t\t0x%04lx\n", vcpu, ss); + } + + if (!error && (get_tr || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr); + if (error == 0) + printf("tr[%d]\t\t0x%04lx\n", vcpu, tr); + } + + if (!error && (get_ldtr || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr); + if (error == 0) + printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); + } + + return (error); +} + +static int +get_misc_vmcs(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; + int error = 0; + + if (!error && (get_cr0_mask || get_all)) { + uint64_t cr0mask; + 
error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask); + if (error == 0) + printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask); + } + + if (!error && (get_cr0_shadow || get_all)) { + uint64_t cr0shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW, + &cr0shadow); + if (error == 0) + printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow); + } + + if (!error && (get_cr4_mask || get_all)) { + uint64_t cr4mask; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask); + if (error == 0) + printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask); + } + + if (!error && (get_cr4_shadow || get_all)) { + uint64_t cr4shadow; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW, + &cr4shadow); + if (error == 0) + printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow); + } + + if (!error && (get_cr3_targets || get_all)) { + uint64_t target_count, target_addr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, + &target_count); + if (error == 0) { + printf("cr3_target_count[%d]\t0x%016lx\n", + vcpu, target_count); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0, + &target_addr); + if (error == 0) { + printf("cr3_target0[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1, + &target_addr); + if (error == 0) { + printf("cr3_target1[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2, + &target_addr); + if (error == 0) { + printf("cr3_target2[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + + error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3, + &target_addr); + if (error == 0) { + printf("cr3_target3[%d]\t\t0x%016lx\n", + vcpu, target_addr); + } + } + + if (!error && (get_pinbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); + if (error == 0) + printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_procbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, 
vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_procbased_ctls2 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); + if (error == 0) + printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_vmcs_gla || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, &u64); + if (error == 0) + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_gpa || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); + if (error == 0) + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); + } + + if (!error && (get_vmcs_entry_interruption_info || + get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%016lx\n", + vcpu, u64); + } + } + + if (!error && (get_tpr_threshold || get_all)) { + uint64_t threshold; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + &threshold); + if (error == 0) + printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); + } + + if (!error && (get_inst_err || get_all)) { + uint64_t insterr; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, + &insterr); + if (error == 0) { + printf("instruction_error[%d]\t0x%016lx\n", + vcpu, insterr); + } + } + + if (!error && (get_exit_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); + if (error == 0) + printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_entry_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); + if (error == 0) + printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_host_pat || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat); + if (error == 0) + printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_host_cr0 || 
get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); + if (error == 0) + printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } + + if (!error && (get_host_cr3 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3); + if (error == 0) + printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + } + + if (!error && (get_host_cr4 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4); + if (error == 0) + printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + } + + if (!error && (get_host_rip || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip); + if (error == 0) + printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip); + } + + if (!error && (get_host_rsp || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp); + if (error == 0) + printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); + } + + if (!error && (get_vmcs_link || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); + if (error == 0) + printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_vmcs_exit_interruption_info || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); + if (error == 0) { + printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_exit_interruption_error || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, + &u64); + if (error == 0) { + printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_interruptibility || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_INTERRUPTIBILITY, &u64); + if (error == 0) { + printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_exit_qualification || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + &u64); + if (error == 0) + printf("vmcs_exit_qualification[%d]\t0x%016lx\n", + vcpu, u64); + } + + return 
(error); +} + +static int +get_misc_vmcb(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, addr; + int error = 0; + + if (!error && (get_vmcb_intercept || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + } + + if (!error && (get_vmcb_tlb_ctrl || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, + 4, &ctl); + if (error == 0) + printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_vmcb_exit_details || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, + 8, &ctl); + if (error == 0) + printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, + 8, &ctl); + if (error == 0) + printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, + 8, &ctl); + if (error == 0) + printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_vmcb_virq || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, + 8, &ctl); + if (error == 0) + printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_apic_access_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, + &addr); + if (error == 0) + printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); + } + + 
if (!error && (get_virtual_apic_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, + &addr); + if (error == 0) + printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_avic_table || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, + &addr); + if (error == 0) + printf("AVIC logical table[%d]\t0x%016lx\n", + vcpu, addr); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, + &addr); + if (error == 0) + printf("AVIC physical table[%d]\t0x%016lx\n", + vcpu, addr); + } + + return (error); +} + +static struct option * +setup_options(bool cpu_intel) +{ + const struct option common_opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-mem", REQ_ARG, 0, SET_MEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS }, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, + { "set-exception-bitmap", + REQ_ARG, 0, SET_EXCEPTION_BITMAP }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, + { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, + { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, + { "set-rtc-time", REQ_ARG, 0, 
SET_RTC_TIME }, + { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, + { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, + { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-lowmem", NO_ARG, &get_lowmem, 1 }, + { "get-highmem",NO_ARG, &get_highmem, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { 
"get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-eptp", NO_ARG, &get_eptp, 1 }, + { "get-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { "get-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, + { "get-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-exit-reason", + NO_ARG, &get_exit_reason, 1 }, + { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, + { "get-all", NO_ARG, &get_all, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { "inject-nmi", NO_ARG, &inject_nmi, 1 }, + { "force-reset", NO_ARG, &force_reset, 1 }, + { "force-poweroff", NO_ARG, &force_poweroff, 1 }, + { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, + { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, + { "get-intinfo", NO_ARG, &get_intinfo, 1 }, + }; + + const struct option intel_opts[] = { + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, + { 
"get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1} + }; + + const struct option amd_opts[] = { + { "get-vmcb-intercepts", + NO_ARG, &get_vmcb_intercept, 1 }, + { "get-vmcb-asid", + NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcb-exit-details", + NO_ARG, &get_vmcb_exit_details, 1 }, + { "get-vmcb-tlb-ctrl", + NO_ARG, &get_vmcb_tlb_ctrl, 1 }, + { "get-vmcb-virq", + NO_ARG, &get_vmcb_virq, 1 }, + { "get-avic-apic-bar", + NO_ARG, &get_apic_access_addr, 1 }, + { "get-avic-backing-page", + NO_ARG, &get_virtual_apic_addr, 1 }, + { "get-avic-table", + NO_ARG, &get_avic_table, 1 } + }; + + const struct 
option null_opt = { + NULL, 0, NULL, 0 + }; + + struct option *all_opts; + char *cp; + int optlen; + + optlen = sizeof(common_opts); + + if (cpu_intel) + optlen += sizeof(intel_opts); + else + optlen += sizeof(amd_opts); + + optlen += sizeof(null_opt); + + all_opts = malloc(optlen); + + cp = (char *)all_opts; + memcpy(cp, common_opts, sizeof(common_opts)); + cp += sizeof(common_opts); + + if (cpu_intel) { + memcpy(cp, intel_opts, sizeof(intel_opts)); + cp += sizeof(intel_opts); + } else { + memcpy(cp, amd_opts, sizeof(amd_opts)); + cp += sizeof(amd_opts); + } + + memcpy(cp, &null_opt, sizeof(null_opt)); + cp += sizeof(null_opt); + + return (all_opts); +} + +static const char * +wday_str(int idx) +{ + static const char *weekdays[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + + if (idx >= 0 && idx < 7) + return (weekdays[idx]); + else + return ("UNK"); +} + +static const char * +mon_str(int idx) +{ + static const char *months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + + if (idx >= 0 && idx < 12) + return (months[idx]); + else + return ("UNK"); +} + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu, ptenum; + vm_paddr_t gpa, gpa_pmap; + size_t len; + struct vm_exit vmexit; + uint64_t rax, cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; + uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; + struct vmctx *ctx; + int wired; + cpuset_t cpus; + bool cpu_intel; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + struct tm tm; + struct option *opts; + + cpu_intel = cpu_vendor_intel(); + opts = setup_options(cpu_intel); + + vcpu = 0; + vmname = NULL; + assert_lapic_lvt = -1; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_MEM: + memsize = atoi(optarg) * MB; + memsize = roundup(memsize, 2 * MB); + 
break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_X2APIC_STATE: + x2apic_state = strtol(optarg, NULL, 0); + set_x2apic_state = 1; + break; + case SET_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case SET_RTC_TIME: + rtc_secs = strtoul(optarg, NULL, 0); + set_rtc_time = 1; + break; + case 
SET_RTC_NVRAM: + rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); + set_rtc_nvram = 1; + break; + case RTC_NVRAM_OFFSET: + rtc_nvram_offset = strtoul(optarg, NULL, 0); + break; + case GET_GPA_PMAP: + gpa_pmap = strtoul(optarg, NULL, 0); + get_gpa_pmap = 1; + break; + case CAPNAME: + capname = optarg; + break; + case UNASSIGN_PPTDEV: + unassign_pptdev = 1; + if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) + usage(cpu_intel); + break; + case ASSERT_LAPIC_LVT: + assert_lapic_lvt = atoi(optarg); + break; + default: + usage(cpu_intel); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(cpu_intel); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) { + printf("VM:%s is not created.\n", vmname); + exit (1); + } + } + + if (!error && memsize) + error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error 
&& set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + + if (!error && set_ds) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error && set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_x2apic_state) + error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); + + if (!error && unassign_pptdev) + error = vm_unassign_pptdev(ctx, bus, slot, func); + + if (!error && set_exception_bitmap) { + if (cpu_intel) + error = vm_set_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, + exception_bitmap); + 
else + error = vm_set_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, exception_bitmap); + } + + if (!error && cpu_intel && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && inject_nmi) { + error = vm_inject_nmi(ctx, vcpu); + } + + if (!error && assert_lapic_lvt != -1) { + error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); + } + + if (!error && (get_lowmem || get_all)) { + gpa = 0; + error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (error == 0) + printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? " wired" : ""); + } + + if (!error && (get_highmem || get_all)) { + gpa = 4 * GB; + error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (error == 0) + printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, + wired ? " wired" : ""); + } + + if (!error) + error = get_all_registers(ctx, vcpu); + + if (!error) + error = get_all_segments(ctx, vcpu); + + if (!error) { + if (cpu_intel) + error = get_misc_vmcs(ctx, vcpu); + else + error = get_misc_vmcb(ctx, vcpu); + } + + if (!error && (get_x2apic_state || get_all)) { + error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); + if (error == 0) + printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); + } + + if (!error && (get_eptp || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, + 8, &eptp); + if (error == 0) + printf("%s[%d]\t\t0x%016lx\n", + cpu_intel ? 
"eptp" : "rvi/npt", vcpu, eptp); + } + + if (!error && (get_exception_bitmap || get_all)) { + if(cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, &bm); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, &bm); + if (error == 0) + printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); + } + + if (!error && (get_io_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, + &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, + &bm); + if (error == 0) + printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_IO_PERM, 8, &bm); + if (error == 0) + printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); + } + } + + if (!error && (get_tsc_offset || get_all)) { + uint64_t tscoff; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, + &tscoff); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_TSC_OFFSET, + 8, &tscoff); + if (error == 0) + printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + } + + if (!error && (get_msr_bitmap_address || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, + &addr); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, &addr); + if (error == 0) + printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); + } + + if (!error && (get_msr_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_MSR_BITMAP, &addr); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, + &addr); + } + + if (error == 0) + error = dump_msr_bitmap(vcpu, addr, cpu_intel); + } + + if (!error && (get_vpid_asid || get_all)) { + uint64_t vpid; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, + 4, &vpid); + if (error == 0) + printf("%s[%d]\t\t0x%04lx\n", + cpu_intel ? 
"vpid" : "asid", vcpu, vpid); + } + + if (!error && (get_guest_pat || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_PAT, &pat); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_GUEST_PAT, 8, &pat); + if (error == 0) + printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_guest_sysenter || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_CS, + &cs); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_CS, 8, + &cs); + + if (error == 0) + printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_ESP, + &rsp); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_ESP, 8, + &rsp); + + if (error == 0) + printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_EIP, + &rip); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_EIP, 8, + &rip); + if (error == 0) + printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); + } + + if (!error && (get_exit_reason || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, + &u64); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXIT_REASON, 8, + &u64); + if (error == 0) + printf("exit_reason[%d]\t%#lx\n", vcpu, u64); + } + + if (!error && setcap) { + int captype; + captype = vm_capability_name2type(capname); + error = vm_set_capability(ctx, vcpu, captype, capval); + if (error != 0 && errno == ENOENT) + printf("Capability \"%s\" is not available\n", capname); + } + + if (!error && get_gpa_pmap) { + error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); + if (error == 0) { + printf("gpa %#lx:", gpa_pmap); + pte = &pteval[0]; + while (ptenum-- > 0) + printf(" %#lx", *pte++); + printf("\n"); + } + } + + if (!error && set_rtc_nvram) + error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); 
+ + if (!error && (get_rtc_nvram || get_all)) { + error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); + if (error == 0) { + printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, + rtc_nvram_value); + } + } + + if (!error && set_rtc_time) + error = vm_rtc_settime(ctx, rtc_secs); + + if (!error && (get_rtc_time || get_all)) { + error = vm_rtc_gettime(ctx, &rtc_secs); + if (error == 0) { + gmtime_r(&rtc_secs, &tm); + printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", + rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), + tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, + 1900 + tm.tm_year); + } + } + + if (!error && (getcap || get_all)) { + int captype, val, getcaptype; + + if (getcap && capname) + getcaptype = vm_capability_name2type(capname); + else + getcaptype = -1; + + for (captype = 0; captype < VM_CAP_MAX; captype++) { + if (getcaptype >= 0 && captype != getcaptype) + continue; + error = vm_get_capability(ctx, vcpu, captype, &val); + if (error == 0) { + printf("Capability \"%s\" is %s on vcpu %d\n", + vm_capability_type2name(captype), + val ? 
"set" : "not set", vcpu); + } else if (errno == ENOENT) { + error = 0; + printf("Capability \"%s\" is not available\n", + vm_capability_type2name(captype)); + } else { + break; + } + } + } + + if (!error && (get_active_cpus || get_all)) { + error = vm_active_cpus(ctx, &cpus); + if (!error) + print_cpus("active cpus", &cpus); + } + + if (!error && (get_suspended_cpus || get_all)) { + error = vm_suspended_cpus(ctx, &cpus); + if (!error) + print_cpus("suspended cpus", &cpus); + } + + if (!error && (get_intinfo || get_all)) { + error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); + if (!error) { + print_intinfo("pending", info[0]); + print_intinfo("current", info[1]); + } + } + + if (!error && (get_stats || get_all)) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d stats:\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-40s\t%ld\n", desc, stats[i]); + } + } + } + + if (!error && run) { + error = vm_run(ctx, vcpu, &vmexit); + if (error == 0) + dump_vm_run_exitcode(&vmexit, vcpu); + else + printf("vm_run error %d\n", error); + } + + if (!error && force_reset) + error = vm_suspend(ctx, VM_SUSPEND_RESET); + + if (!error && force_poweroff) + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + + if (error) + printf("errno = %d\n", errno); + + if (!error && destroy) + vm_destroy(ctx); + + free (opts); + exit(error); +} diff --git a/bhyveload/Makefile b/bhyveload/Makefile new file mode 100644 index 0000000..fce0c1b --- /dev/null +++ b/bhyveload/Makefile @@ -0,0 +1,13 @@ +# $FreeBSD$ + +PROG= bhyveload +SRCS= bhyveload.c +MAN= bhyveload.8 + +LIBADD= vmmapi + +WARNS?= 3 + +CFLAGS+=-I${.CURDIR}/../../sys/boot/userboot + +.include diff --git a/bhyveload/bhyveload.8 b/bhyveload/bhyveload.8 new file mode 100644 index 0000000..c168832 --- /dev/null +++ b/bhyveload/bhyveload.8 @@ -0,0 +1,157 @@ +.\" +.\" Copyright (c) 
2012 NetApp Inc +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd January 7, 2012 +.Dt BHYVELOAD 8 +.Os +.Sh NAME +.Nm bhyveload +.Nd load a +.Fx +guest inside a bhyve virtual machine +.Sh SYNOPSIS +.Nm +.Op Fl c Ar cons-dev +.Op Fl d Ar disk-path +.Op Fl e Ar name=value +.Op Fl h Ar host-path +.Op Fl m Ar mem-size +.Ar vmname +.Sh DESCRIPTION +.Nm +is used to load a +.Fx +guest inside a +.Xr bhyve 4 +virtual machine. +.Pp +.Nm +is based on +.Xr loader 8 +and will present an interface identical to the +.Fx +loader on the user's terminal. +.Pp +The virtual machine is identified as +.Ar vmname +and will be created if it does not already exist. 
+.Sh OPTIONS +The following options are available: +.Bl -tag -width indent +.It Fl c Ar cons-dev +.Ar cons-dev +is a +.Xr tty 4 +device to use for +.Nm +terminal I/O. +.Pp +The text string "stdio" is also accepted and selects the use of +unbuffered standard I/O. This is the default value. +.It Fl d Ar disk-path +The +.Ar disk-path +is the pathname of the guest's boot disk image. +.It Fl e Ar name=value +Set the FreeBSD loader environment variable +.Ar name +to +.Ar value . +.Pp +The option may be used more than once to set more than one environment +variable. +.It Fl h Ar host-path +The +.Ar host-path +is the directory at the top of the guest's boot filesystem. +.It Fl m Ar mem-size Xo +.Sm off +.Op Cm K | k | M | m | G | g | T | t +.Xc +.Sm on +.Ar mem-size +is the amount of memory allocated to the guest. +.Pp +The +.Ar mem-size +argument may be suffixed with one of +.Cm K , +.Cm M , +.Cm G +or +.Cm T +(either upper or lower case) to indicate a multiple of +Kilobytes, Megabytes, Gigabytes or Terabytes +respectively. +.Pp +The default value of +.Ar mem-size +is 256M. +.El +.Sh EXAMPLES +To create a virtual machine named +.Ar freebsd-vm +that boots off the ISO image +.Pa /freebsd/release.iso +and has 1GB memory allocated to it: +.Pp +.Dl "bhyveload -m 1G -d /freebsd/release.iso freebsd-vm" +.Pp +To create a virtual machine named +.Ar test-vm +with 256MB of memory allocated, the guest root filesystem under the host +directory +.Pa /usr/images/test +and terminal I/O sent to the +.Xr nmdm 4 +device +.Pa /dev/nmdm1B +.Pp +.Dl "bhyveload -m 256MB -h /usr/images/test -c /dev/nmdm1B test-vm" +.Sh SEE ALSO +.Xr bhyve 4 , +.Xr nmdm 4 , +.Xr vmm 4 , +.Xr bhyve 8 , +.Xr loader 8 +.Sh HISTORY +.Nm +first appeared in +.Fx 10.0 , +and was developed at NetApp Inc. +.Sh AUTHORS +.Nm +was developed by +.An -nosplit +.An Neel Natu Aq Mt neel@FreeBSD.org +at NetApp Inc with a lot of help from +.An Doug Rabson Aq Mt dfr@FreeBSD.org . +.Sh BUGS +.Nm +can only load +.Fx +as a guest.
diff --git a/bhyveload/bhyveload.c b/bhyveload/bhyveload.c new file mode 100644 index 0000000..8ebf116 --- /dev/null +++ b/bhyveload/bhyveload.c @@ -0,0 +1,746 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * Copyright (c) 2011 Google, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "userboot.h" + +#define MB (1024 * 1024UL) +#define GB (1024 * 1024 * 1024UL) +#define BSP 0 + +#define NDISKS 32 + +static char *host_base; +static struct termios term, oldterm; +static int disk_fd[NDISKS]; +static int ndisks; +static int consin_fd, consout_fd; + +static char *vmname, *progname; +static struct vmctx *ctx; + +static uint64_t gdtbase, cr3, rsp; + +static void cb_exit(void *arg, int v); + +/* + * Console i/o callbacks + */ + +static void +cb_putc(void *arg, int ch) +{ + char c = ch; + + (void) write(consout_fd, &c, 1); +} + +static int +cb_getc(void *arg) +{ + char c; + + if (read(consin_fd, &c, 1) == 1) + return (c); + return (-1); +} + +static int +cb_poll(void *arg) +{ + int n; + + if 
(ioctl(consin_fd, FIONREAD, &n) >= 0) + return (n > 0); + return (0); +} + +/* + * Host filesystem i/o callbacks + */ + +struct cb_file { + int cf_isdir; + size_t cf_size; + struct stat cf_stat; + union { + int fd; + DIR *dir; + } cf_u; +}; + +static int +cb_open(void *arg, const char *filename, void **hp) +{ + struct stat st; + struct cb_file *cf; + char path[PATH_MAX]; + + if (!host_base) + return (ENOENT); + + strlcpy(path, host_base, PATH_MAX); + if (path[strlen(path) - 1] == '/') + path[strlen(path) - 1] = 0; + strlcat(path, filename, PATH_MAX); + cf = malloc(sizeof(struct cb_file)); + if (stat(path, &cf->cf_stat) < 0) { + free(cf); + return (errno); + } + + cf->cf_size = st.st_size; + if (S_ISDIR(cf->cf_stat.st_mode)) { + cf->cf_isdir = 1; + cf->cf_u.dir = opendir(path); + if (!cf->cf_u.dir) + goto out; + *hp = cf; + return (0); + } + if (S_ISREG(cf->cf_stat.st_mode)) { + cf->cf_isdir = 0; + cf->cf_u.fd = open(path, O_RDONLY); + if (cf->cf_u.fd < 0) + goto out; + *hp = cf; + return (0); + } + +out: + free(cf); + return (EINVAL); +} + +static int +cb_close(void *arg, void *h) +{ + struct cb_file *cf = h; + + if (cf->cf_isdir) + closedir(cf->cf_u.dir); + else + close(cf->cf_u.fd); + free(cf); + + return (0); +} + +static int +cb_isdir(void *arg, void *h) +{ + struct cb_file *cf = h; + + return (cf->cf_isdir); +} + +static int +cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid) +{ + struct cb_file *cf = h; + ssize_t sz; + + if (cf->cf_isdir) + return (EINVAL); + sz = read(cf->cf_u.fd, buf, size); + if (sz < 0) + return (EINVAL); + *resid = size - sz; + return (0); +} + +static int +cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return, + size_t *namelen_return, char *name) +{ + struct cb_file *cf = h; + struct dirent *dp; + + if (!cf->cf_isdir) + return (EINVAL); + + dp = readdir(cf->cf_u.dir); + if (!dp) + return (ENOENT); + + /* + * Note: d_namlen is in the range 0..255 and therefore less + * than PATH_MAX so we don't 
need to test before copying. + */ + *fileno_return = dp->d_fileno; + *type_return = dp->d_type; + *namelen_return = dp->d_namlen; + memcpy(name, dp->d_name, dp->d_namlen); + name[dp->d_namlen] = 0; + + return (0); +} + +static int +cb_seek(void *arg, void *h, uint64_t offset, int whence) +{ + struct cb_file *cf = h; + + if (cf->cf_isdir) + return (EINVAL); + if (lseek(cf->cf_u.fd, offset, whence) < 0) + return (errno); + return (0); +} + +static int +cb_stat(void *arg, void *h, int *mode, int *uid, int *gid, uint64_t *size) +{ + struct cb_file *cf = h; + + *mode = cf->cf_stat.st_mode; + *uid = cf->cf_stat.st_uid; + *gid = cf->cf_stat.st_gid; + *size = cf->cf_stat.st_size; + return (0); +} + +/* + * Disk image i/o callbacks + */ + +static int +cb_diskread(void *arg, int unit, uint64_t from, void *to, size_t size, + size_t *resid) +{ + ssize_t n; + + if (unit < 0 || unit >= ndisks ) + return (EIO); + n = pread(disk_fd[unit], to, size, from); + if (n < 0) + return (errno); + *resid = size - n; + return (0); +} + +static int +cb_diskioctl(void *arg, int unit, u_long cmd, void *data) +{ + struct stat sb; + + if (unit < 0 || unit >= ndisks) + return (EBADF); + + switch (cmd) { + case DIOCGSECTORSIZE: + *(u_int *)data = 512; + break; + case DIOCGMEDIASIZE: + if (fstat(disk_fd[unit], &sb) == 0) + *(off_t *)data = sb.st_size; + else + return (ENOTTY); + break; + default: + return (ENOTTY); + } + + return (0); +} + +/* + * Guest virtual machine i/o callbacks + */ +static int +cb_copyin(void *arg, const void *from, uint64_t to, size_t size) +{ + char *ptr; + + to &= 0x7fffffff; + + ptr = vm_map_gpa(ctx, to, size); + if (ptr == NULL) + return (EFAULT); + + memcpy(ptr, from, size); + return (0); +} + +static int +cb_copyout(void *arg, uint64_t from, void *to, size_t size) +{ + char *ptr; + + from &= 0x7fffffff; + + ptr = vm_map_gpa(ctx, from, size); + if (ptr == NULL) + return (EFAULT); + + memcpy(to, ptr, size); + return (0); +} + +static void +cb_setreg(void *arg, int r, 
uint64_t v) +{ + int error; + enum vm_reg_name vmreg; + + vmreg = VM_REG_LAST; + + switch (r) { + case 4: + vmreg = VM_REG_GUEST_RSP; + rsp = v; + break; + default: + break; + } + + if (vmreg == VM_REG_LAST) { + printf("test_setreg(%d): not implemented\n", r); + cb_exit(NULL, USERBOOT_EXIT_QUIT); + } + + error = vm_set_register(ctx, BSP, vmreg, v); + if (error) { + perror("vm_set_register"); + cb_exit(NULL, USERBOOT_EXIT_QUIT); + } +} + +static void +cb_setmsr(void *arg, int r, uint64_t v) +{ + int error; + enum vm_reg_name vmreg; + + vmreg = VM_REG_LAST; + + switch (r) { + case MSR_EFER: + vmreg = VM_REG_GUEST_EFER; + break; + default: + break; + } + + if (vmreg == VM_REG_LAST) { + printf("test_setmsr(%d): not implemented\n", r); + cb_exit(NULL, USERBOOT_EXIT_QUIT); + } + + error = vm_set_register(ctx, BSP, vmreg, v); + if (error) { + perror("vm_set_msr"); + cb_exit(NULL, USERBOOT_EXIT_QUIT); + } +} + +static void +cb_setcr(void *arg, int r, uint64_t v) +{ + int error; + enum vm_reg_name vmreg; + + vmreg = VM_REG_LAST; + + switch (r) { + case 0: + vmreg = VM_REG_GUEST_CR0; + break; + case 3: + vmreg = VM_REG_GUEST_CR3; + cr3 = v; + break; + case 4: + vmreg = VM_REG_GUEST_CR4; + break; + default: + break; + } + + if (vmreg == VM_REG_LAST) { + printf("test_setcr(%d): not implemented\n", r); + cb_exit(NULL, USERBOOT_EXIT_QUIT); + } + + error = vm_set_register(ctx, BSP, vmreg, v); + if (error) { + perror("vm_set_cr"); + cb_exit(NULL, USERBOOT_EXIT_QUIT); + } +} + +static void +cb_setgdt(void *arg, uint64_t base, size_t size) +{ + int error; + + error = vm_set_desc(ctx, BSP, VM_REG_GUEST_GDTR, base, size - 1, 0); + if (error != 0) { + perror("vm_set_desc(gdt)"); + cb_exit(NULL, USERBOOT_EXIT_QUIT); + } + + gdtbase = base; +} + +static void +cb_exec(void *arg, uint64_t rip) +{ + int error; + + if (cr3 == 0) + error = vm_setup_freebsd_registers_i386(ctx, BSP, rip, gdtbase, + rsp); + else + error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase, + rsp); + if 
/*
 * One loader environment setting.  'str' points at a "name=value" string
 * supplied by the caller; addenv() stores the pointer and does not copy
 * the text.
 */
struct env {
	const char *str;	/* name=value */
	SLIST_ENTRY(env) next;
};

static SLIST_HEAD(envhead, env) envhead;

/*
 * Push 'str' onto the front of the environment list, so the most recently
 * added entry is the one cb_getenv() reports at index 0.
 *
 * Terminates the program through err(3) if the list node cannot be
 * allocated, instead of dereferencing a NULL pointer.
 */
static void
addenv(const char *str)
{
	struct env *env;

	env = malloc(sizeof(*env));
	if (env == NULL)
		err(EX_OSERR, "malloc");
	env->str = str;
	SLIST_INSERT_HEAD(&envhead, env, next);
}

/*
 * Loader callback: return the 'num'-th environment string counting from
 * the head of the list (0 is the most recently added), or NULL when the
 * list has no entry at that index.
 */
static const char *
cb_getenv(void *arg, int num)
{
	int i;
	struct env *env;

	i = 0;
	SLIST_FOREACH(env, &envhead, next) {
		if (i == num)
			return (env->str);
		i++;
	}

	return (NULL);
}
consout_fd = fd; + } + } + + return (err); +} + +static int +disk_open(char *path) +{ + int err, fd; + + if (ndisks >= NDISKS) + return (ERANGE); + + err = 0; + fd = open(path, O_RDONLY); + + if (fd > 0) { + disk_fd[ndisks] = fd; + ndisks++; + } else + err = errno; + + return (err); +} + +static void +usage(void) +{ + + fprintf(stderr, + "usage: %s [-c ] [-d ] [-e ]\n" + " %*s [-h ] [-m mem-size] \n", + progname, + (int)strlen(progname), ""); + exit(1); +} + +int +main(int argc, char** argv) +{ + void *h; + void (*func)(struct loader_callbacks *, void *, int, int); + uint64_t mem_size; + int opt, error, need_reinit; + + progname = basename(argv[0]); + + mem_size = 256 * MB; + + consin_fd = STDIN_FILENO; + consout_fd = STDOUT_FILENO; + + while ((opt = getopt(argc, argv, "c:d:e:h:m:")) != -1) { + switch (opt) { + case 'c': + error = altcons_open(optarg); + if (error != 0) + errx(EX_USAGE, "Could not open '%s'", optarg); + break; + + case 'd': + error = disk_open(optarg); + if (error != 0) + errx(EX_USAGE, "Could not open '%s'", optarg); + break; + + case 'e': + addenv(optarg); + break; + + case 'h': + host_base = optarg; + break; + + case 'm': + error = vm_parse_memsize(optarg, &mem_size); + if (error != 0) + errx(EX_USAGE, "Invalid memsize '%s'", optarg); + break; + case '?': + usage(); + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) + usage(); + + vmname = argv[0]; + + need_reinit = 0; + error = vm_create(vmname); + if (error) { + if (errno != EEXIST) { + perror("vm_create"); + exit(1); + } + need_reinit = 1; + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(1); + } + + if (need_reinit) { + error = vm_reinit(ctx); + if (error) { + perror("vm_reinit"); + exit(1); + } + } + + error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL); + if (error) { + perror("vm_setup_memory"); + exit(1); + } + + tcgetattr(consout_fd, &term); + oldterm = term; + cfmakeraw(&term); + term.c_cflag |= CLOCAL; + + tcsetattr(consout_fd, TCSAFLUSH, 
&term); + + h = dlopen("/boot/userboot.so", RTLD_LOCAL); + if (!h) { + printf("%s\n", dlerror()); + return (1); + } + func = dlsym(h, "loader_main"); + if (!func) { + printf("%s\n", dlerror()); + return (1); + } + + addenv("smbios.bios.vendor=BHYVE"); + addenv("boot_serial=1"); + + func(&cb, NULL, USERBOOT_VERSION_3, ndisks); +} diff --git a/libvmmapi/Makefile b/libvmmapi/Makefile new file mode 100644 index 0000000..26cf86f --- /dev/null +++ b/libvmmapi/Makefile @@ -0,0 +1,13 @@ +# $FreeBSD$ + +LIB= vmmapi +SRCS= vmmapi.c vmmapi_freebsd.c +INCS= vmmapi.h + +WARNS?= 2 + +LIBADD= util + +CFLAGS+= -I${.CURDIR} + +.include diff --git a/libvmmapi/vmmapi.c b/libvmmapi/vmmapi.c new file mode 100644 index 0000000..1e6e627 --- /dev/null +++ b/libvmmapi/vmmapi.c @@ -0,0 +1,1201 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/*
 * Open the device node "/dev/vmm/<name>" for reading and writing.
 *
 * Returns the descriptor from open(2), or -1 when the path buffer cannot
 * be allocated or open(2) fails.  The allocation is checked explicitly
 * because an assert() is compiled out under NDEBUG, which would leave a
 * NULL pointer to be passed to snprintf().
 */
static int
vm_device_open(const char *name)
{
	int fd;
	size_t len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	if (vmfile == NULL)
		return (-1);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}
vm_destroy(vm); + return (NULL); +} + +void +vm_destroy(struct vmctx *vm) +{ + assert(vm != NULL); + + if (vm->fd >= 0) + close(vm->fd); + DESTROY(vm->name); + + free(vm); +} + +int +vm_parse_memsize(const char *optarg, size_t *ret_memsize) +{ + char *endptr; + size_t optval; + int error; + + optval = strtoul(optarg, &endptr, 0); + if (*optarg != '\0' && *endptr == '\0') { + /* + * For the sake of backward compatibility if the memory size + * specified on the command line is less than a megabyte then + * it is interpreted as being in units of MB. + */ + if (optval < MB) + optval *= MB; + *ret_memsize = optval; + error = 0; + } else + error = expand_number(optarg, ret_memsize); + + return (error); +} + +int +vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired) +{ + int error; + struct vm_memory_segment seg; + + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); + *ret_len = seg.len; + if (wired != NULL) + *wired = seg.wired; + return (error); +} + +uint32_t +vm_get_lowmem_limit(struct vmctx *ctx) +{ + + return (ctx->lowmem_limit); +} + +void +vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) +{ + + ctx->lowmem_limit = limit; +} + +void +vm_set_memflags(struct vmctx *ctx, int flags) +{ + + ctx->memflags = flags; +} + +static int +setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr) +{ + int error, mmap_flags; + struct vm_memory_segment seg; + + /* + * Create and optionally map 'len' bytes of memory at guest + * physical address 'gpa' + */ + bzero(&seg, sizeof(seg)); + seg.gpa = gpa; + seg.len = len; + error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); + if (error == 0 && addr != NULL) { + mmap_flags = MAP_SHARED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + mmap_flags |= MAP_NOCORE; + *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, mmap_flags, + ctx->fd, gpa); + } + return (error); +} + +int +vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style 
vms) +{ + char **addr; + int error; + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); + ctx->vms = vms; + + /* + * If 'memsize' cannot fit entirely in the 'lowmem' segment then + * create another 'highmem' segment above 4GB for the remainder. + */ + if (memsize > ctx->lowmem_limit) { + ctx->lowmem = ctx->lowmem_limit; + ctx->highmem = memsize - ctx->lowmem; + } else { + ctx->lowmem = memsize; + ctx->highmem = 0; + } + + if (ctx->lowmem > 0) { + addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL; + error = setup_memory_segment(ctx, 0, ctx->lowmem, addr); + if (error) + return (error); + } + + if (ctx->highmem > 0) { + addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL; + error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr); + if (error) + return (error); + } + + return (0); +} + +void * +vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) +{ + + /* XXX VM_MMAP_SPARSE not implemented yet */ + assert(ctx->vms == VM_MMAP_ALL); + + if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) + return ((void *)(ctx->lowmem_addr + gaddr)); + + if (gaddr >= 4*GB) { + gaddr -= 4*GB; + if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem) + return ((void *)(ctx->highmem_addr + gaddr)); + } + + return (NULL); +} + +size_t +vm_get_lowmem_size(struct vmctx *ctx) +{ + + return (ctx->lowmem); +} + +size_t +vm_get_highmem_size(struct vmctx *ctx) +{ + + return (ctx->highmem); +} + +int +vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + vmsegdesc.desc.base = base; + vmsegdesc.desc.limit = limit; + vmsegdesc.desc.access = access; + + error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc); + return (error); +} + +int +vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t 
*access) +{ + int error; + struct vm_seg_desc vmsegdesc; + + bzero(&vmsegdesc, sizeof(vmsegdesc)); + vmsegdesc.cpuid = vcpu; + vmsegdesc.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc); + if (error == 0) { + *base = vmsegdesc.desc.base; + *limit = vmsegdesc.desc.limit; + *access = vmsegdesc.desc.access; + } + return (error); +} + +int +vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc) +{ + int error; + + error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit, + &seg_desc->access); + return (error); +} + +int +vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + vmreg.regval = val; + + error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg); + return (error); +} + +int +vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) +{ + int error; + struct vm_register vmreg; + + bzero(&vmreg, sizeof(vmreg)); + vmreg.cpuid = vcpu; + vmreg.regnum = reg; + + error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg); + *ret_val = vmreg.regval; + return (error); +} + +int +vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) +{ + int error; + struct vm_run vmrun; + + bzero(&vmrun, sizeof(vmrun)); + vmrun.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_RUN, &vmrun); + bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + return (error); +} + +int +vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) +{ + struct vm_suspend vmsuspend; + + bzero(&vmsuspend, sizeof(vmsuspend)); + vmsuspend.how = how; + return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); +} + +int +vm_reinit(struct vmctx *ctx) +{ + + return (ioctl(ctx->fd, VM_REINIT, 0)); +} + +int +vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vm_exception exc; + + exc.cpuid = vcpu; + exc.vector = vector; + exc.error_code = 
/*
 * Map a local APIC id to a vcpu number.
 *
 * The two numbering schemes are identical, so the APIC id is handed back
 * as the vcpu number; 'ctx' is not consulted.
 */
int
vm_apicid2vcpu(struct vmctx *ctx, int apicid)
{
	int vcpu;

	/* A vcpu's apic id has the same numerical value as the vcpu. */
	vcpu = apicid;

	return (vcpu);
}
+ + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq)); +} + +int +vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq)); +} + +int +vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq) +{ + struct vm_isa_irq isa_irq; + + bzero(&isa_irq, sizeof(struct vm_isa_irq)); + isa_irq.atpic_irq = atpic_irq; + isa_irq.ioapic_irq = ioapic_irq; + + return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq)); +} + +int +vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, + enum vm_intr_trigger trigger) +{ + struct vm_isa_irq_trigger isa_irq_trigger; + + bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger)); + isa_irq_trigger.atpic_irq = atpic_irq; + isa_irq_trigger.trigger = trigger; + + return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger)); +} + +int +vm_inject_nmi(struct vmctx *ctx, int vcpu) +{ + struct vm_nmi vmnmi; + + bzero(&vmnmi, sizeof(vmnmi)); + vmnmi.cpuid = vcpu; + + return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi)); +} + +static struct { + const char *name; + int type; +} capstrmap[] = { + { "hlt_exit", VM_CAP_HALT_EXIT }, + { "mtrap_exit", VM_CAP_MTRAP_EXIT }, + { "pause_exit", VM_CAP_PAUSE_EXIT }, + { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST }, + { "enable_invpcid", VM_CAP_ENABLE_INVPCID }, + { 0 } +}; + +int +vm_capability_name2type(const char *capname) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) { + if (strcmp(capstrmap[i].name, capname) == 0) + return (capstrmap[i].type); + } + + return (-1); +} + +const char * +vm_capability_type2name(int type) +{ + int i; + + for (i = 0; capstrmap[i].name != NULL; i++) { + if (capstrmap[i].type == type) + return 
(capstrmap[i].name); + } + + return (NULL); +} + +int +vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval) +{ + int error; + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + + error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap); + *retval = vmcap.capval; + return (error); +} + +int +vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) +{ + struct vm_capability vmcap; + + bzero(&vmcap, sizeof(vmcap)); + vmcap.cpuid = vcpu; + vmcap.captype = cap; + vmcap.capval = val; + + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); +} + +int +vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev)); +} + +int +vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev pptdev; + + bzero(&pptdev, sizeof(pptdev)); + pptdev.bus = bus; + pptdev.slot = slot; + pptdev.func = func; + + return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev)); +} + +int +vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + struct vm_pptdev_mmio pptmmio; + + bzero(&pptmmio, sizeof(pptmmio)); + pptmmio.bus = bus; + pptmmio.slot = slot; + pptmmio.func = func; + pptmmio.gpa = gpa; + pptmmio.len = len; + pptmmio.hpa = hpa; + + return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio)); +} + +int +vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + struct vm_pptdev_msi pptmsi; + + bzero(&pptmsi, sizeof(pptmsi)); + pptmsi.vcpu = vcpu; + pptmsi.bus = bus; + pptmsi.slot = slot; + pptmsi.func = func; + pptmsi.msg = msg; + pptmsi.addr = addr; + pptmsi.numvec = numvec; + + return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi)); +} + +int 
+vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + struct vm_pptdev_msix pptmsix; + + bzero(&pptmsix, sizeof(pptmsix)); + pptmsix.vcpu = vcpu; + pptmsix.bus = bus; + pptmsix.slot = slot; + pptmsix.func = func; + pptmsix.idx = idx; + pptmsix.msg = msg; + pptmsix.addr = addr; + pptmsix.vector_control = vector_control; + + return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); +} + +uint64_t * +vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries) +{ + int error; + + static struct vm_stats vmstats; + + vmstats.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_STATS, &vmstats); + if (error == 0) { + if (ret_entries) + *ret_entries = vmstats.num_entries; + if (ret_tv) + *ret_tv = vmstats.tv; + return (vmstats.statbuf); + } else + return (NULL); +} + +const char * +vm_get_stat_desc(struct vmctx *ctx, int index) +{ + static struct vm_stat_desc statdesc; + + statdesc.index = index; + if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) + return (statdesc.desc); + else + return (NULL); +} + +int +vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) +{ + int error; + struct vm_x2apic x2apic; + + bzero(&x2apic, sizeof(x2apic)); + x2apic.cpuid = vcpu; + + error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic); + *state = x2apic.state; + return (error); +} + +int +vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) +{ + int error; + struct vm_x2apic x2apic; + + bzero(&x2apic, sizeof(x2apic)); + x2apic.cpuid = vcpu; + x2apic.state = state; + + error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic); + + return (error); +} + +/* + * From Intel Vol 3a: + * Table 9-1. 
IA-32 Processor States Following Power-up, Reset or INIT + */ +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + int error; + uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx; + uint32_t desc_access, desc_limit; + uint16_t sel; + + zero = 0; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + rip = 0xfff0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + cr0 = CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0) + goto done; + + cr4 = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + /* + * CS: present, r/w, accessed, 16-bit, byte granularity, usable + */ + desc_base = 0xffff0000; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0xf000; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0) + goto done; + + /* + * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity + */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0) + goto done; + if ((error = 
vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0) + goto done; + + /* General purpose registers */ + rdx = 0xf00; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0) + goto done; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0) + goto done; + + /* GDTR, IDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, desc_access); + if (error != 0) + goto done; + + /* TR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0) + goto done; + + /* LDTR */ + desc_base = 0; + desc_limit = 0xffff; + desc_access = 0x00000082; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base, + desc_limit, desc_access); + if (error) + goto done; + + sel = 0; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* 
XXX cr2, debug registers */ + + error = 0; +done: + return (error); +} + +int +vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) +{ + int error, i; + struct vm_gpa_pte gpapte; + + bzero(&gpapte, sizeof(gpapte)); + gpapte.gpa = gpa; + + error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); + + if (error == 0) { + *num = gpapte.ptenum; + for (i = 0; i < gpapte.ptenum; i++) + pte[i] = gpapte.pte[i]; + } + + return (error); +} + +int +vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) +{ + int error; + struct vm_hpet_cap cap; + + bzero(&cap, sizeof(struct vm_hpet_cap)); + error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap); + if (capabilities != NULL) + *capabilities = cap.capabilities; + return (error); +} + +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) +{ + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.vcpuid = vcpu; + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; + + error = ioctl(ctx->fd, VM_GLA2GPA, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } + return (error); +} + +#ifndef min +#define min(a,b) (((a) < (b)) ? 
(a) : (b)) +#endif + +int +vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault) +{ + void *va; + uint64_t gpa; + int error, i, n, off; + + for (i = 0; i < iovcnt; i++) { + iov[i].iov_base = 0; + iov[i].iov_len = 0; + } + + while (len) { + assert(iovcnt > 0); + error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); + + off = gpa & PAGE_MASK; + n = min(len, PAGE_SIZE - off); + + va = vm_map_gpa(ctx, gpa, n); + if (va == NULL) + return (EFAULT); + + iov->iov_base = va; + iov->iov_len = n; + iov++; + iovcnt--; + + gla += n; + len -= n; + } + return (0); +} + +void +vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt) +{ + + return; +} + +void +vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len) +{ + const char *src; + char *dst; + size_t n; + + dst = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + src = iov->iov_base; + bcopy(src, dst, n); + + iov++; + dst += n; + len -= n; + } +} + +void +vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, + size_t len) +{ + const char *src; + char *dst; + size_t n; + + src = vp; + while (len) { + assert(iov->iov_len); + n = min(len, iov->iov_len); + dst = iov->iov_base; + bcopy(src, dst, n); + + iov++; + src += n; + len -= n; + } +} + +static int +vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) +{ + struct vm_cpuset vm_cpuset; + int error; + + bzero(&vm_cpuset, sizeof(struct vm_cpuset)); + vm_cpuset.which = which; + vm_cpuset.cpusetsize = sizeof(cpuset_t); + vm_cpuset.cpus = cpus; + + error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); + return (error); +} + +int +vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); +} + +int +vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, 
VM_SUSPENDED_CPUS, cpus)); +} + +int +vm_activate_cpu(struct vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac); + return (error); +} + +int +vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); + if (error == 0) { + *info1 = vmii.info1; + *info2 = vmii.info2; + } + return (error); +} + +int +vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + vmii.info1 = info1; + error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); + return (error); +} + +int +vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + rtcdata.value = value; + error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); + return (error); +} + +int +vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); + if (error == 0) + *retval = rtcdata.value; + return (error); +} + +int +vm_rtc_settime(struct vmctx *ctx, time_t secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + rtctime.secs = secs; + error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); + return (error); +} + +int +vm_rtc_gettime(struct vmctx *ctx, time_t *secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); + if (error == 0) + *secs = rtctime.secs; + return (error); +} + +int 
+vm_restart_instruction(void *arg, int vcpu) +{ + struct vmctx *ctx = arg; + + return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); +} diff --git a/libvmmapi/vmmapi.h b/libvmmapi/vmmapi.h new file mode 100644 index 0000000..d3ecdc4 --- /dev/null +++ b/libvmmapi/vmmapi.h @@ -0,0 +1,173 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMMAPI_H_ +#define _VMMAPI_H_ + +#include +#include + +/* + * API version for out-of-tree consumers like grub-bhyve for making compile + * time decisions. 
+ */ +#define VMMAPI_VERSION 0101 /* 2 digit major followed by 2 digit minor */ + +struct iovec; +struct vmctx; +enum x2apic_state; + +/* + * Different styles of mapping the memory assigned to a VM into the address + * space of the controlling process. + */ +enum vm_mmap_style { + VM_MMAP_NONE, /* no mapping */ + VM_MMAP_ALL, /* fully and statically mapped */ + VM_MMAP_SPARSE, /* mappings created on-demand */ +}; + +#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ + +int vm_create(const char *name); +struct vmctx *vm_open(const char *name); +void vm_destroy(struct vmctx *ctx); +int vm_parse_memsize(const char *optarg, size_t *memsize); +int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, + int *wired); +int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); +void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); +int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault); +uint32_t vm_get_lowmem_limit(struct vmctx *ctx); +void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); +void vm_set_memflags(struct vmctx *ctx, int flags); +size_t vm_get_lowmem_size(struct vmctx *ctx); +size_t vm_get_highmem_size(struct vmctx *ctx); +int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t base, uint32_t limit, uint32_t access); +int vm_get_desc(struct vmctx *ctx, int vcpu, int reg, + uint64_t *base, uint32_t *limit, uint32_t *access); +int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, + struct seg_desc *seg_desc); +int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); +int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); +int vm_reinit(struct vmctx *ctx); +int 
vm_apicid2vcpu(struct vmctx *ctx, int apicid); +int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, + int errcode_valid, uint32_t errcode, int restart_instruction); +int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector); +int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg); +int vm_ioapic_assert_irq(struct vmctx *ctx, int irq); +int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq); +int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq); +int vm_ioapic_pincount(struct vmctx *ctx, int *pincount); +int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq); +int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq, + enum vm_intr_trigger trigger); +int vm_inject_nmi(struct vmctx *ctx, int vcpu); +int vm_capability_name2type(const char *capname); +const char *vm_capability_type2name(int type); +int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int *retval); +int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, + int val); +int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func); +int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, + int func, uint64_t addr, uint64_t msg, int numvec); +int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, + int func, int idx, uint64_t addr, uint64_t msg, + uint32_t vector_control); + +int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); +int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); + +/* + * Return a pointer to the statistics buffer. 
Note that this is not MT-safe. + */ +uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, + int *ret_entries); +const char *vm_get_stat_desc(struct vmctx *ctx, int index); + +int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s); +int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s); + +int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); + +/* + * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. + * The 'iovcnt' should be big enough to accommodate all GPA segments. + * + * retval fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Error + */ +int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault); +void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, + void *host_dst, size_t len); +void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, + struct iovec *guest_iov, size_t len); +void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, + int iovcnt); + +/* RTC */ +int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); +int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); +int vm_rtc_settime(struct vmctx *ctx, time_t secs); +int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); + +/* Reset vcpu register state */ +int vcpu_reset(struct vmctx *ctx, int vcpu); + +int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_activate_cpu(struct vmctx *ctx, int vcpu); + +/* + * FreeBSD specific APIs + */ +int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp); +int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, + uint32_t eip, uint32_t gdtbase, + uint32_t esp); +void vm_setup_freebsd_gdt(uint64_t *gdtr); +#endif /*
_VMMAPI_H_ */ diff --git a/libvmmapi/vmmapi_freebsd.c b/libvmmapi/vmmapi_freebsd.c new file mode 100644 index 0000000..d801184 --- /dev/null +++ b/libvmmapi/vmmapi_freebsd.c @@ -0,0 +1,345 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include + +#include +#include + +#include "vmmapi.h" + +#define I386_TSS_SIZE 104 + +#define DESC_PRESENT 0x00000080 +#define DESC_LONGMODE 0x00002000 +#define DESC_DEF32 0x00004000 +#define DESC_GRAN 0x00008000 +#define DESC_UNUSABLE 0x00010000 + +#define GUEST_NULL_SEL 0 +#define GUEST_CODE_SEL 1 +#define GUEST_DATA_SEL 2 +#define GUEST_TSS_SEL 3 +#define GUEST_GDTR_LIMIT64 (3 * 8 - 1) + +static struct segment_descriptor i386_gdt[] = { + {}, /* NULL */ + { .sd_lolimit = 0xffff, .sd_type = SDT_MEMER, /* CODE */ + .sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 }, + { .sd_lolimit = 0xffff, .sd_type = SDT_MEMRW, /* DATA */ + .sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 }, + { .sd_lolimit = I386_TSS_SIZE - 1, /* TSS */ + .sd_type = SDT_SYS386TSS, .sd_p = 1 } +}; + +/* + * Setup the 'vcpu' register set such that it will begin execution at + * 'eip' in flat mode. + */ +int +vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip, + uint32_t gdtbase, uint32_t esp) +{ + uint64_t cr0, rflags, desc_base; + uint32_t desc_access, desc_limit, tssbase; + uint16_t gsel; + struct segment_descriptor *gdt; + int error, tmp; + + /* A 32-bit guest requires unrestricted mode. */ + error = vm_get_capability(vmctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); + if (error) + goto done; + error = vm_set_capability(vmctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + if (error) + goto done; + + cr0 = CR0_PE | CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, 0)) != 0) + goto done; + + /* + * Forcing EFER to 0 causes bhyve to clear the "IA-32e guest + * mode" entry control. 
+ */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, 0))) + goto done; + + gdt = vm_map_gpa(vmctx, gdtbase, 0x1000); + if (gdt == NULL) + return (EFAULT); + memcpy(gdt, i386_gdt, sizeof(i386_gdt)); + desc_base = gdtbase; + desc_limit = sizeof(i386_gdt) - 1; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + if (error != 0) + goto done; + + /* Place the TSS one page above the GDT. */ + tssbase = gdtbase + 0x1000; + gdt[3].sd_lobase = tssbase; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + desc_base = 0; + desc_limit = 0xffffffff; + desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMERA; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + + desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMRWA; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + desc_base = tssbase; + desc_limit = I386_TSS_SIZE - 1; + desc_access = DESC_PRESENT | SDT_SYS386BSY; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0, + DESC_UNUSABLE); + if (error) + goto done; + + gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); + if ((error = 
vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_TSS_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, gsel)) != 0) + goto done; + + /* LDTR is pointing to the null selector */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* entry point */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, eip)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, esp)) != 0) + goto done; + + error = 0; +done: + return (error); +} + +void +vm_setup_freebsd_gdt(uint64_t *gdtr) +{ + gdtr[GUEST_NULL_SEL] = 0; + gdtr[GUEST_CODE_SEL] = 0x0020980000000000; + gdtr[GUEST_DATA_SEL] = 0x0000900000000000; +} + +/* + * Setup the 'vcpu' register set such that it will begin execution at + * 'rip' in long mode. 
+ */ +int +vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp) +{ + int error; + uint64_t cr0, cr4, efer, rflags, desc_base; + uint32_t desc_access, desc_limit; + uint16_t gsel; + + cr0 = CR0_PE | CR0_PG | CR0_NE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) + goto done; + + cr4 = CR4_PAE; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) + goto done; + + efer = EFER_LME | EFER_LMA; + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer))) + goto done; + + rflags = 0x2; + error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags); + if (error) + goto done; + + desc_base = 0; + desc_limit = 0; + desc_access = 0x0000209B; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + desc_access = 0x00000093; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + if (error) + goto done; + + /* + * XXX TR is pointing to null selector even though we set the + * TSS segment to be usable with a base address and limit of 0. 
+ */ + desc_access = 0x0000008b; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access); + if (error) + goto done; + + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0, + DESC_UNUSABLE); + if (error) + goto done; + + gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0) + goto done; + + gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0) + goto done; + + /* XXX TR is pointing to the null selector */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0) + goto done; + + /* LDTR is pointing to the null selector */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0) + goto done; + + /* entry point */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0) + goto done; + + /* page table base */ + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0) + goto done; + + desc_base = gdtbase; + desc_limit = GUEST_GDTR_LIMIT64; + error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + if (error != 0) + goto done; + + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0) + goto done; + + error = 0; +done: + return (error); +} diff --git a/vmm.h b/vmm.h new file mode 100644 index 0000000..d3798bc --- /dev/null +++ b/vmm.h @@ -0,0 +1,648 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_H_ +#define _VMM_H_ + +#include + +enum vm_suspend_how { + VM_SUSPEND_NONE, + VM_SUSPEND_RESET, + VM_SUSPEND_POWEROFF, + VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, + VM_SUSPEND_LAST +}; + +/* + * Identifiers for architecturally defined registers. 
+ */ +enum vm_reg_name { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_DR7, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + VM_REG_GUEST_LDTR, + VM_REG_GUEST_TR, + VM_REG_GUEST_IDTR, + VM_REG_GUEST_GDTR, + VM_REG_GUEST_EFER, + VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_LAST +}; + +enum x2apic_state { + X2APIC_DISABLED, + X2APIC_ENABLED, + X2APIC_STATE_LAST +}; + +#define VM_INTINFO_VECTOR(info) ((info) & 0xff) +#define VM_INTINFO_DEL_ERRCODE 0x800 +#define VM_INTINFO_RSVD 0x7ffff000 +#define VM_INTINFO_VALID 0x80000000 +#define VM_INTINFO_TYPE 0x700 +#define VM_INTINFO_HWINTR (0 << 8) +#define VM_INTINFO_NMI (2 << 8) +#define VM_INTINFO_HWEXCEPTION (3 << 8) +#define VM_INTINFO_SWINTR (4 << 8) + +#ifdef _KERNEL + +#define VM_MAX_NAMELEN 32 + +struct vm; +struct vm_exception; +struct vm_memory_segment; +struct seg_desc; +struct vm_exit; +struct vm_run; +struct vhpet; +struct vioapic; +struct vlapic; +struct vmspace; +struct vm_object; +struct vm_guest_paging; +struct pmap; + +typedef int (*vmm_init_func_t)(int ipinum); +typedef int (*vmm_cleanup_func_t)(void); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, void *rendezvous_cookie, + void *suspend_cookie); +typedef void (*vmi_cleanup_func_t)(void *vmi); +typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int 
num, + uint64_t *retval); +typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, + uint64_t val); +typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, + struct seg_desc *desc); +typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); +typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); +typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); +typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); + +struct vmm_ops { + vmm_init_func_t init; /* module wide initialization */ + vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; + + vmi_init_func_t vminit; /* vm-specific initialization */ + vmi_run_func_t vmrun; + vmi_cleanup_func_t vmcleanup; + vmi_get_register_t vmgetreg; + vmi_set_register_t vmsetreg; + vmi_get_desc_t vmgetdesc; + vmi_set_desc_t vmsetdesc; + vmi_get_cap_t vmgetcap; + vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; + vmi_vlapic_init vlapic_init; + vmi_vlapic_cleanup vlapic_cleanup; +}; + +extern struct vmm_ops vmm_ops_intel; +extern struct vmm_ops vmm_ops_amd; + +int vm_create(const char *name, struct vm **retvm); +void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); +const char *vm_name(struct vm *vm); +int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); +int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); +void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, + void **cookie); +void vm_gpa_release(void *cookie); +int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg); +int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, + vm_offset_t *offset, 
struct vm_object **object); +boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa); +int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); +int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); +int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *ret_desc); +int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc); +int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); +int vm_inject_nmi(struct vm *vm, int vcpu); +int vm_nmi_pending(struct vm *vm, int vcpuid); +void vm_nmi_clear(struct vm *vm, int vcpuid); +int vm_inject_extint(struct vm *vm, int vcpu); +int vm_extint_pending(struct vm *vm, int vcpuid); +void vm_extint_clear(struct vm *vm, int vcpuid); +struct vlapic *vm_lapic(struct vm *vm, int cpu); +struct vioapic *vm_ioapic(struct vm *vm); +struct vhpet *vm_hpet(struct vm *vm); +int vm_get_capability(struct vm *vm, int vcpu, int type, int *val); +int vm_set_capability(struct vm *vm, int vcpu, int type, int val); +int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); +int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); +int vm_apicid2vcpuid(struct vm *vm, int apicid); +int vm_activate_cpu(struct vm *vm, int vcpu); +struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +/* + * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'. + * The rendezvous 'func(arg)' is not allowed to do anything that will + * cause the thread to be put to sleep. + * + * If the rendezvous is being initiated from a vcpu context then the + * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1. 
+ * + * The caller cannot hold any locks when initiating the rendezvous. + * + * The implementation of this API may cause vcpus other than those specified + * by 'dest' to be stalled. The caller should not rely on any vcpus making + * forward progress when the rendezvous is in progress. + */ +typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); +void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg); +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +static __inline int +vcpu_rendezvous_pending(void *rendezvous_cookie) +{ + + return (*(uintptr_t *)rendezvous_cookie != 0); +} + +static __inline int +vcpu_suspended(void *suspend_cookie) +{ + + return (*(int *)suspend_cookie); +} + +/* + * Return 1 if device indicated by bus/slot/func is supposed to be a + * pci passthrough device. + * + * Return 0 otherwise. + */ +int vmm_is_pptdev(int bus, int slot, int func); + +void *vm_iommu_domain(struct vm *vm); + +enum vcpu_state { + VCPU_IDLE, + VCPU_FROZEN, + VCPU_RUNNING, + VCPU_SLEEPING, +}; + +int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, + bool from_idle); +enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); + +static int __inline +vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) +{ + return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); +} + +#ifdef _SYS_PROC_H_ +static int __inline +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) + return (1); + else if (curthread->td_owepreempt) + return (1); + else + return (0); +} +#endif + +void *vcpu_stats(struct vm *vm, int vcpu); +void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vmspace *vm_get_vmspace(struct vm *vm); +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); 
+struct vatpic *vm_atpic(struct vm *vm); +struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); + +/* + * Inject exception 'vector' into the guest vcpu. This function returns 0 on + * success and non-zero on failure. + * + * Wrapper functions like 'vm_inject_gp()' should be preferred to calling + * this function directly because they enforce the trap-like or fault-like + * behavior of an exception. + * + * This function should only be called in the context of the thread that is + * executing this vcpu. + */ +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); + +/* + * This function is called after a VM-exit that occurred during exception or + * interrupt delivery through the IDT. The format of 'intinfo' is described + * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2. + * + * If a VM-exit handler completes the event delivery successfully then it + * should call vm_exit_intinfo() to extinguish the pending event. For example, + * if the task switch emulation is triggered via a task gate then it should + * call this function with 'intinfo=0' to indicate that the external event + * is not pending anymore. + * + * Return value is 0 on success and non-zero on failure. + */ +int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo); + +/* + * This function is called before every VM-entry to retrieve a pending + * event that should be injected into the guest. This function combines + * nested events into a double or triple fault. + * + * Returns 0 if there are no events that need to be injected into the guest + * and non-zero otherwise. 
+ */ +int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info); + +int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); + +enum vm_reg_name vm_segment_name(int seg_encoding); + +struct vm_copyinfo { + uint64_t gpa; + size_t len; + void *hva; + void *cookie; +}; + +/* + * Set up 'copyinfo[]' to copy to/from guest linear address space starting + * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for + * a copyin or PROT_WRITE for a copyout. + * + * retval is_fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error + * + * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if + * the return value is 0. The 'copyinfo[]' resources should be freed by calling + * 'vm_copy_teardown()' after the copy is done. + */ +int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *is_fault); +void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo); +void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + void *kaddr, size_t len); +void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); +#endif /* KERNEL */ + +#define VM_MAXCPU 16 /* maximum virtual cpus */ + +/* + * Identifiers for optional vmm capabilities + */ +enum vm_cap_type { + VM_CAP_HALT_EXIT, + VM_CAP_MTRAP_EXIT, + VM_CAP_PAUSE_EXIT, + VM_CAP_UNRESTRICTED_GUEST, + VM_CAP_ENABLE_INVPCID, + VM_CAP_MAX +}; + +enum vm_intr_trigger { + EDGE_TRIGGER, + LEVEL_TRIGGER +}; + +/* + * The 'access' field has the format specified in Table 21-2 of the Intel + * Architecture Manual vol 3b. + * + * XXX The contents of the 'access' field are architecturally defined except + * bit 16 - Segment Unusable. 
+ */ +struct seg_desc { + uint64_t base; + uint32_t limit; + uint32_t access; +}; +#define SEG_DESC_TYPE(access) ((access) & 0x001f) +#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) +#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) +#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0) +#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0) +#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0) + +enum vm_cpu_mode { + CPU_MODE_REAL, + CPU_MODE_PROTECTED, + CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */ + CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */ +}; + +enum vm_paging_mode { + PAGING_MODE_FLAT, + PAGING_MODE_32, + PAGING_MODE_PAE, + PAGING_MODE_64, +}; + +struct vm_guest_paging { + uint64_t cr3; + int cpl; + enum vm_cpu_mode cpu_mode; + enum vm_paging_mode paging_mode; +}; + +/* + * The data structures 'vie' and 'vie_op' are meant to be opaque to the + * consumers of instruction decoding. The only reason why their contents + * need to be exposed is because they are part of the 'vm_exit' structure. + */ +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. 
MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + index:4, + base:4; + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + uint8_t decoded; /* set to 1 if successfully decoded */ + + struct vie_op op; /* opcode description */ +}; + +enum vm_exitcode { + VM_EXITCODE_INOUT, + VM_EXITCODE_VMX, + VM_EXITCODE_BOGUS, + VM_EXITCODE_RDMSR, + VM_EXITCODE_WRMSR, + VM_EXITCODE_HLT, + VM_EXITCODE_MTRAP, + VM_EXITCODE_PAUSE, + VM_EXITCODE_PAGING, + VM_EXITCODE_INST_EMUL, + VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_RENDEZVOUS, + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, + VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_SVM, + VM_EXITCODE_MAX +}; + +struct vm_inout { + uint16_t bytes:3; /* 1 or 2 or 4 */ + uint16_t in:1; + uint16_t string:1; + uint16_t rep:1; + uint16_t port; + uint32_t eax; /* valid for out */ +}; + +struct vm_inout_str { + struct vm_inout inout; /* must be the first element */ + struct vm_guest_paging paging; + uint64_t rflags; + uint64_t cr0; + uint64_t index; + 
uint64_t count; /* rep=1 (%rcx), rep=0 (1) */ + int addrsize; + enum vm_reg_name seg_name; + struct seg_desc seg_desc; +}; + +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + +struct vm_exit { + enum vm_exitcode exitcode; + int inst_length; /* 0 means unknown */ + uint64_t rip; + union { + struct vm_inout inout; + struct vm_inout_str inout_str; + struct { + uint64_t gpa; + int fault_type; + } paging; + struct { + uint64_t gpa; + uint64_t gla; + uint64_t cs_base; + int cs_d; /* CS.D */ + struct vm_guest_paging paging; + struct vie vie; + } inst_emul; + /* + * VMX specific payload. Used when there is no "better" + * exitcode to represent the VM-exit. + */ + struct { + int status; /* vmx inst status */ + /* + * 'exit_reason' and 'exit_qualification' are valid + * only if 'status' is zero. + */ + uint32_t exit_reason; + uint64_t exit_qualification; + /* + * 'inst_error' and 'inst_type' are valid + * only if 'status' is non-zero. + */ + int inst_type; + int inst_error; + } vmx; + /* + * SVM specific payload. 
+ */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; + struct { + uint32_t code; /* ecx value */ + uint64_t wval; + } msr; + struct { + int vcpu; + uint64_t rip; + } spinup_ap; + struct { + uint64_t rflags; + } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; + } u; +}; + +/* APIs to inject faults into the guest */ +void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid, + int errcode); + +static __inline void +vm_inject_ud(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); +} + +static __inline void +vm_inject_gp(void *vm, int vcpuid) +{ + vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); +} + +static __inline void +vm_inject_ac(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); +} + +static __inline void +vm_inject_ss(void *vm, int vcpuid, int errcode) +{ + vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); +} + +void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2); + +int vm_restart_instruction(void *vm, int vcpuid); + +#endif /* _VMM_H_ */ diff --git a/vmm/amd/amdv.c b/vmm/amd/amdv.c new file mode 100644 index 0000000..3157e21 --- /dev/null +++ b/vmm/amd/amdv.c @@ -0,0 +1,133 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include "io/iommu.h" + +static int +amd_iommu_init(void) +{ + + printf("amd_iommu_init: not implemented\n"); + return (ENXIO); +} + +static void +amd_iommu_cleanup(void) +{ + + printf("amd_iommu_cleanup: not implemented\n"); +} + +static void +amd_iommu_enable(void) +{ + + printf("amd_iommu_enable: not implemented\n"); +} + +static void +amd_iommu_disable(void) +{ + + printf("amd_iommu_disable: not implemented\n"); +} + +static void * +amd_iommu_create_domain(vm_paddr_t maxaddr) +{ + + printf("amd_iommu_create_domain: not implemented\n"); + return (NULL); +} + +static void +amd_iommu_destroy_domain(void *domain) +{ + + printf("amd_iommu_destroy_domain: not implemented\n"); +} + +static uint64_t +amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + + printf("amd_iommu_create_mapping: not implemented\n"); + return (0); +} + +static uint64_t +amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + printf("amd_iommu_remove_mapping: not implemented\n"); + return (0); +} + +static void +amd_iommu_add_device(void *domain, uint16_t rid) +{ + + 
printf("amd_iommu_add_device: not implemented\n"); +} + +static void +amd_iommu_remove_device(void *domain, uint16_t rid) +{ + + printf("amd_iommu_remove_device: not implemented\n"); +} + +static void +amd_iommu_invalidate_tlb(void *domain) +{ + + printf("amd_iommu_invalidate_tlb: not implemented\n"); +} + +struct iommu_ops iommu_ops_amd = { + amd_iommu_init, + amd_iommu_cleanup, + amd_iommu_enable, + amd_iommu_disable, + amd_iommu_create_domain, + amd_iommu_destroy_domain, + amd_iommu_create_mapping, + amd_iommu_remove_mapping, + amd_iommu_add_device, + amd_iommu_remove_device, + amd_iommu_invalidate_tlb, +}; diff --git a/vmm/amd/npt.c b/vmm/amd/npt.c new file mode 100644 index 0000000..bebb4d5 --- /dev/null +++ b/vmm/amd/npt.c @@ -0,0 +1,87 @@ +/*- + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL); + +static int npt_flags; +SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD, + &npt_flags, 0, NULL); + +#define NPT_IPIMASK 0xFF + +/* + * AMD nested page table init. + */ +int +svm_npt_init(int ipinum) +{ + int enable_superpage = 1; + + npt_flags = ipinum & NPT_IPIMASK; + TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); + if (enable_superpage) + npt_flags |= PMAP_PDE_SUPERPAGE; + + return (0); +} + +static int +npt_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); +} + +struct vmspace * +svm_npt_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, npt_pinit)); +} + +void +svm_npt_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} diff --git a/vmm/amd/npt.h b/vmm/amd/npt.h new file mode 100644 index 0000000..5966474 --- /dev/null +++ b/vmm/amd/npt.h @@ -0,0 +1,36 @@ +/*- + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_NPT_H_ +#define _SVM_NPT_H_ + +int svm_npt_init(int ipinum); +struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); +void svm_npt_free(struct vmspace *vmspace); + +#endif /* _SVM_NPT_H_ */ diff --git a/vmm/amd/svm.c b/vmm/amd/svm.c new file mode 100644 index 0000000..20e8f76 --- /dev/null +++ b/vmm/amd/svm.c @@ -0,0 +1,2259 @@ +/*- + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_ktr.h" +#include "vmm_ioport.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "x86.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" +#include "svm_msr.h" +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); + +/* + * SVM CPUID function 0x8000_000A, edx bit decoding. + */ +#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ +#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ +#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ +#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ +#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. 
*/ +#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ +#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ +#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ +#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ +#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ + +#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ + VMCB_CACHE_IOPM | \ + VMCB_CACHE_I | \ + VMCB_CACHE_TPR | \ + VMCB_CACHE_CR2 | \ + VMCB_CACHE_CR | \ + VMCB_CACHE_DT | \ + VMCB_CACHE_SEG | \ + VMCB_CACHE_NP) + +static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, + 0, NULL); + +static MALLOC_DEFINE(M_SVM, "svm", "svm"); +static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); + +/* Per-CPU context area. */ +extern struct pcpu __pcpu[]; + +static uint32_t svm_feature; /* AMD SVM features. */ +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0, + "SVM features advertised by CPUID.8000000AH:EDX"); + +static int disable_npf_assist; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, + &disable_npf_assist, 0, NULL); + +/* Maximum ASIDs supported by the processor */ +static uint32_t nasid; +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0, + "Number of ASIDs supported by this processor"); + +/* Current ASID generation for each host cpu */ +static struct asid asid[MAXCPU]; + +/* + * SVM host state saved area of size 4KB for each core. 
+ */ +static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); + +static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); +static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); +static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); + +static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); + +static __inline int +flush_by_asid(void) +{ + + return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); +} + +static __inline int +decode_assist(void) +{ + + return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); +} + +static void +svm_disable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer &= ~EFER_SVM; + wrmsr(MSR_EFER, efer); +} + +/* + * Disable SVM on all CPUs. + */ +static int +svm_cleanup(void) +{ + + smp_rendezvous(NULL, svm_disable, NULL, NULL); + return (0); +} + +/* + * Verify that all the features required by bhyve are available. + */ +static int +check_svm_features(void) +{ + u_int regs[4]; + + /* CPUID Fn8000_000A is for SVM */ + do_cpuid(0x8000000A, regs); + svm_feature = regs[3]; + + nasid = regs[1]; + KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); + + /* bhyve requires the Nested Paging feature */ + if (!(svm_feature & AMD_CPUID_SVM_NP)) { + printf("SVM: Nested Paging feature not available.\n"); + return (ENXIO); + } + + /* bhyve requires the NRIP Save feature */ + if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { + printf("SVM: NRIP Save feature not available.\n"); + return (ENXIO); + } + + return (0); +} + +static void +svm_enable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer |= EFER_SVM; + wrmsr(MSR_EFER, efer); + + wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); +} + +/* + * Return 1 if SVM is enabled on this processor and 0 otherwise. + */ +static int +svm_available(void) +{ + uint64_t msr; + + /* Section 15.4 Enabling SVM from APM2. 
*/ + if ((amd_feature2 & AMDID2_SVM) == 0) { + printf("SVM: not available.\n"); + return (0); + } + + msr = rdmsr(MSR_VM_CR); + if ((msr & VM_CR_SVMDIS) != 0) { + printf("SVM: disabled by BIOS.\n"); + return (0); + } + + return (1); +} + +static int +svm_init(int ipinum) +{ + int error, cpu; + + if (!svm_available()) + return (ENXIO); + + error = check_svm_features(); + if (error) + return (error); + + vmcb_clean &= VMCB_CACHE_DEFAULT; + + for (cpu = 0; cpu < MAXCPU; cpu++) { + /* + * Initialize the host ASIDs to their "highest" valid values. + * + * The next ASID allocation will rollover both 'gen' and 'num' + * and start off the sequence at {1,1}. + */ + asid[cpu].gen = ~0UL; + asid[cpu].num = nasid - 1; + } + + svm_msr_init(); + svm_npt_init(ipinum); + + /* Enable SVM on all CPUs */ + smp_rendezvous(NULL, svm_enable, NULL, NULL); + + return (0); +} + +static void +svm_restore(void) +{ + + svm_enable(NULL); +} + +/* Pentium compatible MSRs */ +#define MSR_PENTIUM_START 0 +#define MSR_PENTIUM_END 0x1FFF +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000UL +#define MSR_AMD6TH_END 0xC0001FFFUL +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000UL +#define MSR_AMD7TH_END 0xC0011FFFUL + +/* + * Get the index and bit position for a MSR in permission bitmap. + * Two bits are used for each MSR: lower bit for read and higher bit for write. 
+ */ +static int +svm_msr_index(uint64_t msr, int *index, int *bit) +{ + uint32_t base, off; + + *index = -1; + *bit = (msr % 4) * 2; + base = 0; + + if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) { + *index = msr / 4; + return (0); + } + + base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); + if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { + off = (msr - MSR_AMD6TH_START); + *index = (off + base) / 4; + return (0); + } + + base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); + if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { + off = (msr - MSR_AMD7TH_START); + *index = (off + base) / 4; + return (0); + } + + return (EINVAL); +} + +/* + * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. + */ +static void +svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) +{ + int index, bit, error; + + error = svm_msr_index(msr, &index, &bit); + KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); + KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, + ("%s: invalid index %d for msr %#lx", __func__, index, msr)); + KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " + "msr %#lx", __func__, bit, msr)); + + if (read) + perm_bitmap[index] &= ~(1UL << bit); + + if (write) + perm_bitmap[index] &= ~(2UL << bit); +} + +static void +svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, true); +} + +static void +svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, false); +} + +static __inline int +svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + return (ctrl->intercept[idx] & bitmask ? 
1 : 0);
+}
+/*
+ * Set (when 'enabled' is non-zero) or clear the bits in 'bitmask' within
+ * intercept word 'idx' of the vcpu's VMCB control area.  'idx' is asserted
+ * to lie in the range [0, 5).  If the stored word actually changes, the
+ * VMCB_CACHE_I state is marked dirty and the old and new values are traced.
+ */
+static __inline void
+svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
+    int enabled)
+{
+	struct vmcb_ctrl *ctrl;
+	uint32_t oldval;
+
+	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	oldval = ctrl->intercept[idx];
+
+	if (enabled)
+		ctrl->intercept[idx] |= bitmask;
+	else
+		ctrl->intercept[idx] &= ~bitmask;
+
+	/* Only flag the VMCB as modified when something really changed. */
+	if (ctrl->intercept[idx] != oldval) {
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
+		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
+		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
+	}
+}
+/* Clear the bits in 'bitmask' within intercept word 'off'. */
+static __inline void
+svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
+{
+
+	svm_set_intercept(sc, vcpu, off, bitmask, 0);
+}
+/* Set the bits in 'bitmask' within intercept word 'off'. */
+static __inline void
+svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
+{
+
+	svm_set_intercept(sc, vcpu, off, bitmask, 1);
+}
+/*
+ * Program the initial contents of a vcpu's VMCB: the I/O and MSR permission
+ * map addresses, nested paging (with 'np_pml4' as the nested cr3), the
+ * control register, exception and instruction intercepts, virtual interrupt
+ * masking, LBR virtualization, EFER and the guest PAT.
+ */
+static void
+vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
+    uint64_t msrpm_base_pa, uint64_t np_pml4)
+{
+	struct vmcb_ctrl *ctrl;
+	struct vmcb_state *state;
+	uint32_t mask;
+	int n;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	state = svm_get_vmcb_state(sc, vcpu);
+
+	ctrl->iopm_base_pa = iopm_base_pa;
+	ctrl->msrpm_base_pa = msrpm_base_pa;
+
+	/* Enable nested paging */
+	ctrl->np_enable = 1;
+	ctrl->n_cr3 = np_pml4;
+
+	/*
+	 * Intercept accesses to the control registers that are not shadowed
+	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
+	 */
+	for (n = 0; n < 16; n++) {
+		mask = (BIT(n) << 16) | BIT(n);	/* bits n and n+16 for CRn */
+		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
+			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
+		else
+			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
+	}
+
+
+	/*
+	 * Intercept everything when tracing guest exceptions otherwise
+	 * just intercept machine check exception.
+	 */
+	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
+		for (n = 0; n < 32; n++) {
+			/*
+			 * Skip unimplemented vectors in the exception bitmap.
+			 */
+			if (n == 2 || n == 9) {
+				continue;
+			}
+			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
+		}
+	} else {
+		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
+	}
+
+	/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_FERR_FREEZE);
+
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
+
+	/*
+	 * From section "Canonicalization and Consistency Checks" in APMv2
+	 * the VMRUN intercept bit must be set to pass the consistency check.
+	 */
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
+
+	/*
+	 * The ASID will be set to a non-zero value just before VMRUN.
+	 */
+	ctrl->asid = 0;
+
+	/*
+	 * Section 15.21.1, Interrupt Masking in EFLAGS
+	 * Section 15.21.2, Virtualizing APIC.TPR
+	 *
+	 * This must be set for %rflags and %cr8 isolation of guest and host.
+	 */
+	ctrl->v_intr_masking = 1;
+
+	/* Enable Last Branch Record aka LBR for debugging */
+	ctrl->lbr_virt_en = 1;
+	state->dbgctl = BIT(0);
+
+	/* EFER_SVM must always be set when the guest is executing */
+	state->efer = EFER_SVM;
+
+	/* Set up the PAT to power-on state */
+	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
+	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
+	    PAT_VALUE(2, PAT_UNCACHED)		|
+	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
+	    PAT_VALUE(4, PAT_WRITE_BACK)	|
+	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
+	    PAT_VALUE(6, PAT_UNCACHED)		|
+	    PAT_VALUE(7, PAT_UNCACHEABLE);
+}
+
+/*
+ * Initialize a virtual machine.
+ *
+ * Allocates the zeroed per-VM softc, records the physical address of the
+ * nested page table root taken from 'pmap', builds the MSR and I/O
+ * permission bitmaps and initializes the VMCB of each of the VM_MAXCPU
+ * vcpus.  Returns the softc.
+ */
+static void *
+svm_vminit(struct vm *vm, pmap_t pmap)
+{
+	struct svm_softc *svm_sc;
+	struct svm_vcpu *vcpu;
+	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
+	int i;
+
+	svm_sc = malloc(sizeof (struct svm_softc), M_SVM, M_WAITOK | M_ZERO);
+	svm_sc->vm = vm;
+	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
+
+	/*
+	 * Intercept read and write accesses to all MSRs.
+	 */
+	memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap));
+
+	/*
+	 * Access to the following MSRs is redirected to the VMCB when the
+	 * guest is executing. Therefore it is safe to allow the guest to
+	 * read/write these MSRs directly without hypervisor involvement.
+	 */
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
+
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
+	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
+
+	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
+
+	/*
+	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
+	 */
+	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
+
+	/* Intercept access to all I/O ports. */
+	memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap));
+
+	/* The bitmaps and the nested page table root are shared by all vcpus. */
+	iopm_pa = vtophys(svm_sc->iopm_bitmap);
+	msrpm_pa = vtophys(svm_sc->msr_bitmap);
+	pml4_pa = svm_sc->nptp;
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vcpu = svm_get_vcpu(svm_sc, i);
+		vcpu->nextrip = ~0;
+		vcpu->lastcpu = NOCPU;
+		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
+		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
+		svm_msr_guest_init(svm_sc, i);
+	}
+	return (svm_sc);
+}
+
+/*
+ * Collateral for a generic SVM VM-exit.
+ *
+ * Marks 'vme' as VM_EXITCODE_SVM and stores 'code', 'info1' and 'info2'
+ * in its svm member.
+ */
+static void
+vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
+{
+
+	vme->exitcode = VM_EXITCODE_SVM;
+	vme->u.svm.exitcode = code;
+	vme->u.svm.exitinfo1 = info1;
+	vme->u.svm.exitinfo2 = info2;
+}
+/* Return the guest's current privilege level as stored in the VMCB. */
+static int
+svm_cpl(struct vmcb_state *state)
+{
+
+	/*
+	 * From APMv2:
+	 *   "Retrieve the CPL from the CPL field in the VMCB, not
+	 *    from any segment DPL"
+	 */
+	return (state->cpl);
+}
+/*
+ * Derive the vcpu's cpu mode from EFER.LMA, the 'L' attribute of the guest
+ * code segment and CR0.PE.
+ */
+static enum vm_cpu_mode
+svm_vcpu_mode(struct vmcb *vmcb)
+{
+	struct vmcb_segment seg;
+	struct vmcb_state *state;
+	int error;
+
+	state = &vmcb->state;
+
+	if (state->efer & EFER_LMA) {
+		error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+		KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
+		    error));
+
+		/*
+		 * Section 4.8.1 for APM2, check if Code Segment has
+		 * Long attribute set in descriptor.
+		 */
+		if (seg.attrib & VMCB_CS_ATTRIB_L)
+			return (CPU_MODE_64BIT);
+		else
+			return (CPU_MODE_COMPATIBILITY);
+	} else  if (state->cr0 & CR0_PE) {
+		return (CPU_MODE_PROTECTED);
+	} else {
+		return (CPU_MODE_REAL);
+	}
+}
+/*
+ * Derive the guest paging mode from CR0.PG, CR4.PAE and EFER.LME, checked
+ * in that order.
+ */
+static enum vm_paging_mode
+svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
+{
+
+	if ((cr0 & CR0_PG) == 0)
+		return (PAGING_MODE_FLAT);
+	if ((cr4 & CR4_PAE) == 0)
+		return (PAGING_MODE_32);
+	if (efer & EFER_LME)
+		return (PAGING_MODE_64);
+	else
+		return (PAGING_MODE_PAE);
+}
+
+/*
+ * ins/outs utility routines
+ *
+ * svm_inout_str_index() returns the saved %rdi for 'in' and the saved %rsi
+ * otherwise.
+ */
+static uint64_t
+svm_inout_str_index(struct svm_regctx *regs, int in)
+{
+	uint64_t val;
+
+	val = in ? regs->sctx_rdi : regs->sctx_rsi;
+
+	return (val);
+}
+/* Iteration count: the saved %rcx when 'rep' is set, otherwise 1. */
+static uint64_t
+svm_inout_str_count(struct svm_regctx *regs, int rep)
+{
+	uint64_t val;
+
+	val = rep ? regs->sctx_rcx : 1;
+
+	return (val);
+}
+/*
+ * Record in 'vis' the segment used by the string instruction: %es for 'in',
+ * otherwise the segment encoded in bits 12:10 of 'info1'.  The segment's
+ * descriptor is then fetched into 'vis->seg_desc'.
+ */
+static void
+svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
+    int in, struct vm_inout_str *vis)
+{
+	int error, s;
+
+	if (in) {
+		vis->seg_name = VM_REG_GUEST_ES;
+	} else {
+		/* The segment field has standard encoding */
+		s = (info1 >> 10) & 0x7;
+		vis->seg_name = vm_segment_name(s);
+	}
+
+	error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
+	KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
+}
+/*
+ * Decode bits 9:7 of 'info1' into an address size in bytes.  Any encoding
+ * other than 1, 2 or 4 panics.
+ */
+static int
+svm_inout_str_addrsize(uint64_t info1)
+{
+        uint32_t size;
+
+        size = (info1 >> 7) & 0x7;
+        switch (size) {
+        case 1:
+                return (2);     /* 16 bit */
+        case 2:
+                return (4);     /* 32 bit */
+        case 4:
+                return (8);     /* 64 bit */
+        default:
+                panic("%s: invalid size encoding %d", __func__, size);
+        }
+}
+/*
+ * Fill 'paging' with the guest cr3, cpl, cpu mode and paging mode taken
+ * from the VMCB.
+ */
+static void
+svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
+{
+	struct vmcb_state *state;
+
+	state = &vmcb->state;
+	paging->cr3 = state->cr3;
+	paging->cpl = svm_cpl(state);
+	paging->cpu_mode = svm_vcpu_mode(vmcb);
+	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
+	    state->efer);
+}
+
+#define	UNHANDLED 0
+
+/*
+ * 
Handle guest I/O intercept.
+ */
+static int
+svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
+{
+	struct vmcb_ctrl *ctrl;
+	struct vmcb_state *state;
+	struct svm_regctx *regs;
+	struct vm_inout_str *vis;
+	uint64_t info1;
+	int inout_string;
+
+	state = svm_get_vmcb_state(svm_sc, vcpu);
+	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
+	regs  = svm_get_guest_regctx(svm_sc, vcpu);
+
+	info1 = ctrl->exitinfo1;
+	inout_string = info1 & BIT(2) ? 1 : 0;	/* string (ins/outs) form */
+
+	/*
+	 * The effective segment number in EXITINFO1[12:10] is populated
+	 * only if the processor has the DecodeAssist capability.
+	 *
+	 * XXX this is not specified explicitly in APMv2 but can be verified
+	 * empirically.
+	 */
+	if (inout_string && !decode_assist())
+		return (UNHANDLED);
+	/*
+	 * Decode EXITINFO1 into the generic in/out exit: bit 0 selects 'in',
+	 * bit 3 'rep', bits 6:4 give the 'bytes' field and bits 31:16 the
+	 * port.  The low 32 bits of the guest %rax are passed along as 'eax'.
+	 */
+	vmexit->exitcode = VM_EXITCODE_INOUT;
+	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
+	vmexit->u.inout.string 	= inout_string;
+	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
+	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
+	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
+	vmexit->u.inout.eax 	= (uint32_t)(state->rax);
+	/* String forms additionally carry paging, index, count and segment. */
+	if (inout_string) {
+		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
+		vis = &vmexit->u.inout_str;
+		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
+		vis->rflags = state->rflags;
+		vis->cr0 = state->cr0;
+		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
+		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
+		vis->addrsize = svm_inout_str_addrsize(info1);
+		svm_inout_str_seginfo(svm_sc, vcpu, info1,
+		    vmexit->u.inout.in, vis);
+	}
+	/* Every path returns UNHANDLED; 'vmexit' is left for the caller. */
+	return (UNHANDLED);
+}
+/*
+ * Map the nested page fault EXITINFO1 bits to a VM_PROT_* fault type.  The
+ * W bit is checked before the ID bit; with neither set the fault is
+ * reported as a read.
+ */
+static int
+npf_fault_type(uint64_t exitinfo1)
+{
+
+	if (exitinfo1 & VMCB_NPF_INFO1_W)
+		return (VM_PROT_WRITE);
+	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
+		return (VM_PROT_EXECUTE);
+	else
+		return (VM_PROT_READ);
+}
+/*
+ * Return true if the nested page fault described by 'exitinfo1' is a
+ * candidate for instruction emulation: the ID and GPT bits must be clear
+ * and the GPA bit must be set.
+ */
+static bool
+svm_npf_emul_fault(uint64_t exitinfo1)
+{
+	
+	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
+		return (false);
+	}
+
+	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
+		return (false);
+	}
+
+	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
+		return (false);
+	}
+
+	return (true);	
+}
+/*
+ * Populate 'vmexit' as a VM_EXITCODE_INST_EMUL exit for guest physical
+ * address 'gpa': paging information, the code segment base and D bit
+ * appropriate for the cpu mode and, when available, the instruction bytes
+ * recorded in the VMCB control area.
+ */
+static void
+svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
+{
+	struct vm_guest_paging *paging;
+	struct vmcb_segment seg;
+	struct vmcb_ctrl *ctrl;
+	char *inst_bytes;
+	int error, inst_len;
+
+	ctrl = &vmcb->ctrl;
+	paging = &vmexit->u.inst_emul.paging;
+
+	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+	vmexit->u.inst_emul.gpa = gpa;
+	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
+	svm_paging_info(vmcb, paging);
+
+	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
+	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
+
+	switch(paging->cpu_mode) {
+	case CPU_MODE_REAL:
+		vmexit->u.inst_emul.cs_base = seg.base;
+		vmexit->u.inst_emul.cs_d = 0;
+		break;
+	case CPU_MODE_PROTECTED:
+	case CPU_MODE_COMPATIBILITY:
+		vmexit->u.inst_emul.cs_base = seg.base;
+
+		/*
+		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
+		 */
+		vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
+		    1 : 0;
+		break;
+	default:
+		/* Remaining modes (e.g. 64-bit): CS base and D bit are zero. */
+		vmexit->u.inst_emul.cs_base = 0;
+		vmexit->u.inst_emul.cs_d = 0;
+		break;	
+	}
+
+	/*
+	 * Copy the instruction bytes into 'vie' if available.
+	 */
+	if (decode_assist() && !disable_npf_assist) {
+		inst_len = ctrl->inst_len;
+		inst_bytes = ctrl->inst_bytes;
+	} else {
+		inst_len = 0;
+		inst_bytes = NULL;
+	}
+	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
+}
+/*
+ * Name of a VMCB event injection type, used in trace messages.  Only built
+ * when KTR is defined; an unknown type panics.
+ */
+#ifdef KTR
+static const char *
+intrtype_to_str(int intr_type)
+{
+	switch (intr_type) {
+	case VMCB_EVENTINJ_TYPE_INTR:
+		return ("hwintr");
+	case VMCB_EVENTINJ_TYPE_NMI:
+		return ("nmi");
+	case VMCB_EVENTINJ_TYPE_INTn:
+		return ("swintr");
+	case VMCB_EVENTINJ_TYPE_EXCEPTION:
+		return ("exception");
+	default:
+		panic("%s: unknown intr_type %d", __func__, intr_type);
+	}
+}
+#endif
+
+/*
+ * Inject an event to vcpu as described in section 15.20, "Event injection".
+ */
+static void
+svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
+		 uint32_t error, bool ec_valid)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	/* Only one event can be queued in EVENTINJ at a time. */
+	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
+	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));
+
+	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
+	    __func__, vector));
+	/* Exceptions are limited to vectors 0-31 excluding 2. */
+	switch (intr_type) {
+	case VMCB_EVENTINJ_TYPE_INTR:
+	case VMCB_EVENTINJ_TYPE_NMI:
+	case VMCB_EVENTINJ_TYPE_INTn:
+		break;
+	case VMCB_EVENTINJ_TYPE_EXCEPTION:
+		if (vector >= 0 && vector <= 31 && vector != 2)
+			break;
+		/* FALLTHROUGH */
+	default:
+		panic("%s: invalid intr_type/vector: %d/%d", __func__,
+		    intr_type, vector);
+	}
+	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
+	if (ec_valid) {
+		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
+		ctrl->eventinj |= (uint64_t)error << 32; /* bits 63:32 */
+		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
+		    intrtype_to_str(intr_type), vector, error);
+	} else {
+		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
+		    intrtype_to_str(intr_type), vector);
+	}
+}
+/*
+ * Propagate the VMCB's V_TPR to the vlapic and, if a V_IRQ injection was
+ * outstanding (non-zero v_intr_vector), tell the vlapic when the vector
+ * was accepted by the guest.
+ */
+static void
+svm_update_virqinfo(struct svm_softc *sc, int vcpu)
+{
+	struct vm *vm;
+	struct vlapic *vlapic;
+	struct vmcb_ctrl *ctrl;
+	int pending;
+
+	vm = sc->vm;
+	vlapic = vm_lapic(vm, vcpu);
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	/* Update %cr8 in the emulated vlapic */
+	vlapic_set_cr8(vlapic, ctrl->v_tpr);
+
+	/*
+	 * If V_IRQ indicates that the interrupt injection attempted on the
+	 * last VMRUN was successful then update the vlapic accordingly.
+	 */
+	if (ctrl->v_intr_vector != 0) {
+		pending = ctrl->v_irq;
+		KASSERT(ctrl->v_intr_vector >= 16, ("%s: invalid "
+		    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
+		KASSERT(!ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
+		VCPU_CTR2(vm, vcpu, "v_intr_vector %d %s", ctrl->v_intr_vector,
+		    pending ? "pending" : "accepted");
+		if (!pending)
+			vlapic_intr_accepted(vlapic, ctrl->v_intr_vector);
+	}
+}
+/*
+ * If EXITINTINFO in the VMCB is valid, hand it to vm_exit_intinfo() and
+ * bump the VCPU_EXITINTINFO statistic.
+ */
+static void
+svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+	uint64_t intinfo;
+	
+	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
+	intinfo = ctrl->exitintinfo;	
+	if (!VMCB_EXITINTINFO_VALID(intinfo))
+		return;
+
+	/*
+	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
+	 *
+	 * If a #VMEXIT happened during event delivery then record the event
+	 * that was being delivered.
+	 */
+	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
+		intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
+	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
+	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
+}
+/* Report the state of the VINTR intercept via svm_get_intercept(). */
+static __inline int
+vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
+{
+
+	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_VINTR));
+}
+/*
+ * Arm interrupt window exiting: post a virtual interrupt with vector 0 that
+ * ignores the TPR (V_IRQ = 1, V_IGN_TPR = 1) and enable the VINTR
+ * intercept.  Nothing is done if that state is already in place.
+ */
+static __inline void
+enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
+		KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
+		KASSERT(vintr_intercept_enabled(sc, vcpu),
+		    ("%s: vintr intercept should be enabled", __func__));
+		return;
+	}
+
+	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
+	ctrl->v_irq = 1;
+	ctrl->v_ign_tpr = 1;
+	ctrl->v_intr_vector = 0;
+	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
+}
+/*
+ * Clear V_IRQ and the virtual interrupt vector and disable the VINTR
+ * intercept.  This cancels either an armed interrupt window or a V_IRQ
+ * injection; nothing is done if neither is outstanding.
+ */
+static __inline void
+disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
+		KASSERT(!vintr_intercept_enabled(sc, vcpu),
+		    ("%s: vintr intercept should be disabled", __func__));
+		return;
+	}
+
+#ifdef KTR
+	if (ctrl->v_intr_vector == 0)
+		VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
+	else
+		VCPU_CTR0(sc->vm, vcpu, "Clearing V_IRQ interrupt injection");
+#endif
+	ctrl->v_irq = 0;
+	ctrl->v_intr_vector = 0;
+	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
+}
+/*
+ * Set the VMCB interrupt shadow to 1 if 'val' is non-zero and to 0
+ * otherwise.  Always returns 0.
+ */
+static int
+svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
+{
+	struct vmcb_ctrl *ctrl;
+	int oldval, newval;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	oldval = ctrl->intr_shadow;
+	newval = val ? 1 : 0;
+	if (newval != oldval) {
+		ctrl->intr_shadow = newval;
+		VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval);
+	}
+	return (0);
+}
+/* Store the VMCB interrupt shadow in '*val'.  Always returns 0. */
+static int
+svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	*val = ctrl->intr_shadow;
+	return (0);
+}
+
+/*
+ * Once an NMI is injected it blocks delivery of further NMIs until the handler
+ * executes an IRET. The IRET intercept is enabled when an NMI is injected
+ * to track when the vcpu is done handling the NMI.
+ */
+static int
+nmi_blocked(struct svm_softc *sc, int vcpu)
+{
+	int blocked;
+
+	blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_IRET);
+	return (blocked);
+}
+/* Start virtual NMI blocking by enabling the IRET intercept. */
+static void
+enable_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+
+	KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
+}
+/*
+ * End virtual NMI blocking: disable the IRET intercept and set the
+ * interrupt shadow so that an NMI is not injected on the very next VMRUN.
+ */
+static void
+clear_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+	int error;
+
+	KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
+	/*
+	 * When the IRET intercept is cleared the vcpu will attempt to execute
+	 * the "iret" when it runs next. However, it is possible to inject
+	 * another NMI into the vcpu before the "iret" has actually executed.
+	 *
+	 * For e.g. if the "iret" encounters a #NPF when accessing the stack
+	 * it will trap back into the hypervisor. If an NMI is pending for
+	 * the vcpu it will be injected into the guest.
+	 *
+	 * XXX this needs to be fixed
+	 */
+	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
+
+	/*
+	 * Set 'intr_shadow' to prevent an NMI from being injected on the
+	 * immediate VMRUN.
+	 */
+	error = svm_modify_intr_shadow(sc, vcpu, 1);
+	KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
+}
+/* A guest EFER write with any of these bits set is answered with #GP. */
+#define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
+/*
+ * Emulate a guest write of 'newval' to EFER.  Values that fail the checks
+ * below inject #GP into the guest.  An attempt to set EFER_LMSLE is instead
+ * reported as a VMCB_EXIT_MSR SVM exit with '*retu' set to true.
+ * Otherwise the new value is stored with svm_setreg().  Always returns 0.
+ */
+static int
+svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu)
+{
+	struct vm_exit *vme;
+	struct vmcb_state *state;
+	uint64_t changed, lma, oldval;
+	int error;
+
+	state = svm_get_vmcb_state(sc, vcpu);
+
+	oldval = state->efer;
+	VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);
+
+	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
+	changed = oldval ^ newval;
+
+	if (newval & EFER_MBZ_BITS)
+		goto gpf;
+
+	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
+	if (changed & EFER_LME) {
+		if (state->cr0 & CR0_PG)
+			goto gpf;
+	}
+
+	/* EFER.LMA = EFER.LME & CR0.PG */
+	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
+		lma = EFER_LMA;
+	else
+		lma = 0;
+	/* The guest-supplied LMA bit must match the computed value. */
+	if ((newval & EFER_LMA) != lma)
+		goto gpf;
+
+	if (newval & EFER_NXE) {
+		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
+			goto gpf;
+	}
+
+	/*
+	 * XXX bhyve does not enforce segment limits in 64-bit mode. Until
+	 * this is fixed flag guest attempt to set EFER_LMSLE as an error.
+	 */
+	if (newval & EFER_LMSLE) {
+		vme = vm_exitinfo(sc->vm, vcpu);
+		vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
+		*retu = true;
+		return (0);
+	}
+
+	if (newval & EFER_FFXSR) {
+		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
+			goto gpf;
+	}
+
+	if (newval & EFER_TCE) {
+		if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
+			goto gpf;
+	}
+
+	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
+	KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
+	return (0);
+gpf:
+	vm_inject_gp(sc->vm, vcpu);
+	return (0);
+}
+/*
+ * Dispatch a guest wrmsr of 'val' to MSR 'num': local APIC MSRs go to
+ * lapic_wrmsr(), EFER to svm_write_efer() and everything else to
+ * svm_wrmsr().  Returns the handler's result.
+ */
+static int
+emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
+    bool *retu)
+{
+	int error;
+
+	if (lapic_msr(num))
+		error = lapic_wrmsr(sc->vm, vcpu, num, val, retu);
+	else if (num == MSR_EFER)
+		error = svm_write_efer(sc, vcpu, val, retu);
+	else
+		error = svm_wrmsr(sc, vcpu, num, val, retu);
+
+	return (error);
+}
+/*
+ * Dispatch a guest rdmsr of MSR 'num' to lapic_rdmsr() or svm_rdmsr().  On
+ * success the 64-bit result is split into the guest's %rax (low 32 bits)
+ * and %rdx (high 32 bits).  Returns the handler's result.
+ */
+static int
+emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
+{
+	struct vmcb_state *state;
+	struct svm_regctx *ctx;
+	uint64_t result;
+	int error;
+
+	if (lapic_msr(num))
+		error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu);
+	else
+		error = svm_rdmsr(sc, vcpu, num, &result, retu);
+
+	if (error == 0) {
+		state = svm_get_vmcb_state(sc, vcpu);
+		ctx = svm_get_guest_regctx(sc, vcpu);
+		state->rax = result & 0xffffffff;
+		ctx->sctx_rdx = result >> 32;
+	}
+
+	return (error);
+}
+/*
+ * Short name of a #VMEXIT code for trace messages (KTR only).  Codes
+ * without a name are formatted into a function-static buffer, which later
+ * calls overwrite.
+ */
+#ifdef KTR
+static const char *
+exit_reason_to_str(uint64_t reason)
+{
+	static char reasonbuf[32];
+
+	switch (reason) {
+	case VMCB_EXIT_INVALID:
+		return ("invalvmcb");
+	case VMCB_EXIT_SHUTDOWN:
+		return ("shutdown");
+	case VMCB_EXIT_NPF:
+		return ("nptfault");
+	case VMCB_EXIT_PAUSE:
+		return ("pause");
+	case VMCB_EXIT_HLT:
+		return ("hlt");
+	case VMCB_EXIT_CPUID:
+		return ("cpuid");
+	case VMCB_EXIT_IO:
+		return ("inout");
+	case VMCB_EXIT_MC:
+		return ("mchk");
+	case VMCB_EXIT_INTR:
+		return ("extintr");
+	case VMCB_EXIT_NMI:
+		return ("nmi");
+	case VMCB_EXIT_VINTR:
+		return ("vintr");
+	case VMCB_EXIT_MSR:
+		return ("msr");
+	case VMCB_EXIT_IRET:
+		return ("iret");
+	case VMCB_EXIT_MONITOR:
+		return ("monitor");
+	case VMCB_EXIT_MWAIT:
+		return ("mwait");
+	default:
+		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
+		return (reasonbuf);
+	}
+}
+#endif	/* KTR */
+
+/*
+ * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
+ * that are due to instruction intercepts as well as MSR and IOIO intercepts
+ * and exceptions caused by INT3, INTO and BOUND instructions.
+ *
+ * Return 1 if the nRIP is valid and 0 otherwise.
+ */
+static int
+nrip_valid(uint64_t exitcode)
+{
+	switch (exitcode) {
+	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
+	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
+	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
+	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
+	case 0x43:		/* INT3 */
+	case 0x44:		/* INTO */
+	case 0x45:		/* BOUND */
+	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
+	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
+		return (1);
+	default:
+		return (0);
+	}
+}
+/*
+ * Process the #VMEXIT recorded in the vcpu's VMCB.
+ *
+ * Returns non-zero when the exit was fully handled here; in that case the
+ * guest %rip is advanced by the instruction length and written back to the
+ * VMCB.  Returns 0 when the exit is not handled here; 'vmexit' then carries
+ * either a specific exit code with its collateral or, if no case claimed
+ * the exit, a generic VM_EXITCODE_SVM record.
+ */
+static int
+svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
+{
+	struct vmcb *vmcb;
+	struct vmcb_state *state;
+	struct vmcb_ctrl *ctrl;
+	struct svm_regctx *ctx;
+	uint64_t code, info1, info2, val;
+	uint32_t eax, ecx, edx;
+	int error, errcode_valid, handled, idtvec, reflect;
+	bool retu;
+
+	ctx = svm_get_guest_regctx(svm_sc, vcpu);
+	vmcb = svm_get_vmcb(svm_sc, vcpu);
+	state = &vmcb->state;
+	ctrl = &vmcb->ctrl;
+
+	handled = 0;
+	code = ctrl->exitcode;
+	info1 = ctrl->exitinfo1;
+	info2 = ctrl->exitinfo2;
+	/* The instruction length is known only when nRIP was saved. */
+	vmexit->exitcode = VM_EXITCODE_BOGUS;
+	vmexit->rip = state->rip;
+	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
+
+	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
+
+	/*
+	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
+	 * in an inconsistent state and can trigger assertions that would
+	 * never happen otherwise.
+	 */
+	if (code == VMCB_EXIT_INVALID) {
+		vm_exit_svm(vmexit, code, info1, info2);
+		return (0);
+	}
+
+	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
+	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));
+
+	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
+	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
+	    vmexit->inst_length, code, info1, info2));
+
+	svm_update_virqinfo(svm_sc, vcpu);
+	svm_save_intinfo(svm_sc, vcpu);
+
+	switch (code) {
+	case VMCB_EXIT_IRET:
+		/*
+		 * Restart execution at "iret" but with the intercept cleared.
+		 */
+		vmexit->inst_length = 0;
+		clear_nmi_blocking(svm_sc, vcpu);
+		handled = 1;
+		break;
+	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
+		handled = 1;
+		break;
+	case VMCB_EXIT_INTR:	/* external interrupt */
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
+		handled = 1;
+		break;
+	case VMCB_EXIT_NMI:	/* external NMI */
+		handled = 1;
+		break;
+	case 0x40 ... 0x5F:	/* exception intercepts; vector = code - 0x40 */
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
+		reflect = 1;
+		idtvec = code - 0x40;
+		switch (idtvec) {
+		case IDT_MC:
+			/*
+			 * Call the machine check handler by hand. Also don't
+			 * reflect the machine check back into the guest.
+			 */
+			reflect = 0;
+			VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
+			__asm __volatile("int $18");
+			break;
+		case IDT_PF:
+			error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
+			    info2);
+			KASSERT(error == 0, ("%s: error %d updating cr2",
+			    __func__, error));
+			/* fallthru */
+		case IDT_NP:
+		case IDT_SS:
+		case IDT_GP:
+		case IDT_AC:
+		case IDT_TS:
+			errcode_valid = 1;
+			break;
+
+		case IDT_DF:
+			errcode_valid = 1;
+			info1 = 0;
+			break;
+
+		case IDT_BP:
+		case IDT_OF:
+		case IDT_BR:
+			/*
+			 * The 'nrip' field is populated for INT3, INTO and
+			 * BOUND exceptions and this also implies that
+			 * 'inst_length' is non-zero.
+			 *
+			 * Reset 'inst_length' to zero so the guest %rip at
+			 * event injection is identical to what it was when
+			 * the exception originally happened.
+			 */
+			VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
+			    "to zero before injecting exception %d",
+			    vmexit->inst_length, idtvec);
+			vmexit->inst_length = 0;
+			/* fallthru */
+		default:
+			errcode_valid = 0;
+			info1 = 0;
+			break;
+		}
+		KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
+		    "when reflecting exception %d into guest",
+		    vmexit->inst_length, idtvec));
+
+		if (reflect) {
+			/* Reflect the exception back into the guest */
+			VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
+			    "%d/%#x into the guest", idtvec, (int)info1);
+			error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
+			    errcode_valid, info1, 0);
+			KASSERT(error == 0, ("%s: vm_inject_exception error %d",
+			    __func__, error));
+		}
+		handled = 1;
+		break;
+	case VMCB_EXIT_MSR:	/* MSR access. */
+		eax = state->rax;
+		ecx = ctx->sctx_rcx;
+		edx = ctx->sctx_rdx;
+		retu = false;	
+		/* Non-zero EXITINFO1 is treated as wrmsr, zero as rdmsr. */
+		if (info1) {
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
+			val = (uint64_t)edx << 32 | eax;
+			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
+			    ecx, val);
+			if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
+				vmexit->exitcode = VM_EXITCODE_WRMSR;
+				vmexit->u.msr.code = ecx;
+				vmexit->u.msr.wval = val;
+			} else if (!retu) {
+				handled = 1;
+			} else {
+				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+				    ("emulate_wrmsr retu with bogus exitcode"));
+			}
+		} else {
+			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
+			if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
+				vmexit->exitcode = VM_EXITCODE_RDMSR;
+				vmexit->u.msr.code = ecx;
+			} else if (!retu) {
+				handled = 1;
+			} else {
+				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+				    ("emulate_rdmsr retu with bogus exitcode"));
+			}
+		}
+		break;
+	case VMCB_EXIT_IO:
+		handled = svm_handle_io(svm_sc, vcpu, vmexit);
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
+		break;
+	case VMCB_EXIT_CPUID:
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
+		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
+		    (uint32_t *)&state->rax,
+		    (uint32_t *)&ctx->sctx_rbx,
+		    (uint32_t *)&ctx->sctx_rcx,
+		    (uint32_t *)&ctx->sctx_rdx);
+		break;
+	case VMCB_EXIT_HLT:
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
+		vmexit->exitcode = VM_EXITCODE_HLT;
+		vmexit->u.hlt.rflags = state->rflags;
+		break;
+	case VMCB_EXIT_PAUSE:
+		vmexit->exitcode = VM_EXITCODE_PAUSE;
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
+		break;
+	case VMCB_EXIT_NPF:
+		/* EXITINFO2 contains the faulting guest physical address */
+		if (info1 & VMCB_NPF_INFO1_RSV) {
+			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
+			    "reserved bits set: info1(%#lx) info2(%#lx)",
+			    info1, info2);
+		} else if (vm_mem_allocated(svm_sc->vm, info2)) {
+			vmexit->exitcode = VM_EXITCODE_PAGING;
+			vmexit->u.paging.gpa = info2;
+			vmexit->u.paging.fault_type = npf_fault_type(info1);
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
+			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
+			    "on gpa %#lx/%#lx at rip %#lx",
+			    info2, info1, state->rip);
+		} else if (svm_npf_emul_fault(info1)) {
+			svm_handle_inst_emul(vmcb, info2, vmexit);
+			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
+			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
+			    "for gpa %#lx/%#lx at rip %#lx",
+			    info2, info1, state->rip);
+		}
+		break;
+	case VMCB_EXIT_MONITOR:
+		vmexit->exitcode = VM_EXITCODE_MONITOR;
+		break;
+	case VMCB_EXIT_MWAIT:
+		vmexit->exitcode = VM_EXITCODE_MWAIT;
+		break;
+	default:
+		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
+		break;
+	}	
+
+	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
+	    handled ? "handled" : "unhandled", exit_reason_to_str(code),
+	    vmexit->rip, vmexit->inst_length);
+
+	if (handled) {
+		vmexit->rip += vmexit->inst_length;
+		vmexit->inst_length = 0;
+		state->rip = vmexit->rip;
+	} else {
+		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+			/*
+			 * If this VM exit was not claimed by anybody then
+			 * treat it as a generic SVM exit.
+			 */
+			vm_exit_svm(vmexit, code, info1, info2);
+		} else {
+			/*
+			 * The exitcode and collateral have been populated.
+			 * The VM exit will be processed further in userland.
+			 */
+		}
+	}
+	return (handled);
+}
+/*
+ * If vm_entry_intinfo() reports an event to deliver on entry, queue it in
+ * the VMCB with svm_eventinject() and bump the VCPU_INTINFO_INJECTED
+ * statistic.
+ */
+static void
+svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
+{
+	uint64_t intinfo;
+
+	if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
+		return;
+
+	KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
+	    "valid: %#lx", __func__, intinfo));
+
+	svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
+		VMCB_EXITINTINFO_VECTOR(intinfo),
+		VMCB_EXITINTINFO_EC(intinfo),
+		VMCB_EXITINTINFO_EC_VALID(intinfo));
+	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
+	VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
+}
+
+/*
+ * Inject event to virtual cpu.
+ */ +static void +svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_vcpu *vcpustate; + uint8_t v_tpr; + int vector, need_intr_window, pending_apic_vector; + + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vcpustate = svm_get_vcpu(sc, vcpu); + + need_intr_window = 0; + pending_apic_vector = 0; + + if (vcpustate->nextrip != state->rip) { + ctrl->intr_shadow = 0; + VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpustate->nextrip, state->rip); + } + + /* + * Inject pending events or exceptions for this vcpu. + * + * An event might be pending because the previous #VMEXIT happened + * during event delivery (i.e. ctrl->exitintinfo). + * + * An event might also be pending because an exception was injected + * by the hypervisor (e.g. #PF during instruction emulation). + */ + svm_inj_intinfo(sc, vcpu); + + /* NMI event has priority over interrupts. */ + if (vm_nmi_pending(sc->vm, vcpu)) { + if (nmi_blocked(sc, vcpu)) { + /* + * Can't inject another NMI if the guest has not + * yet executed an "iret" after the last NMI. + */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " + "to NMI-blocking"); + } else if (ctrl->intr_shadow) { + /* + * Can't inject an NMI if the vcpu is in an intr_shadow. + */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " + "interrupt shadow"); + need_intr_window = 1; + goto done; + } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + /* + * If there is already an exception/interrupt pending + * then defer the NMI until after that. + */ + VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " + "eventinj %#lx", ctrl->eventinj); + + /* + * Use self-IPI to trigger a VM-exit as soon as + * possible after the event injection is completed. + * + * This works only if the external interrupt exiting + * is at a lower priority than the event injection. 
+ * + * Although not explicitly specified in APMv2 the + * relative priorities were verified empirically. + */ + ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ + } else { + vm_nmi_clear(sc->vm, vcpu); + + /* Inject NMI, vector number is not used */ + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, + IDT_NMI, 0, false); + + /* virtual NMI blocking is now in effect */ + enable_nmi_blocking(sc, vcpu); + + VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); + } + } + + if (!vm_extint_pending(sc->vm, vcpu)) { + /* + * APIC interrupts are delivered using the V_IRQ offload. + * + * The primary benefit is that the hypervisor doesn't need to + * deal with the various conditions that inhibit interrupts. + * It also means that TPR changes via CR8 will be handled + * without any hypervisor involvement. + * + * Note that the APIC vector must remain pending in the vIRR + * until it is confirmed that it was delivered to the guest. + * This can be confirmed based on the value of V_IRQ at the + * next #VMEXIT (1 = pending, 0 = delivered). + * + * Also note that it is possible that another higher priority + * vector can become pending before this vector is delivered + * to the guest. This is alright because vcpu_notify_event() + * will send an IPI and force the vcpu to trap back into the + * hypervisor. The higher priority vector will be injected on + * the next VMRUN. + */ + if (vlapic_pending_intr(vlapic, &vector)) { + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + pending_apic_vector = vector; + } + goto done; + } + + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(sc->vm, &vector); + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", + vector)); + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. 
+ */ + if ((state->rflags & PSL_I) == 0) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, state->rflags); + need_intr_window = 1; + goto done; + } + + if (ctrl->intr_shadow) { + VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " + "interrupt shadow", vector); + need_intr_window = 1; + goto done; + } + + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "eventinj %#lx", vector, ctrl->eventinj); + need_intr_window = 1; + goto done; + } + + /* + * Legacy PIC interrupts are delivered via the event injection + * mechanism. + */ + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); + + vm_extint_clear(sc->vm, vcpu); + vatpic_intr_accepted(sc->vm, vector); + + /* + * Force a VM-exit as soon as the vcpu is ready to accept another + * interrupt. This is done because the PIC might have another vector + * that it wants to inject. Also, if the APIC has a pending interrupt + * that was preempted by the ExtInt then it allows us to inject the + * APIC vector as soon as possible. + */ + need_intr_window = 1; +done: + /* + * The guest can modify the TPR by writing to %CR8. In guest mode + * the processor reflects this write to V_TPR without hypervisor + * intervention. + * + * The guest can also modify the TPR by writing to it via the memory + * mapped APIC page. In this case, the write will be emulated by the + * hypervisor. For this reason V_TPR must be updated before every + * VMRUN. + */ + v_tpr = vlapic_get_cr8(vlapic); + KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); + if (ctrl->v_tpr != v_tpr) { + VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", + ctrl->v_tpr, v_tpr); + ctrl->v_tpr = v_tpr; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + } + + if (pending_apic_vector) { + /* + * If an APIC vector is being injected then interrupt window + * exiting is not possible on this VMRUN. 
+		 */
+		KASSERT(!need_intr_window, ("intr_window exiting impossible"));
+		VCPU_CTR1(sc->vm, vcpu, "Injecting vector %d using V_IRQ",
+		    pending_apic_vector);
+
+		ctrl->v_irq = 1;
+		ctrl->v_ign_tpr = 0;
+		ctrl->v_intr_vector = pending_apic_vector;
+		ctrl->v_intr_prio = pending_apic_vector >> 4;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	} else if (need_intr_window) {
+		/*
+		 * We use V_IRQ in conjunction with the VINTR intercept to
+		 * trap into the hypervisor as soon as a virtual interrupt
+		 * can be delivered.
+		 *
+		 * Since injected events are not subject to intercept checks
+		 * we need to ensure that the V_IRQ is not actually going to
+		 * be delivered on VM entry. The KASSERT below enforces this.
+		 */
+		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
+		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
+		    ("Bogus intr_window_exiting: eventinj (%#lx), "
+		    "intr_shadow (%u), rflags (%#lx)",
+		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
+		enable_intr_window_exiting(sc, vcpu);
+	} else {
+		disable_intr_window_exiting(sc, vcpu);
+	}
+}
+
+static __inline void
+restore_host_tss(void)
+{
+	struct system_segment_descriptor *tss_sd;
+
+	/*
+	 * The TSS descriptor was in use prior to launching the guest so it
+	 * has been marked busy.
+	 *
+	 * 'ltr' requires the descriptor to be marked available so change the
+	 * type to "64-bit available TSS".
+	 */
+	tss_sd = PCPU_GET(tss);
+	tss_sd->sd_type = SDT_SYSTSS;
+	ltr(GSEL(GPROC0_SEL, SEL_KPL));
+}
+
+static void
+check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
+{
+	struct svm_vcpu *vcpustate;
+	struct vmcb_ctrl *ctrl;
+	long eptgen;
+	bool alloc_asid;
+
+	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
+	    "active on cpu %u", __func__, thiscpu));
+
+	vcpustate = svm_get_vcpu(sc, vcpuid);
+	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
+
+	/*
+	 * The TLB entries associated with the vcpu's ASID are not valid
+	 * if either of the following conditions is true:
+	 *
+	 * 1. The vcpu's ASID generation is different than the host cpu's
+	 *    ASID generation. This happens when the vcpu migrates to a new
+	 *    host cpu. It can also happen when the number of vcpus executing
+	 *    on a host cpu is greater than the number of ASIDs available.
+	 *
+	 * 2. The pmap generation number is different than the value cached in
+	 *    the 'vcpustate'. This happens when the host invalidates pages
+	 *    belonging to the guest.
+	 *
+	 *	asidgen		eptgen	      Action
+	 *	mismatch	mismatch
+	 *	   0		   0		(a)
+	 *	   0		   1		(b1) or (b2)
+	 *	   1		   0		(c)
+	 *	   1		   1		(d)
+	 *
+	 * (a) There is no mismatch in eptgen or ASID generation and therefore
+	 *     no further action is needed.
+	 *
+	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
+	 *      retained and the TLB entries associated with this ASID
+	 *      are flushed by VMRUN.
+	 *
+	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
+	 *      allocated.
+	 *
+	 * (c) A new ASID is allocated.
+	 *
+	 * (d) A new ASID is allocated.
+	 */
+
+	alloc_asid = false;
+	eptgen = pmap->pm_eptgen;
+	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
+
+	if (vcpustate->asid.gen != asid[thiscpu].gen) {
+		alloc_asid = true;	/* (c) and (d) */
+	} else if (vcpustate->eptgen != eptgen) {
+		if (flush_by_asid())
+			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
+		else
+			alloc_asid = true;			/* (b2) */
+	} else {
+		/*
+		 * This is the common case (a).
+		 */
+		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
+		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
+		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
+	}
+
+	if (alloc_asid) {
+		if (++asid[thiscpu].num >= nasid) {
+			asid[thiscpu].num = 1;
+			if (++asid[thiscpu].gen == 0)
+				asid[thiscpu].gen = 1;
+			/*
+			 * If this cpu does not support "flush-by-asid"
+			 * then flush the entire TLB on a generation
+			 * bump. Subsequent ASID allocation in this
+			 * generation can be done without a TLB flush.
+			 */
+			if (!flush_by_asid())
+				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
+		}
+		vcpustate->asid.gen = asid[thiscpu].gen;
+		vcpustate->asid.num = asid[thiscpu].num;
+
+		ctrl->asid = vcpustate->asid.num;
+		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
+		/*
+		 * If this cpu supports "flush-by-asid" then the TLB
+		 * was not flushed after the generation bump. The TLB
+		 * is flushed selectively after every new ASID allocation.
+		 */
+		if (flush_by_asid())
+			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
+	}
+	vcpustate->eptgen = eptgen;
+
+	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
+	KASSERT(ctrl->asid == vcpustate->asid.num,
+	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
+}
+
+static __inline void
+disable_gintr(void)
+{
+
+	__asm __volatile("clgi");
+}
+
+static __inline void
+enable_gintr(void)
+{
+
+	__asm __volatile("stgi");
+}
+
+/*
+ * Start vcpu with specified RIP.
+ */
+static int
+svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
+	void *rend_cookie, void *suspended_cookie)
+{
+	struct svm_regctx *gctx;
+	struct svm_softc *svm_sc;
+	struct svm_vcpu *vcpustate;
+	struct vmcb_state *state;
+	struct vmcb_ctrl *ctrl;
+	struct vm_exit *vmexit;
+	struct vlapic *vlapic;
+	struct vm *vm;
+	uint64_t vmcb_pa;
+	u_int thiscpu;
+	int handled;
+
+	svm_sc = arg;
+	vm = svm_sc->vm;
+
+	vcpustate = svm_get_vcpu(svm_sc, vcpu);
+	state = svm_get_vmcb_state(svm_sc, vcpu);
+	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
+	vmexit = vm_exitinfo(vm, vcpu);
+	vlapic = vm_lapic(vm, vcpu);
+
+	/*
+	 * Stash 'curcpu' on the stack as 'thiscpu'.
+	 *
+	 * The per-cpu data area is not accessible until MSR_GSBASE is restored
+	 * after the #VMEXIT. Since VMRUN is executed inside a critical section
+	 * 'curcpu' and 'thiscpu' are guaranteed to be identical.
+	 */
+	thiscpu = curcpu;
+
+	gctx = svm_get_guest_regctx(svm_sc, vcpu);
+	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
+
+	if (vcpustate->lastcpu != thiscpu) {
+		/*
+		 * Force new ASID allocation by invalidating the generation.
+		 */
+		vcpustate->asid.gen = 0;
+
+		/*
+		 * Invalidate the VMCB state cache by marking all fields dirty.
+		 */
+		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
+
+		/*
+		 * XXX
+		 * Setting 'vcpustate->lastcpu' here is a bit premature because
+		 * we may return from this function without actually executing
+		 * the VMRUN instruction. This could happen if a rendezvous
+		 * or an AST is pending on the first time through the loop.
+		 *
+		 * This works for now but any new side-effects of vcpu
+		 * migration should take this case into account.
+		 */
+		vcpustate->lastcpu = thiscpu;
+		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
+	}
+
+	svm_msr_guest_enter(svm_sc, vcpu);
+
+	/* Update Guest RIP */
+	state->rip = rip;
+
+	do {
+		/*
+		 * Disable global interrupts to guarantee atomicity during
+		 * loading of guest state. This includes not only the state
+		 * loaded by the "vmrun" instruction but also software state
+		 * maintained by the hypervisor: suspended and rendezvous
+		 * state, NPT generation number, vlapic interrupts etc.
+		 */
+		disable_gintr();
+
+		if (vcpu_suspended(suspended_cookie)) {
+			enable_gintr();
+			vm_exit_suspended(vm, vcpu, state->rip);
+			break;
+		}
+
+		if (vcpu_rendezvous_pending(rend_cookie)) {
+			enable_gintr();
+			vm_exit_rendezvous(vm, vcpu, state->rip);
+			break;
+		}
+
+		/* The scheduler has asked us to give up the cpu. */
+		if (vcpu_should_yield(vm, vcpu)) {
+			enable_gintr();
+			vm_exit_astpending(vm, vcpu, state->rip);
+			break;
+		}
+
+		svm_inj_interrupts(svm_sc, vcpu, vlapic);
+
+		/* Activate the nested pmap on 'thiscpu' */
+		CPU_SET_ATOMIC_ACQ(thiscpu, &pmap->pm_active);
+
+		/*
+		 * Check the pmap generation and the ASID generation to
+		 * ensure that the vcpu does not use stale TLB mappings.
+		 */
+		check_asid(svm_sc, vcpu, pmap, thiscpu);
+
+		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
+		vcpustate->dirty = 0;
+		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
+
+		/* Launch Virtual Machine. */
+		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
+		svm_launch(vmcb_pa, gctx);
+
+		CPU_CLR_ATOMIC(thiscpu, &pmap->pm_active);
+
+		/*
+		 * Restore MSR_GSBASE to point to the pcpu data area.
+		 *
+		 * Note that accesses done via PCPU_GET/PCPU_SET will work
+		 * only after MSR_GSBASE is restored.
+		 *
+		 * Also note that we don't bother restoring MSR_KGSBASE
+		 * since it is not used in the kernel and will be restored
+		 * when the VMRUN ioctl returns to userspace.
+		 */
+		wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[thiscpu]);
+		KASSERT(curcpu == thiscpu, ("thiscpu/curcpu (%u/%u) mismatch",
+		    thiscpu, curcpu));
+
+		/*
+		 * The host GDTR and IDTR are saved by VMRUN and restored
+		 * automatically on #VMEXIT. However, the host TSS needs
+		 * to be restored explicitly.
+		 */
+		restore_host_tss();
+
+		/* #VMEXIT disables interrupts so re-enable them here. */
+		enable_gintr();
+
+		/* Update 'nextrip' */
+		vcpustate->nextrip = state->rip;
+
+		/* Handle #VMEXIT and if required return to user space. */
+		handled = svm_vmexit(svm_sc, vcpu, vmexit);
+	} while (handled);
+
+	svm_msr_guest_exit(svm_sc, vcpu);
+
+	return (0);
+}
+
+static void
+svm_vmcleanup(void *arg)
+{
+	struct svm_softc *sc = arg;
+
+	free(sc, M_SVM);
+}
+
+static register_t *
+swctx_regptr(struct svm_regctx *regctx, int reg)
+{
+
+	switch (reg) {
+	case VM_REG_GUEST_RBX:
+		return (&regctx->sctx_rbx);
+	case VM_REG_GUEST_RCX:
+		return (&regctx->sctx_rcx);
+	case VM_REG_GUEST_RDX:
+		return (&regctx->sctx_rdx);
+	case VM_REG_GUEST_RDI:
+		return (&regctx->sctx_rdi);
+	case VM_REG_GUEST_RSI:
+		return (&regctx->sctx_rsi);
+	case VM_REG_GUEST_RBP:
+		return (&regctx->sctx_rbp);
+	case VM_REG_GUEST_R8:
+		return (&regctx->sctx_r8);
+	case VM_REG_GUEST_R9:
+		return (&regctx->sctx_r9);
+	case VM_REG_GUEST_R10:
+		return (&regctx->sctx_r10);
+	case VM_REG_GUEST_R11:
+		return (&regctx->sctx_r11);
+	case VM_REG_GUEST_R12:
+		return (&regctx->sctx_r12);
+	case VM_REG_GUEST_R13:
+		return (&regctx->sctx_r13);
+	case VM_REG_GUEST_R14:
+		return (&regctx->sctx_r14);
+	case VM_REG_GUEST_R15:
+		return (&regctx->sctx_r15);
+	default:
+		return (NULL);
+	}
+}
+
+static int
+svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
+{
+	struct svm_softc *svm_sc;
+	register_t *reg;
+
+	svm_sc = arg;
+
+	if (ident == VM_REG_GUEST_INTR_SHADOW) {
+		return (svm_get_intr_shadow(svm_sc, vcpu, val));
+	}
+
+	if (vmcb_read(svm_sc, vcpu, ident, val) == 0) {
+		return (0);
+	}
+
+	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
+
+	if (reg != NULL) {
+		*val = *reg;
+		return (0);
+	}
+
+	VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident);
+	return (EINVAL);
+}
+
+static int
+svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
+{
+	struct svm_softc *svm_sc;
+	register_t *reg;
+
+	svm_sc = arg;
+
+	if (ident == VM_REG_GUEST_INTR_SHADOW) {
+		return (svm_modify_intr_shadow(svm_sc, vcpu, val));
+	}
+
+	if (vmcb_write(svm_sc, vcpu, ident, val) == 0) {
+		return (0);
+	}
+
+	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
+
+	if (reg != NULL) {
+		*reg = val;
+		return (0);
+	}
+
+	/*
+	 * XXX deal with CR3 and invalidate TLB entries tagged with the
+	 * vcpu's ASID. This needs to be treated differently depending on
+	 * whether 'running' is true/false.
+	 */
+
+	VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident);
+	return (EINVAL);
+}
+
+static int
+svm_setcap(void *arg, int vcpu, int type, int val)
+{
+	struct svm_softc *sc;
+	int error;
+
+	sc = arg;
+	error = 0;
+	switch (type) {
+	case VM_CAP_HALT_EXIT:
+		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_HLT, val);
+		break;
+	case VM_CAP_PAUSE_EXIT:
+		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_PAUSE, val);
+		break;
+	case VM_CAP_UNRESTRICTED_GUEST:
+		/* Unrestricted guest execution cannot be disabled in SVM */
+		if (val == 0)
+			error = EINVAL;
+		break;
+	default:
+		error = ENOENT;
+		break;
+	}
+	return (error);
+}
+
+static int
+svm_getcap(void *arg, int vcpu, int type, int *retval)
+{
+	struct svm_softc *sc;
+	int error;
+
+	sc = arg;
+	error = 0;
+
+	switch (type) {
+	case VM_CAP_HALT_EXIT:
+		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_HLT);
+		break;
+	case VM_CAP_PAUSE_EXIT:
+		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+		    VMCB_INTCPT_PAUSE);
+		break;
+	case VM_CAP_UNRESTRICTED_GUEST:
+		*retval = 1;	/* unrestricted guest is always enabled */
+		break;
+	default:
+		error = ENOENT;
+		break;
+	}
+	return (error);
+}
+
+static struct vlapic *
+svm_vlapic_init(void *arg, int vcpuid)
+{
+	struct svm_softc *svm_sc;
+	struct vlapic *vlapic;
+
+	svm_sc = arg;
+	vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
+	vlapic->vm = svm_sc->vm;
+	vlapic->vcpuid = vcpuid;
+	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
+
+	vlapic_init(vlapic);
+
+	return (vlapic);
+}
+
+static void
+svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
+{
+
+	vlapic_cleanup(vlapic);
+	free(vlapic, M_SVM_VLAPIC);
+}
+
+struct vmm_ops vmm_ops_amd = {
+	svm_init,
+	svm_cleanup,
+	svm_restore,
+	svm_vminit,
+	svm_vmrun,
+	svm_vmcleanup,
+	svm_getreg,
+	svm_setreg,
+	vmcb_getdesc,
+	vmcb_setdesc,
+	svm_getcap,
+	svm_setcap,
+	svm_npt_alloc,
+	svm_npt_free,
+	svm_vlapic_init,
+	svm_vlapic_cleanup
+};
diff --git a/vmm/amd/svm.h b/vmm/amd/svm.h
new file mode 100644
index 0000000..86bd638
--- /dev/null
+++ b/vmm/amd/svm.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SVM_H_
+#define _SVM_H_
+
+/*
+ * Guest register state that is saved outside the VMCB.
+ */
+struct svm_regctx {
+	register_t	sctx_rbp;
+	register_t	sctx_rbx;
+	register_t	sctx_rcx;
+	register_t	sctx_rdx;
+	register_t	sctx_rdi;
+	register_t	sctx_rsi;
+	register_t	sctx_r8;
+	register_t	sctx_r9;
+	register_t	sctx_r10;
+	register_t	sctx_r11;
+	register_t	sctx_r12;
+	register_t	sctx_r13;
+	register_t	sctx_r14;
+	register_t	sctx_r15;
+};
+
+void svm_launch(uint64_t pa, struct svm_regctx *); /* pa: VMCB phys addr */
+
+#endif /* _SVM_H_ */
diff --git a/vmm/amd/svm_genassym.c b/vmm/amd/svm_genassym.c
new file mode 100644
index 0000000..b7831eb
--- /dev/null
+++ b/vmm/amd/svm_genassym.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+
+#include "svm.h"
+
+ASSYM(SCTX_RBX, offsetof(struct svm_regctx, sctx_rbx)); /* used by svm_launch */
+ASSYM(SCTX_RCX, offsetof(struct svm_regctx, sctx_rcx));
+ASSYM(SCTX_RBP, offsetof(struct svm_regctx, sctx_rbp));
+ASSYM(SCTX_RDX, offsetof(struct svm_regctx, sctx_rdx));
+ASSYM(SCTX_RDI, offsetof(struct svm_regctx, sctx_rdi));
+ASSYM(SCTX_RSI, offsetof(struct svm_regctx, sctx_rsi));
+ASSYM(SCTX_R8, offsetof(struct svm_regctx, sctx_r8));
+ASSYM(SCTX_R9, offsetof(struct svm_regctx, sctx_r9));
+ASSYM(SCTX_R10, offsetof(struct svm_regctx, sctx_r10));
+ASSYM(SCTX_R11, offsetof(struct svm_regctx, sctx_r11));
+ASSYM(SCTX_R12, offsetof(struct svm_regctx, sctx_r12));
+ASSYM(SCTX_R13, offsetof(struct svm_regctx, sctx_r13));
+ASSYM(SCTX_R14, offsetof(struct svm_regctx, sctx_r14));
+ASSYM(SCTX_R15, offsetof(struct svm_regctx, sctx_r15));
diff --git a/vmm/amd/svm_msr.c b/vmm/amd/svm_msr.c
new file mode 100644
index 0000000..088751a
--- /dev/null
+++ b/vmm/amd/svm_msr.c
@@ -0,0 +1,165 @@
+/*-
+ * Copyright (c) 2014, Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "svm.h"
+#include "vmcb.h"
+#include "svm_softc.h"
+#include "svm_msr.h"
+
+#ifndef MSR_AMDK8_IPM
+#define	MSR_AMDK8_IPM	0xc0010055
+#endif
+
+enum {
+	IDX_MSR_LSTAR,
+	IDX_MSR_CSTAR,
+	IDX_MSR_STAR,
+	IDX_MSR_SF_MASK,
+	HOST_MSR_NUM		/* must be the last enumeration */
+};
+
+static uint64_t host_msrs[HOST_MSR_NUM];
+
+void
+svm_msr_init(void)
+{
+	/*
+	 * It is safe to cache the values of the following MSRs because they
+	 * don't change based on curcpu, curproc or curthread.
+	 */
+	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+}
+
+void
+svm_msr_guest_init(struct svm_softc *sc, int vcpu)
+{
+	/*
+	 * All the MSRs accessible to the guest are either saved/restored by
+	 * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored
+	 * by VMSAVE/VMLOAD (e.g., MSR_GSBASE).
+	 *
+	 * There are no guest MSRs that are saved/restored "by hand" so nothing
+	 * more to do here.
+	 */
+	return;
+}
+
+void
+svm_msr_guest_enter(struct svm_softc *sc, int vcpu)
+{
+	/*
+	 * Save host MSRs (if any) and restore guest MSRs (if any).
+	 */
+}
+
+void
+svm_msr_guest_exit(struct svm_softc *sc, int vcpu)
+{
+	/*
+	 * Save guest MSRs (if any) and restore the host MSRs cached at init.
+	 */
+	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
+	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
+	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
+	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
+
+	/* MSR_KGSBASE will be restored on the way back to userspace */
+}
+
+int
+svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result,
+    bool *retu)
+{
+	int error = 0;
+
+	switch (num) {
+	case MSR_MCG_CAP:
+	case MSR_MCG_STATUS:
+		*result = 0;
+		break;
+	case MSR_MTRRcap:
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:	/* 8 fixed 4K MTRRs */
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+	case MSR_SYSCFG:
+		*result = 0;
+		break;
+	case MSR_AMDK8_IPM:
+		*result = 0;
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return (error);
+}
+
+int
+svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu)
+{
+	int error = 0;
+
+	switch (num) {
+	case MSR_MCG_CAP:
+	case MSR_MCG_STATUS:
+		break;		/* ignore writes */
+	case MSR_MTRRcap:
+		vm_inject_gp(sc->vm, vcpu);
+		break;
+	case MSR_MTRRdefType:
+	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:	/* 8 fixed 4K MTRRs */
+	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+	case MSR_MTRR64kBase:
+	case MSR_SYSCFG:
+		break;		/* Ignore writes */
+	case MSR_AMDK8_IPM:
+		/*
+		 * Ignore writes to the "Interrupt Pending Message" MSR.
+		 */
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return (error);
+}
diff --git a/vmm/amd/svm_msr.h b/vmm/amd/svm_msr.h
new file mode 100644
index 0000000..07716c8
--- /dev/null
+++ b/vmm/amd/svm_msr.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2014 Neel Natu (neel@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_MSR_H_ +#define _SVM_MSR_H_ + +struct svm_softc; + +void svm_msr_init(void); +void svm_msr_guest_init(struct svm_softc *sc, int vcpu); +void svm_msr_guest_enter(struct svm_softc *sc, int vcpu); +void svm_msr_guest_exit(struct svm_softc *sc, int vcpu); + +int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu); +int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu); + +#endif /* _SVM_MSR_H_ */ diff --git a/vmm/amd/svm_softc.h b/vmm/amd/svm_softc.h new file mode 100644 index 0000000..de0c3f7 --- /dev/null +++ b/vmm/amd/svm_softc.h @@ -0,0 +1,114 @@ +/*- + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SVM_SOFTC_H_
+#define _SVM_SOFTC_H_
+
+#define SVM_IO_BITMAP_SIZE	(3 * PAGE_SIZE)
+#define SVM_MSR_BITMAP_SIZE	(2 * PAGE_SIZE)
+
+struct asid {
+	uint64_t	gen;	/* range is [1, ~0UL] */
+	uint32_t	num;	/* range is [1, nasid - 1] */
+};
+
+/*
+ * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space
+ * due to VMCB alignment requirements.
+ */
+struct svm_vcpu {
+	struct vmcb	vmcb;	 /* hardware saved vcpu context */
+	struct svm_regctx swctx; /* software saved vcpu context */
+	uint64_t	vmcb_pa; /* VMCB physical address */
+	uint64_t	nextrip; /* next instruction to be executed by guest */
+        int		lastcpu; /* host cpu that the vcpu last ran on */
+	uint32_t	dirty;	 /* bits removed from vmcb_clean on next VMRUN */
+	long		eptgen;	 /* pmap->pm_eptgen when the vcpu last ran */
+	struct asid	asid;	 /* ASID number/generation held by this vcpu */
+} __aligned(PAGE_SIZE);
+
+/*
+ * SVM softc, one per virtual machine.
+ */
+struct svm_softc {
+	uint8_t iopm_bitmap[SVM_IO_BITMAP_SIZE];    /* shared by all vcpus */
+	uint8_t msr_bitmap[SVM_MSR_BITMAP_SIZE];    /* shared by all vcpus */
+	uint8_t apic_page[VM_MAXCPU][PAGE_SIZE];    /* one APIC page per vcpu */
+	struct svm_vcpu vcpu[VM_MAXCPU];	    /* per-vcpu state */
+	vm_offset_t 	nptp;			    /* nested page table */
+	struct vm	*vm;			    /* VM this softc belongs to */
+} __aligned(PAGE_SIZE);
+
+CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0);
+
+static __inline struct svm_vcpu *
+svm_get_vcpu(struct svm_softc *sc, int vcpu)
+{
+
+	return (&(sc->vcpu[vcpu]));
+}
+
+static __inline struct vmcb *
+svm_get_vmcb(struct svm_softc *sc, int vcpu)
+{
+
+	return (&(sc->vcpu[vcpu].vmcb));
+}
+
+static __inline struct vmcb_state *
+svm_get_vmcb_state(struct svm_softc *sc, int vcpu)
+{
+
+	return (&(sc->vcpu[vcpu].vmcb.state));
+}
+
+static __inline struct vmcb_ctrl *
+svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu)
+{
+
+	return (&(sc->vcpu[vcpu].vmcb.ctrl));
+}
+
+static __inline struct svm_regctx *
+svm_get_guest_regctx(struct svm_softc *sc, int vcpu)
+{
+
+	return (&(sc->vcpu[vcpu].swctx));
+}
+
+static __inline void
+svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits)
+{
+        struct svm_vcpu *vcpustate;
+
+        vcpustate = svm_get_vcpu(sc, vcpu);
+
+        vcpustate->dirty |= dirtybits;	/* accumulate; never clears bits */
+}
+
+#endif /* _SVM_SOFTC_H_ */
diff --git a/vmm/amd/svm_support.S b/vmm/amd/svm_support.S
new file mode 100644
index 0000000..b363101
--- /dev/null
+++ b/vmm/amd/svm_support.S
@@ -0,0 +1,121 @@
+/*-
+ * Copyright (c) 2013,
Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#include
+
+#include "svm_assym.h"
+
+/*
+ * Be friendly to DTrace FBT's prologue/epilogue pattern matching.
+ *
+ * VENTER/VLEAVE also save/restore the host %rbp across VMRUN.
+ */
+#define	VENTER  push %rbp ; mov %rsp,%rbp
+#define	VLEAVE  pop %rbp
+
+#define	VMLOAD	.byte 0x0f, 0x01, 0xda
+#define	VMRUN	.byte 0x0f, 0x01, 0xd8
+#define	VMSAVE	.byte 0x0f, 0x01, 0xdb
+
+/*
+ * svm_launch(uint64_t vmcb, struct svm_regctx *gctx)
+ * %rdi: physical address of VMCB
+ * %rsi: pointer to guest context
+ */
+ENTRY(svm_launch)
+	VENTER
+
+	/*
+	 * Host register state saved across a VMRUN.
+	 *
+	 * All "callee saved registers" except:
+	 * %rsp: because it is preserved by the processor across VMRUN.
+	 * %rbp: because it is saved/restored by the function prologue/epilogue.
+	 */
+	push %rbx
+	push %r12
+	push %r13
+	push %r14
+	push %r15
+
+	/* Save the physical address of the VMCB in %rax */
+	movq %rdi, %rax
+
+	push %rsi		/* push guest context pointer on the stack */
+
+	/*
+	 * Restore guest state.
+	 */
+	movq SCTX_R8(%rsi), %r8
+	movq SCTX_R9(%rsi), %r9
+	movq SCTX_R10(%rsi), %r10
+	movq SCTX_R11(%rsi), %r11
+	movq SCTX_R12(%rsi), %r12
+	movq SCTX_R13(%rsi), %r13
+	movq SCTX_R14(%rsi), %r14
+	movq SCTX_R15(%rsi), %r15
+	movq SCTX_RBP(%rsi), %rbp
+	movq SCTX_RBX(%rsi), %rbx
+	movq SCTX_RCX(%rsi), %rcx
+	movq SCTX_RDX(%rsi), %rdx
+	movq SCTX_RDI(%rsi), %rdi
+	movq SCTX_RSI(%rsi), %rsi	/* %rsi must be restored last */
+
+	VMLOAD
+	VMRUN
+	VMSAVE
+
+	pop %rax		/* pop guest context pointer from the stack */
+
+	/*
+	 * Save guest state.
+	 */
+	movq %r8, SCTX_R8(%rax)
+	movq %r9, SCTX_R9(%rax)
+	movq %r10, SCTX_R10(%rax)
+	movq %r11, SCTX_R11(%rax)
+	movq %r12, SCTX_R12(%rax)
+	movq %r13, SCTX_R13(%rax)
+	movq %r14, SCTX_R14(%rax)
+	movq %r15, SCTX_R15(%rax)
+	movq %rbp, SCTX_RBP(%rax)
+	movq %rbx, SCTX_RBX(%rax)
+	movq %rcx, SCTX_RCX(%rax)
+	movq %rdx, SCTX_RDX(%rax)
+	movq %rdi, SCTX_RDI(%rax)
+	movq %rsi, SCTX_RSI(%rax)
+
+	/* Restore host state */
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+	pop %rbx
+
+	VLEAVE
+	ret
+END(svm_launch)
diff --git a/vmm/amd/vmcb.c b/vmm/amd/vmcb.c
new file mode 100644
index 0000000..d860169
--- /dev/null
+++ b/vmm/amd/vmcb.c
@@ -0,0 +1,442 @@
+/*-
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include "vmm_ktr.h" + +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" + +/* + * The VMCB aka Virtual Machine Control Block is a 4KB aligned page + * in memory that describes the virtual machine. + * + * The VMCB contains: + * - instructions or events in the guest to intercept + * - control bits that modify execution environment of the guest + * - guest processor state (e.g. general purpose registers) + */ + +/* + * Return VMCB segment area. 
+ */
+static struct vmcb_segment *
+vmcb_segptr(struct vmcb *vmcb, int type)
+{
+	struct vmcb_state *state;
+	struct vmcb_segment *seg;
+
+	state = &vmcb->state;
+
+	switch (type) {
+	case VM_REG_GUEST_CS:
+		seg = &state->cs;
+		break;
+
+	case VM_REG_GUEST_DS:
+		seg = &state->ds;
+		break;
+
+	case VM_REG_GUEST_ES:
+		seg = &state->es;
+		break;
+
+	case VM_REG_GUEST_FS:
+		seg = &state->fs;
+		break;
+
+	case VM_REG_GUEST_GS:
+		seg = &state->gs;
+		break;
+
+	case VM_REG_GUEST_SS:
+		seg = &state->ss;
+		break;
+
+	case VM_REG_GUEST_GDTR:
+		seg = &state->gdt;
+		break;
+
+	case VM_REG_GUEST_IDTR:
+		seg = &state->idt;
+		break;
+
+	case VM_REG_GUEST_LDTR:
+		seg = &state->ldt;
+		break;
+
+	case VM_REG_GUEST_TR:
+		seg = &state->tr;
+		break;
+
+	default:
+		seg = NULL;
+		break;
+	}
+
+	return (seg);
+}
+
+/*
+ * Copy a 2, 4 or 8 byte field between '*val' and the vcpu's VMCB.
+ *
+ * 'ident' carries the byte offset and the width of the field, unpacked with
+ * VMCB_ACCESS_OFFSET() and VMCB_ACCESS_BYTES().  'write' selects the
+ * direction: non-zero copies '*val' into the VMCB, zero copies the field
+ * out.  A read clears '*val' first so that the bytes not covered by a
+ * narrow field are zero.  A write marks all cached VMCB state dirty.
+ *
+ * Returns 0 on success, or EINVAL if the field would extend past the end of
+ * the VMCB or has an unsupported width.
+ */
+static int
+vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident,
+	uint64_t *val)
+{
+	struct vmcb *vmcb;
+	int off, bytes;
+	char *ptr;
+
+	vmcb = svm_get_vmcb(softc, vcpu);
+	off = VMCB_ACCESS_OFFSET(ident);
+	bytes = VMCB_ACCESS_BYTES(ident);
+
+	/*
+	 * The field occupies [off, off + bytes), so it is in bounds as long
+	 * as its end does not exceed the size of the VMCB.
+	 */
+	if ((off + bytes) > sizeof (struct vmcb))
+		return (EINVAL);
+
+	ptr = (char *)vmcb;
+
+	if (!write)
+		*val = 0;
+
+	switch (bytes) {
+	case 8:
+	case 4:
+	case 2:
+		if (write)
+			memcpy(ptr + off, val, bytes);
+		else
+			memcpy(val, ptr + off, bytes);
+		break;
+	default:
+		/* One conversion specifier for the one argument supplied. */
+		VCPU_CTR1(softc->vm, vcpu,
+		    "Invalid size %d for VMCB access", bytes);
+		return (EINVAL);
+	}
+
+	/* Invalidate all VMCB state cached by h/w. */
+	if (write)
+		svm_set_dirty(softc, vcpu, 0xffffffff);
+
+	return (0);
+}
+
+/*
+ * Read from segment selector, control and general purpose register of VMCB.
+ */
+int
+vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval)
+{
+	struct vmcb *vmcb;
+	struct vmcb_state *save;
+	struct vmcb_segment *segp;
+
+	vmcb = svm_get_vmcb(sc, vcpu);
+	save = &vmcb->state;
+
+	/* Identifiers that encode a raw offset and width bypass the map below. */
+	if (VMCB_ACCESS_OK(ident))
+		return (vmcb_access(sc, vcpu, 0, ident, retval));
+
+	switch (ident) {
+	case VM_REG_GUEST_CR0:
+		*retval = save->cr0;
+		return (0);
+	case VM_REG_GUEST_CR2:
+		*retval = save->cr2;
+		return (0);
+	case VM_REG_GUEST_CR3:
+		*retval = save->cr3;
+		return (0);
+	case VM_REG_GUEST_CR4:
+		*retval = save->cr4;
+		return (0);
+	case VM_REG_GUEST_DR7:
+		*retval = save->dr7;
+		return (0);
+	case VM_REG_GUEST_EFER:
+		*retval = save->efer;
+		return (0);
+	case VM_REG_GUEST_RAX:
+		*retval = save->rax;
+		return (0);
+	case VM_REG_GUEST_RFLAGS:
+		*retval = save->rflags;
+		return (0);
+	case VM_REG_GUEST_RIP:
+		*retval = save->rip;
+		return (0);
+	case VM_REG_GUEST_RSP:
+		*retval = save->rsp;
+		return (0);
+
+	/* Segment registers report their selector. */
+	case VM_REG_GUEST_CS:
+	case VM_REG_GUEST_DS:
+	case VM_REG_GUEST_ES:
+	case VM_REG_GUEST_FS:
+	case VM_REG_GUEST_GS:
+	case VM_REG_GUEST_SS:
+	case VM_REG_GUEST_LDTR:
+	case VM_REG_GUEST_TR:
+		segp = vmcb_segptr(vmcb, ident);
+		KASSERT(segp != NULL, ("%s: unable to get segment %d from VMCB",
+		    __func__, ident));
+		*retval = segp->selector;
+		return (0);
+
+	case VM_REG_GUEST_GDTR:
+	case VM_REG_GUEST_IDTR:
+		/* GDTR and IDTR don't have segment selectors */
+		break;
+	default:
+		break;
+	}
+
+	/* Every identifier that was not handled above is rejected. */
+	return (EINVAL);
+}
+
+/*
+ * Write to segment selector, control and general purpose register of VMCB.
+ */
+int
+vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
+{
+	struct vmcb *vmcb;
+	struct vmcb_state *state;
+	struct vmcb_segment *seg;
+	int err, dirtyseg;
+
+	vmcb = svm_get_vmcb(sc, vcpu);
+	state = &vmcb->state;
+	dirtyseg = 0;
+	err = 0;
+
+	/* Identifiers that encode a raw offset and width are written directly. */
+	if (VMCB_ACCESS_OK(ident))
+		return (vmcb_access(sc, vcpu, 1, ident, &val));
+
+	switch (ident) {
+	case VM_REG_GUEST_CR0:
+		state->cr0 = val;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
+		break;
+
+	case VM_REG_GUEST_CR2:
+		state->cr2 = val;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2);
+		break;
+
+	case VM_REG_GUEST_CR3:
+		state->cr3 = val;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
+		break;
+
+	case VM_REG_GUEST_CR4:
+		state->cr4 = val;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
+		break;
+
+	case VM_REG_GUEST_DR7:
+		state->dr7 = val;
+		/*
+		 * The debug registers have their own VMCB cache bit, so mark
+		 * it dirty like the other cached fields written here.
+		 */
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_DR);
+		break;
+
+	case VM_REG_GUEST_EFER:
+		/* EFER_SVM must always be set when the guest is executing */
+		state->efer = val | EFER_SVM;
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
+		break;
+
+	case VM_REG_GUEST_RAX:
+		state->rax = val;
+		break;
+
+	case VM_REG_GUEST_RFLAGS:
+		state->rflags = val;
+		break;
+
+	case VM_REG_GUEST_RIP:
+		state->rip = val;
+		break;
+
+	case VM_REG_GUEST_RSP:
+		state->rsp = val;
+		break;
+
+	case VM_REG_GUEST_CS:
+	case VM_REG_GUEST_DS:
+	case VM_REG_GUEST_ES:
+	case VM_REG_GUEST_SS:
+		/* Only these four selectors mark VMCB_CACHE_SEG dirty. */
+		dirtyseg = 1;		/* FALLTHROUGH */
+	case VM_REG_GUEST_FS:
+	case VM_REG_GUEST_GS:
+	case VM_REG_GUEST_LDTR:
+	case VM_REG_GUEST_TR:
+		seg = vmcb_segptr(vmcb, ident);
+		KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
+		    __func__, ident));
+		seg->selector = val;
+		if (dirtyseg)
+			svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
+		break;
+
+	case VM_REG_GUEST_GDTR:
+	case VM_REG_GUEST_IDTR:
+		/* GDTR and IDTR don't have segment selectors */
+		err = EINVAL;
+		break;
+	default:
+		err = EINVAL;
+		break;
+	}
+
+	return (err);
+}
+
+int
+vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2)
+{
+	struct vmcb_segment *seg;
+
+	seg = vmcb_segptr(vmcb, ident);
+	if
(seg != NULL) {
+		bcopy(seg, seg2, sizeof(struct vmcb_segment));
+		return (0);
+	} else {
+		return (EINVAL);
+	}
+}
+
+/*
+ * Store the base, limit and (for everything except GDTR/IDTR) the access
+ * rights from 'desc' into the VMCB segment identified by 'reg', then mark
+ * the matching cached VMCB state dirty.  Always returns 0.
+ */
+int
+vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+	struct svm_softc *sc;
+	struct vmcb_segment *seg;
+	uint16_t attrib;
+	int is_dtable;
+
+	sc = arg;
+	seg = vmcb_segptr(svm_get_vmcb(sc, vcpu), reg);
+	KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
+	    __func__, reg));
+
+	/* The descriptor table registers carry no attributes. */
+	is_dtable = (reg == VM_REG_GUEST_GDTR || reg == VM_REG_GUEST_IDTR);
+
+	seg->base = desc->base;
+	seg->limit = desc->limit;
+	if (!is_dtable) {
+		/*
+		 * Map seg_desc access to VMCB attribute format.
+		 *
+		 * SVM uses the 'P' bit in the segment attributes to indicate a
+		 * NULL segment so clear it if the segment is marked unusable.
+		 */
+		attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
+		if (SEG_DESC_UNUSABLE(desc->access))
+			attrib &= ~0x80;
+		seg->attrib = attrib;
+	}
+
+	VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), "
+	    "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib);
+
+	if (is_dtable) {
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
+	} else if (reg == VM_REG_GUEST_CS || reg == VM_REG_GUEST_DS ||
+	    reg == VM_REG_GUEST_ES || reg == VM_REG_GUEST_SS) {
+		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
+	}
+
+	return (0);
+}
+
+int
+vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+	struct vmcb *vmcb;
+	struct svm_softc *sc;
+	struct vmcb_segment *seg;
+
+	sc = arg;
+	vmcb = svm_get_vmcb(sc, vcpu);
+	seg = vmcb_segptr(vmcb, reg);
+	KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
+	    __func__, reg));
+
+	desc->base = seg->base;
+	desc->limit = seg->limit;
+	desc->access = 0;
+
+	if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) {
+		/* Map seg_desc access to VMCB attribute format */
+		desc->access = ((seg->attrib & 0xF00) << 4) |
+		    (seg->attrib & 0xFF);
+
+		/*
+		 * VT-x uses bit 16 to indicate a segment that has
been loaded + * with a NULL selector (aka unusable). The 'desc->access' + * field is interpreted in the VT-x format by the + * processor-independent code. + * + * SVM uses the 'P' bit to convey the same information so + * convert it into the VT-x format. For more details refer to + * section "Segment State in the VMCB" in APMv2. + */ + if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { + if ((desc->access & 0x80) == 0) + desc->access |= 0x10000; /* Unusable segment */ + } + } + + return (0); +} diff --git a/vmm/amd/vmcb.h b/vmm/amd/vmcb.h new file mode 100644 index 0000000..496f880 --- /dev/null +++ b/vmm/amd/vmcb.h @@ -0,0 +1,334 @@ +/*- + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMCB_H_
+#define _VMCB_H_
+
+struct svm_softc;
+
+/*
+ * Mask with only bit 'n' set.  The argument is parenthesized so that an
+ * expression such as BIT(a | b) shifts by the whole expression.
+ */
+#define BIT(n)			(1ULL << (n))
+
+/*
+ * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15
+ * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B
+ */
+
+/* vmcb_ctrl->intercept[] array indices */
+#define	VMCB_CR_INTCPT		0
+#define	VMCB_DR_INTCPT		1
+#define	VMCB_EXC_INTCPT		2
+#define	VMCB_CTRL1_INTCPT	3
+#define	VMCB_CTRL2_INTCPT	4
+
+/* intercept[VMCB_CTRL1_INTCPT] fields */
+#define	VMCB_INTCPT_INTR		BIT(0)
+#define	VMCB_INTCPT_NMI			BIT(1)
+#define	VMCB_INTCPT_SMI			BIT(2)
+#define	VMCB_INTCPT_INIT		BIT(3)
+#define	VMCB_INTCPT_VINTR		BIT(4)
+#define	VMCB_INTCPT_CR0_WRITE		BIT(5)
+#define	VMCB_INTCPT_IDTR_READ		BIT(6)
+#define	VMCB_INTCPT_GDTR_READ		BIT(7)
+#define	VMCB_INTCPT_LDTR_READ		BIT(8)
+#define	VMCB_INTCPT_TR_READ		BIT(9)
+#define	VMCB_INTCPT_IDTR_WRITE		BIT(10)
+#define	VMCB_INTCPT_GDTR_WRITE		BIT(11)
+#define	VMCB_INTCPT_LDTR_WRITE		BIT(12)
+#define	VMCB_INTCPT_TR_WRITE		BIT(13)
+#define	VMCB_INTCPT_RDTSC		BIT(14)
+#define	VMCB_INTCPT_RDPMC		BIT(15)
+#define	VMCB_INTCPT_PUSHF		BIT(16)
+#define	VMCB_INTCPT_POPF		BIT(17)
+#define	VMCB_INTCPT_CPUID		BIT(18)
+#define	VMCB_INTCPT_RSM			BIT(19)
+#define	VMCB_INTCPT_IRET		BIT(20)
+#define	VMCB_INTCPT_INTn		BIT(21)
+#define	VMCB_INTCPT_INVD		BIT(22)
+#define	VMCB_INTCPT_PAUSE		BIT(23)
+#define	VMCB_INTCPT_HLT			BIT(24)
+#define	VMCB_INTCPT_INVPG		BIT(25)
+#define	VMCB_INTCPT_INVPGA		BIT(26)
+#define	VMCB_INTCPT_IO			BIT(27)
+#define VMCB_INTCPT_MSR BIT(28) +#define VMCB_INTCPT_TASK_SWITCH BIT(29) +#define VMCB_INTCPT_FERR_FREEZE BIT(30) +#define VMCB_INTCPT_SHUTDOWN BIT(31) + +/* intercept[VMCB_CTRL2_INTCPT] fields */ +#define VMCB_INTCPT_VMRUN BIT(0) +#define VMCB_INTCPT_VMMCALL BIT(1) +#define VMCB_INTCPT_VMLOAD BIT(2) +#define VMCB_INTCPT_VMSAVE BIT(3) +#define VMCB_INTCPT_STGI BIT(4) +#define VMCB_INTCPT_CLGI BIT(5) +#define VMCB_INTCPT_SKINIT BIT(6) +#define VMCB_INTCPT_RDTSCP BIT(7) +#define VMCB_INTCPT_ICEBP BIT(8) +#define VMCB_INTCPT_WBINVD BIT(9) +#define VMCB_INTCPT_MONITOR BIT(10) +#define VMCB_INTCPT_MWAIT BIT(11) +#define VMCB_INTCPT_MWAIT_ARMED BIT(12) +#define VMCB_INTCPT_XSETBV BIT(13) + +/* VMCB TLB control */ +#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ +#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ +#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ +#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ + +/* VMCB state caching */ +#define VMCB_CACHE_NONE 0 /* No caching */ +#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ +#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ +#define VMCB_CACHE_ASID BIT(2) /* ASID */ +#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ +#define VMCB_CACHE_NP BIT(4) /* Nested Paging */ +#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ +#define VMCB_CACHE_DR BIT(6) /* Debug registers */ +#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ +#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ +#define VMCB_CACHE_CR2 BIT(9) /* page fault address */ +#define VMCB_CACHE_LBR BIT(10) /* Last branch */ + +/* VMCB control event injection */ +#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ +#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ + +/* Event types that can be injected */ +#define VMCB_EVENTINJ_TYPE_INTR 0 +#define VMCB_EVENTINJ_TYPE_NMI 2 +#define VMCB_EVENTINJ_TYPE_EXCEPTION 3 +#define VMCB_EVENTINJ_TYPE_INTn 4 + +/* VMCB exit code, APM 
vol2 Appendix C */ +#define VMCB_EXIT_MC 0x52 +#define VMCB_EXIT_INTR 0x60 +#define VMCB_EXIT_NMI 0x61 +#define VMCB_EXIT_VINTR 0x64 +#define VMCB_EXIT_PUSHF 0x70 +#define VMCB_EXIT_POPF 0x71 +#define VMCB_EXIT_CPUID 0x72 +#define VMCB_EXIT_IRET 0x74 +#define VMCB_EXIT_PAUSE 0x77 +#define VMCB_EXIT_HLT 0x78 +#define VMCB_EXIT_IO 0x7B +#define VMCB_EXIT_MSR 0x7C +#define VMCB_EXIT_SHUTDOWN 0x7F +#define VMCB_EXIT_VMSAVE 0x83 +#define VMCB_EXIT_MONITOR 0x8A +#define VMCB_EXIT_MWAIT 0x8B +#define VMCB_EXIT_NPF 0x400 +#define VMCB_EXIT_INVALID -1 + +/* + * Nested page fault. + * Bit definitions to decode EXITINFO1. + */ +#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ +#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ +#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ +#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ +#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ + +#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ +#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ + +/* + * EXITINTINFO, Interrupt exit info for all intrecepts. + * Section 15.7.2, Intercepts during IDT Interrupt Delivery. + */ +#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) +#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) +#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) +#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) +#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) + +/* Offset of various VMCB fields. 
*/ +#define VMCB_OFF_CTRL(x) (x) +#define VMCB_OFF_STATE(x) ((x) + 0x400) + +#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) +#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) +#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) +#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) +#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) +#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) +#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) +#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) +#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) +#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) +#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) +#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) +#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) +#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) +#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) +#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) +#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) +#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) +#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) +#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) +#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) +#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) +#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) +#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) + +/* + * Encode the VMCB offset and bytes that we want to read from VMCB. + */ +#define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \ + ((o) & 0xFFF)) +#define VMCB_ACCESS_OK(v) ((v) & 0x80000000 ) +#define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF) +#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) + +#ifdef _KERNEL +/* VMCB save state area segment format */ +struct vmcb_segment { + uint16_t selector; + uint16_t attrib; + uint32_t limit; + uint64_t base; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_segment) == 16); + +/* Code segment descriptor attribute in 12 bit format as saved by VMCB. */ +#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ +#define VMCB_CS_ATTRIB_D BIT(10) /* OPerand size bit. 
*/ + +/* + * The VMCB is divided into two areas - the first one contains various + * control bits including the intercept vector and the second one contains + * the guest state. + */ + +/* VMCB control area - padded up to 1024 bytes */ +struct vmcb_ctrl { + uint32_t intercept[5]; /* all intercepts */ + uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ + uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ + uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ + uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ + uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ + uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ + uint32_t asid; /* 0x58: Guest ASID */ + uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ + uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ + uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ + uint8_t v_irq:1; /* Is virtual interrupt pending? */ + uint8_t :7; /* Padding */ + uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ + uint8_t v_ign_tpr:1; + uint8_t :3; + uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ + uint8_t :7; + uint8_t v_intr_vector; /* 0x65: Vector for virtual interrupt. */ + uint8_t pad3[3]; /* Bit64-40 Reserved. */ + uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ + uint64_t :63; + uint64_t exitcode; /* 0x70, Exitcode */ + uint64_t exitinfo1; /* 0x78, EXITINFO1 */ + uint64_t exitinfo2; /* 0x80, EXITINFO2 */ + uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ + uint64_t np_enable:1; /* 0x90, Nested paging enable. */ + uint64_t :63; + uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ + uint64_t eventinj; /* 0xA8, Event injection. */ + uint64_t n_cr3; /* B0, Nested page table. */ + uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ + uint64_t :63; + uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ + uint32_t :32; /* 0xC4: Reserved */ + uint64_t nrip; /* 0xC8: Guest next nRIP. 
*/ + uint8_t inst_len; /* 0xD0: #NPF decode assist */ + uint8_t inst_bytes[15]; + uint8_t padd6[0x320]; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_ctrl) == 1024); + +struct vmcb_state { + struct vmcb_segment es; + struct vmcb_segment cs; + struct vmcb_segment ss; + struct vmcb_segment ds; + struct vmcb_segment fs; + struct vmcb_segment gs; + struct vmcb_segment gdt; + struct vmcb_segment ldt; + struct vmcb_segment idt; + struct vmcb_segment tr; + uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ + uint8_t cpl; + uint8_t pad2[4]; + uint64_t efer; + uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ + uint64_t cr4; + uint64_t cr3; /* Guest CR3 */ + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + uint64_t rflags; + uint64_t rip; + uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ + uint64_t rsp; + uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernelgsbase; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t pad6[0x20]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t int_from; + uint64_t int_to; + uint8_t pad7[0x968]; /* Reserved upto end of VMCB */ +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_state) == 0xC00); + +struct vmcb { + struct vmcb_ctrl ctrl; + struct vmcb_state state; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); +CTASSERT(offsetof(struct vmcb, state) == 0x400); + +int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); +int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); + +#endif /* _KERNEL */ +#endif /* _VMCB_H_ */ diff --git a/vmm/intel/ept.c b/vmm/intel/ept.c 
new file mode 100644 index 0000000..54320cb --- /dev/null +++ b/vmm/intel/ept.c @@ -0,0 +1,205 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "vmx_cpufunc.h" +#include "ept.h" + +#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) +#define EPT_PWL4(cap) ((cap) & (1UL << 6)) +#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) +#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ +#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ +#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) +#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) + +#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL +#define INVVPID_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK) + +#define INVEPT_ALL_TYPES_MASK 0x6000000UL +#define INVEPT_ALL_TYPES_SUPPORTED(cap) \ + (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) + +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPT_ENABLE_AD_BITS (1 << 6) + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL); + +static int ept_enable_ad_bits; + +static int ept_pmap_flags; +SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, + &ept_pmap_flags, 0, NULL); + +int +ept_init(int ipinum) +{ + int use_hw_ad_bits, use_superpages, use_exec_only; + uint64_t cap; + + cap = rdmsr(MSR_VMX_EPT_VPID_CAP); + + /* + * Verify that: + * - page walk length is 4 steps + * - extended page tables can be laid out in write-back memory + * - invvpid instruction with all possible types is supported + * - invept instruction with all possible types is supported + */ + if (!EPT_PWL4(cap) || + !EPT_MEMORY_TYPE_WB(cap) || + !INVVPID_SUPPORTED(cap) || + !INVVPID_ALL_TYPES_SUPPORTED(cap) || + !INVEPT_SUPPORTED(cap) || + !INVEPT_ALL_TYPES_SUPPORTED(cap)) + return (EINVAL); + + ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; + + use_superpages = 1; + 
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
+	if (use_superpages && EPT_PDE_SUPERPAGE(cap))
+		ept_pmap_flags |= PMAP_PDE_SUPERPAGE;	/* 2MB superpage */
+
+	use_hw_ad_bits = 1;
+	TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
+	if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
+		ept_enable_ad_bits = 1;
+	else
+		ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
+
+	use_exec_only = 1;
+	TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
+	if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
+		ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
+
+	return (0);
+}
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+	int i, t, tabs;
+	uint64_t *ptpnext, ptpval;
+
+	if (--nlevels < 0)
+		return;
+
+	tabs = 3 - nlevels;
+	for (t = 0; t < tabs; t++)
+		printf("\t");
+	printf("PTP = %p\n", ptp);
+
+	for (i = 0; i < 512; i++) {
+		ptpval = ptp[i];
+
+		if (ptpval == 0)
+			continue;
+
+		for (t = 0; t < tabs; t++)
+			printf("\t");
+		printf("%3d 0x%016lx\n", i, ptpval);
+
+		if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+			ptpnext = (uint64_t *)
+			    PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+			ept_dump(ptpnext, nlevels);
+		}
+	}
+}
+#endif
+
+/*
+ * smp_rendezvous() callback: issue a single-context INVEPT using the
+ * descriptor that 'arg' points to.
+ */
+static void
+invept_single_context(void *arg)
+{
+	struct invept_desc *descp;
+
+	descp = arg;
+	invept(INVEPT_TYPE_SINGLE_CONTEXT, *descp);
+}
+
+/*
+ * Run invept_single_context() through smp_rendezvous() with a descriptor
+ * that names 'eptp'; all other descriptor fields are zero.
+ */
+void
+ept_invalidate_mappings(u_long eptp)
+{
+	struct invept_desc invept_desc = { .eptp = eptp };
+
+	smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
+
+/* pmap initializer handed to vmspace_alloc(): an EPT pmap with our flags. */
+static int
+ept_pinit(pmap_t pmap)
+{
+	int error;
+
+	error = pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags);
+	return (error);
+}
+
+/* Allocate a vmspace for [min, max) whose pmap is set up by ept_pinit(). */
+struct vmspace *
+ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
+{
+	struct vmspace *vmspace;
+
+	vmspace = vmspace_alloc(min, max, ept_pinit);
+	return (vmspace);
+}
+
+/* Release a vmspace obtained from ept_vmspace_alloc(). */
+void
+ept_vmspace_free(struct vmspace *vmspace)
+{
+
+	vmspace_free(vmspace);
+}
+
+uint64_t
+eptp(uint64_t pml4)
+{
+	uint64_t eptp_val;
+
+	eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
+	if (ept_enable_ad_bits)
+		eptp_val
|= EPT_ENABLE_AD_BITS; + + return (eptp_val); +} diff --git a/vmm/intel/ept.h b/vmm/intel/ept.h new file mode 100644 index 0000000..1393e46 --- /dev/null +++ b/vmm/intel/ept.h @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _EPT_H_ +#define _EPT_H_ + +struct vmx; + +int ept_init(int ipinum); +void ept_invalidate_mappings(u_long eptp); +struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); +void ept_vmspace_free(struct vmspace *vmspace); +uint64_t eptp(uint64_t pml4); +#endif diff --git a/vmm/intel/vmcs.c b/vmm/intel/vmcs.c new file mode 100644 index 0000000..5962526 --- /dev/null +++ b/vmm/intel/vmcs.c @@ -0,0 +1,503 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_ddb.h"
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include "vmm_host.h"
+#include "vmx_cpufunc.h"
+#include "vmcs.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include
+#endif
+
+/*
+ * Pass a value destined for guest CR0 or CR4 through vmx_fix_cr0() or
+ * vmx_fix_cr4(); values for any other field are returned unchanged.
+ */
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+	if (encoding == VMCS_GUEST_CR0)
+		return (vmx_fix_cr0(val));
+	if (encoding == VMCS_GUEST_CR4)
+		return (vmx_fix_cr4(val));
+	return (val);
+}
+
+/*
+ * Translate a VM_REG_GUEST_* identifier into the VMCS field encoding that
+ * holds it.  Identifiers with no mapping yield (uint32_t)-1.
+ */
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+	uint32_t enc;
+
+	switch (ident) {
+	case VM_REG_GUEST_CR0:
+		enc = VMCS_GUEST_CR0;
+		break;
+	case VM_REG_GUEST_CR3:
+		enc = VMCS_GUEST_CR3;
+		break;
+	case VM_REG_GUEST_CR4:
+		enc = VMCS_GUEST_CR4;
+		break;
+	case VM_REG_GUEST_DR7:
+		enc = VMCS_GUEST_DR7;
+		break;
+	case VM_REG_GUEST_RSP:
+		enc = VMCS_GUEST_RSP;
+		break;
+	case VM_REG_GUEST_RIP:
+		enc = VMCS_GUEST_RIP;
+		break;
+	case VM_REG_GUEST_RFLAGS:
+		enc = VMCS_GUEST_RFLAGS;
+		break;
+	case VM_REG_GUEST_ES:
+		enc = VMCS_GUEST_ES_SELECTOR;
+		break;
+	case VM_REG_GUEST_CS:
+		enc = VMCS_GUEST_CS_SELECTOR;
+		break;
+	case VM_REG_GUEST_SS:
+		enc = VMCS_GUEST_SS_SELECTOR;
+		break;
+	case VM_REG_GUEST_DS:
+		enc = VMCS_GUEST_DS_SELECTOR;
+		break;
+	case VM_REG_GUEST_FS:
+		enc = VMCS_GUEST_FS_SELECTOR;
+		break;
+	case VM_REG_GUEST_GS:
+		enc = VMCS_GUEST_GS_SELECTOR;
+		break;
+	case VM_REG_GUEST_TR:
+		enc = VMCS_GUEST_TR_SELECTOR;
+		break;
+	case VM_REG_GUEST_LDTR:
+		enc = VMCS_GUEST_LDTR_SELECTOR;
+		break;
+	case VM_REG_GUEST_EFER:
+		enc = VMCS_GUEST_IA32_EFER;
+		break;
+	case VM_REG_GUEST_PDPTE0:
+		enc = VMCS_GUEST_PDPTE0;
+		break;
+	case VM_REG_GUEST_PDPTE1:
+		enc = VMCS_GUEST_PDPTE1;
+		break;
+	case VM_REG_GUEST_PDPTE2:
+		enc = VMCS_GUEST_PDPTE2;
+		break;
+	case VM_REG_GUEST_PDPTE3:
+		enc = VMCS_GUEST_PDPTE3;
+		break;
+	default:
+		enc = (uint32_t)-1;
+		break;
+	}
+
+	return (enc);
+}
+
+/*
+ * Look up the VMCS encodings of the base, limit and access rights fields
+ * for segment 'seg'.  IDTR and GDTR report VMCS_INVALID_ENCODING for the
+ * access rights.  Returns 0 on success; on an unknown segment returns
+ * EINVAL and leaves the output arguments untouched.
+ */
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+	uint32_t b, l, a;
+
+	switch (seg) {
+	case VM_REG_GUEST_ES:
+		b = VMCS_GUEST_ES_BASE;
+		l = VMCS_GUEST_ES_LIMIT;
+		a = VMCS_GUEST_ES_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_CS:
+		b = VMCS_GUEST_CS_BASE;
+		l = VMCS_GUEST_CS_LIMIT;
+		a = VMCS_GUEST_CS_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_SS:
+		b = VMCS_GUEST_SS_BASE;
+		l = VMCS_GUEST_SS_LIMIT;
+		a = VMCS_GUEST_SS_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_DS:
+		b = VMCS_GUEST_DS_BASE;
+		l = VMCS_GUEST_DS_LIMIT;
+		a = VMCS_GUEST_DS_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_FS:
+		b = VMCS_GUEST_FS_BASE;
+		l = VMCS_GUEST_FS_LIMIT;
+		a = VMCS_GUEST_FS_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_GS:
+		b = VMCS_GUEST_GS_BASE;
+		l = VMCS_GUEST_GS_LIMIT;
+		a = VMCS_GUEST_GS_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_TR:
+		b = VMCS_GUEST_TR_BASE;
+		l = VMCS_GUEST_TR_LIMIT;
+		a = VMCS_GUEST_TR_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_LDTR:
+		b = VMCS_GUEST_LDTR_BASE;
+		l = VMCS_GUEST_LDTR_LIMIT;
+		a = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+		break;
+	case VM_REG_GUEST_IDTR:
+		b = VMCS_GUEST_IDTR_BASE;
+		l = VMCS_GUEST_IDTR_LIMIT;
+		a = VMCS_INVALID_ENCODING;
+		break;
+	case VM_REG_GUEST_GDTR:
+		b = VMCS_GUEST_GDTR_BASE;
+		l = VMCS_GUEST_GDTR_LIMIT;
+		a = VMCS_INVALID_ENCODING;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	*base = b;
+	*lim = l;
+	*acc = a;
+
+	return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval)
+{
+	int error;
+	uint32_t encoding;
+
+	/*
+	 * If we need to get at vmx-specific state in the VMCS we can bypass
+	 * the translation of 'ident' to 'encoding' by simply setting the
+	 * sign bit. As it so happens the upper 16 bits are reserved (i.e
+	 * set to 0) in the encodings for the VMCS so we are free to use the
+	 * sign bit.
+ */ + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + if (!running) + VMPTRLD(vmcs); + + error = vmread(encoding, retval); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + uint32_t encoding; + + if (ident < 0) + encoding = ident & 0x7fffffff; + else + encoding = vmcs_field_encoding(ident); + + if (encoding == (uint32_t)-1) + return (EINVAL); + + val = vmcs_fix_regval(encoding, val); + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(encoding, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_setdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmwrite(base, desc->base)) != 0) + goto done; + + if ((error = vmwrite(limit, desc->limit)) != 0) + goto done; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmwrite(access, desc->access)) != 0) + goto done; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc) +{ + int error; + uint32_t base, limit, access; + uint64_t u64; + + error = vmcs_seg_desc_encoding(seg, &base, &limit, &access); + if (error != 0) + panic("vmcs_getdesc: invalid segment register %d", seg); + + if (!running) + VMPTRLD(vmcs); + if ((error = vmread(base, &u64)) != 0) + goto done; + desc->base = u64; + + if ((error = vmread(limit, &u64)) != 0) + goto done; + desc->limit = u64; + + if (access != VMCS_INVALID_ENCODING) { + if ((error = vmread(access, &u64)) != 0) + goto done; + desc->access = u64; + } +done: + if (!running) + VMCLEAR(vmcs); + return (error); +} + 
+int +vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count) +{ + int error; + + VMPTRLD(vmcs); + + /* + * Guest MSRs are saved in the VM-exit MSR-store area. + * Guest MSRs are loaded from the VM-entry MSR-load area. + * Both areas point to the same location in memory. + */ + if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0) + goto done; + + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0) + goto done; + if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0) + goto done; + + error = 0; +done: + VMCLEAR(vmcs); + return (error); +} + +int +vmcs_init(struct vmcs *vmcs) +{ + int error, codesel, datasel, tsssel; + u_long cr0, cr4, efer; + uint64_t pat, fsbase, idtrbase; + + codesel = vmm_get_host_codesel(); + datasel = vmm_get_host_datasel(); + tsssel = vmm_get_host_tsssel(); + + /* + * Make sure we have a "current" VMCS to work with. + */ + VMPTRLD(vmcs); + + /* Host state */ + + /* Initialize host IA32_PAT MSR */ + pat = vmm_get_host_pat(); + if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0) + goto done; + + /* Load the IA32_EFER MSR */ + efer = vmm_get_host_efer(); + if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0) + goto done; + + /* Load the control registers */ + + cr0 = vmm_get_host_cr0(); + if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0) + goto done; + + cr4 = vmm_get_host_cr4() | CR4_VMXE; + if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0) + goto done; + + /* Load the segment selectors */ + if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0) + goto done; + + if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0) + goto done; 
#ifdef DDB
extern int vmxon_enabled[];

/*
 * DDB "show vmcs": dump a summary of the VMCS that is current on this CPU.
 *
 * Only the current VMCS is supported; supplying an address is rejected.
 * The command prints the VPID, guest activity state, exit (or entry
 * failure) reason, exit qualification, guest linear address, and then
 * reason-specific details: interruption information for exception and
 * external-interrupt exits, or the guest physical address for EPT exits.
 */
DB_SHOW_COMMAND(vmcs, db_show_vmcs)
{
	uint64_t cur_vmcs, val;
	uint32_t exit;

	if (!vmxon_enabled[curcpu]) {
		db_printf("VMX not enabled\n");
		return;
	}

	if (have_addr) {
		db_printf("Only current VMCS supported\n");
		return;
	}

	vmptrst(&cur_vmcs);
	if (cur_vmcs == VMCS_INITIAL) {
		db_printf("No current VM context\n");
		return;
	}
	/* %jx takes a uintmax_t; be explicit rather than rely on widths. */
	db_printf("VMCS: %jx\n", (uintmax_t)cur_vmcs);
	db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
	db_printf("Activity: ");
	val = vmcs_read(VMCS_GUEST_ACTIVITY);
	switch (val) {
	case 0:
		db_printf("Active");
		break;
	case 1:
		db_printf("HLT");
		break;
	case 2:
		db_printf("Shutdown");
		break;
	case 3:
		db_printf("Wait for SIPI");
		break;
	default:
		db_printf("Unknown: %#lx", val);
		break;
	}
	db_printf("\n");

	/* Bit 31 of the exit reason flags a VM-entry failure. */
	exit = vmcs_read(VMCS_EXIT_REASON);
	if (exit & 0x80000000)
		db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
	else
		db_printf("Exit Reason: %u\n", exit & 0xffff);
	db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
	db_printf("Guest Linear Address: %#lx\n",
	    vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));

	/*
	 * Keeping bit 31 in the switch value means entry failures never
	 * match any of the plain exit reasons below.
	 */
	switch (exit & 0x8000ffff) {
	case EXIT_REASON_EXCEPTION:
	case EXIT_REASON_EXT_INTR:
		val = vmcs_read(VMCS_EXIT_INTR_INFO);
		db_printf("Interrupt Type: ");
		/*
		 * Decode the interruption type with the VMCS_INTR_T_*
		 * constants so that the labels agree with vmcs.h: type 4 is
		 * a software interrupt, 5 a privileged software exception
		 * and 6 a software exception.
		 */
		switch (val & VMCS_INTR_T_MASK) {
		case VMCS_INTR_T_HWINTR:
			db_printf("external");
			break;
		case VMCS_INTR_T_NMI:
			db_printf("NMI");
			break;
		case VMCS_INTR_T_HWEXCEPTION:
			db_printf("HW exception");
			break;
		case VMCS_INTR_T_SWINTR:
			db_printf("SW interrupt");
			break;
		case VMCS_INTR_T_PRIV_SWEXCEPTION:
			db_printf("privileged SW exception");
			break;
		case VMCS_INTR_T_SWEXCEPTION:
			db_printf("SW exception");
			break;
		default:
			db_printf("?? %lu", (val & VMCS_INTR_T_MASK) >> 8);
			break;
		}
		db_printf(" Vector: %lu", val & 0xff);
		if (val & VMCS_INTR_DEL_ERRCODE)
			db_printf(" Error Code: %lx",
			    vmcs_read(VMCS_EXIT_INTR_ERRCODE));
		db_printf("\n");
		break;
	case EXIT_REASON_EPT_FAULT:
	case EXIT_REASON_EPT_MISCONFIG:
		db_printf("Guest Physical Address: %#lx\n",
		    vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
		break;
	}
	db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
}
#endif
/*
 * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h
 */
#ifdef _VMX_CPUFUNC_H_
/*
 * Read the VMCS field identified by 'encoding' and return its value.
 * A vmread() failure trips the KASSERT.
 */
static __inline uint64_t
vmcs_read(uint32_t encoding)
{
	uint64_t value;
	int rc;

	rc = vmread(encoding, &value);
	KASSERT(rc == 0, ("vmcs_read(%u) error %d", encoding, rc));
	return (value);
}

/*
 * Write 'val' to the VMCS field identified by 'encoding'.
 * A vmwrite() failure trips the KASSERT.
 */
static __inline void
vmcs_write(uint32_t encoding, uint64_t val)
{
	int rc;

	rc = vmwrite(encoding, val);
	KASSERT(rc == 0, ("vmcs_write(%u) error %d", encoding, rc));
}
#endif	/* _VMX_CPUFUNC_H_ */
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP) +#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR) +#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff) +#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION) +#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3) +#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS) +#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS) +#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) +#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) + +#endif /* _KERNEL */ + +#define VMCS_INITIAL 0xffffffffffffffff + +#define VMCS_IDENT(encoding) ((encoding) | 0x80000000) +/* + * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B. + */ +#define VMCS_INVALID_ENCODING 0xffffffff + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 + +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 + +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C + +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE 0x00002006 +#define VMCS_EXIT_MSR_LOAD 0x00002008 +#define VMCS_ENTRY_MSR_LOAD 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_TSC_OFFSET 
0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_EPTP 0x0000201A +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2) + +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 + +/* 64-bit host-state fields */ +#define VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 + +/* 32-bit control fields */ +#define VMCS_PIN_BASED_CTLS 0x00004000 +#define VMCS_PRI_PROC_BASED_CTLS 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 + +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define 
VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E + +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E + +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 + +/* Natural Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E + +/* Natural Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A + +/* Natural Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 
0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 + +/* Natural Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* + * VM instruction error numbers + */ +#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5 + +/* + * VMCS exit reasons + */ +#define EXIT_REASON_EXCEPTION 0 +#define EXIT_REASON_EXT_INTR 1 +#define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT 3 +#define EXIT_REASON_SIPI 4 +#define EXIT_REASON_IO_SMI 5 +#define EXIT_REASON_SMI 6 +#define EXIT_REASON_INTR_WINDOW 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_GETSEC 11 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_RSM 17 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define 
EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMXOFF 26 +#define EXIT_REASON_VMXON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_INOUT 30 +#define EXIT_REASON_RDMSR 31 +#define EXIT_REASON_WRMSR 32 +#define EXIT_REASON_INVAL_VMCS 33 +#define EXIT_REASON_INVAL_MSR 34 +#define EXIT_REASON_MWAIT 36 +#define EXIT_REASON_MTF 37 +#define EXIT_REASON_MONITOR 39 +#define EXIT_REASON_PAUSE 40 +#define EXIT_REASON_MCE_DURING_ENTRY 41 +#define EXIT_REASON_TPR 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_VIRTUALIZED_EOI 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 +#define EXIT_REASON_EPT_FAULT 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_RDTSCP 51 +#define EXIT_REASON_VMX_PREEMPT 52 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_APIC_WRITE 56 + +/* + * NMI unblocking due to IRET. + * + * Applies to VM-exits due to hardware exception or EPT fault. 
+ */ +#define EXIT_QUAL_NMIUDTI (1 << 12) +/* + * VMCS interrupt information fields + */ +#define VMCS_INTR_VALID (1U << 31) +#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */ +#define VMCS_INTR_T_HWINTR (0 << 8) +#define VMCS_INTR_T_NMI (2 << 8) +#define VMCS_INTR_T_HWEXCEPTION (3 << 8) +#define VMCS_INTR_T_SWINTR (4 << 8) +#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8) +#define VMCS_INTR_T_SWEXCEPTION (6 << 8) +#define VMCS_INTR_DEL_ERRCODE (1 << 11) + +/* + * VMCS IDT-Vectoring information fields + */ +#define VMCS_IDT_VEC_VALID (1U << 31) +#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11) + +/* + * VMCS Guest interruptibility field + */ +#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0) +#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1) +#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2) +#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3) + +/* + * Exit qualification for EXIT_REASON_INVAL_VMCS + */ +#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3 + +/* + * Exit qualification for EPT violation + */ +#define EPT_VIOLATION_DATA_READ (1UL << 0) +#define EPT_VIOLATION_DATA_WRITE (1UL << 1) +#define EPT_VIOLATION_INST_FETCH (1UL << 2) +#define EPT_VIOLATION_GPA_READABLE (1UL << 3) +#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4) +#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5) +#define EPT_VIOLATION_GLA_VALID (1UL << 7) +#define EPT_VIOLATION_XLAT_VALID (1UL << 8) + +/* + * Exit qualification for APIC-access VM exit + */ +#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF) +#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF) + +/* + * Exit qualification for APIC-write VM exit + */ +#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF) + +#endif diff --git a/vmm/intel/vmx.c b/vmm/intel/vmx.c new file mode 100644 index 0000000..03d755c --- /dev/null +++ b/vmm/intel/vmx.c @@ -0,0 +1,3416 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "vmm_lapic.h" +#include "vmm_host.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_stat.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "ept.h" +#include "vmx_cpufunc.h" +#include "vmx.h" +#include "vmx_msr.h" +#include "x86.h" +#include "vmx_controls.h" + +#define PINBASED_CTLS_ONE_SETTING \ + (PINBASED_EXTINT_EXITING | \ + PINBASED_NMI_EXITING | \ + PINBASED_VIRTUAL_NMI) +#define PINBASED_CTLS_ZERO_SETTING 0 + +#define PROCBASED_CTLS_WINDOW_SETTING \ + (PROCBASED_INT_WINDOW_EXITING | \ + PROCBASED_NMI_WINDOW_EXITING) + +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#define PROCBASED_CTLS_ZERO_SETTING \ + (PROCBASED_CR3_LOAD_EXITING | \ + PROCBASED_CR3_STORE_EXITING | \ + PROCBASED_IO_BITMAPS) + +#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT +#define PROCBASED_CTLS2_ZERO_SETTING 0 + +#define VM_EXIT_CTLS_ONE_SETTING \ + (VM_EXIT_HOST_LMA | \ + VM_EXIT_SAVE_EFER | \ + VM_EXIT_LOAD_EFER | \ + VM_EXIT_ACKNOWLEDGE_INTERRUPT) + +#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS + +#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER) + +#define VM_ENTRY_CTLS_ZERO_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_INTO_SMM | \ + VM_ENTRY_DEACTIVATE_DUAL_MONITOR) + +#define HANDLED 1 +#define UNHANDLED 0 + +static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); +static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, 
NULL, NULL); + +int vmxon_enabled[MAXCPU]; +static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); + +static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; +static uint32_t exit_ctls, entry_ctls; + +static uint64_t cr0_ones_mask, cr0_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, + &cr0_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, + &cr0_zeros_mask, 0, NULL); + +static uint64_t cr4_ones_mask, cr4_zeros_mask; +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, + &cr4_ones_mask, 0, NULL); +SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, + &cr4_zeros_mask, 0, NULL); + +static int vmx_initialized; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, + &vmx_initialized, 0, "Intel VMX initialized"); + +/* + * Optional capabilities + */ +static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + +static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, + "HLT triggers a VM-exit"); + +static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, + 0, "PAUSE triggers a VM-exit"); + +static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, + &cap_unrestricted_guest, 0, "Unrestricted guests"); + +static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, + &cap_monitor_trap, 0, "Monitor trap flag"); + +static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, + 0, "Guests are allowed to use INVPCID"); + +static int virtual_interrupt_delivery; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, + &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); + +static int posted_interrupts; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, + &posted_interrupts, 0, 
#ifdef KTR
/*
 * Translate a VM-exit reason number into a short name for trace output.
 *
 * Known reasons return a pointer to a string literal. Any other value is
 * formatted as a decimal number into a function-local static buffer, so
 * that result is overwritten by the next call that takes the fallback path.
 */
static const char *
exit_reason_to_str(int reason)
{
	/* Indexed by exit reason; gaps are left NULL. */
	static const char * const reason_names[] = {
		[EXIT_REASON_EXCEPTION] = "exception",
		[EXIT_REASON_EXT_INTR] = "extint",
		[EXIT_REASON_TRIPLE_FAULT] = "triplefault",
		[EXIT_REASON_INIT] = "init",
		[EXIT_REASON_SIPI] = "sipi",
		[EXIT_REASON_IO_SMI] = "iosmi",
		[EXIT_REASON_SMI] = "smi",
		[EXIT_REASON_INTR_WINDOW] = "intrwindow",
		[EXIT_REASON_NMI_WINDOW] = "nmiwindow",
		[EXIT_REASON_TASK_SWITCH] = "taskswitch",
		[EXIT_REASON_CPUID] = "cpuid",
		[EXIT_REASON_GETSEC] = "getsec",
		[EXIT_REASON_HLT] = "hlt",
		[EXIT_REASON_INVD] = "invd",
		[EXIT_REASON_INVLPG] = "invlpg",
		[EXIT_REASON_RDPMC] = "rdpmc",
		[EXIT_REASON_RDTSC] = "rdtsc",
		[EXIT_REASON_RSM] = "rsm",
		[EXIT_REASON_VMCALL] = "vmcall",
		[EXIT_REASON_VMCLEAR] = "vmclear",
		[EXIT_REASON_VMLAUNCH] = "vmlaunch",
		[EXIT_REASON_VMPTRLD] = "vmptrld",
		[EXIT_REASON_VMPTRST] = "vmptrst",
		[EXIT_REASON_VMREAD] = "vmread",
		[EXIT_REASON_VMRESUME] = "vmresume",
		[EXIT_REASON_VMWRITE] = "vmwrite",
		[EXIT_REASON_VMXOFF] = "vmxoff",
		[EXIT_REASON_VMXON] = "vmxon",
		[EXIT_REASON_CR_ACCESS] = "craccess",
		[EXIT_REASON_DR_ACCESS] = "draccess",
		[EXIT_REASON_INOUT] = "inout",
		[EXIT_REASON_RDMSR] = "rdmsr",
		[EXIT_REASON_WRMSR] = "wrmsr",
		[EXIT_REASON_INVAL_VMCS] = "invalvmcs",
		[EXIT_REASON_INVAL_MSR] = "invalmsr",
		[EXIT_REASON_MWAIT] = "mwait",
		[EXIT_REASON_MTF] = "mtf",
		[EXIT_REASON_MONITOR] = "monitor",
		[EXIT_REASON_PAUSE] = "pause",
		[EXIT_REASON_MCE_DURING_ENTRY] = "mce-during-entry",
		[EXIT_REASON_TPR] = "tpr",
		[EXIT_REASON_APIC_ACCESS] = "apic-access",
		/* Previously unnamed; reported as the bare number 45. */
		[EXIT_REASON_VIRTUALIZED_EOI] = "virtualized-eoi",
		[EXIT_REASON_GDTR_IDTR] = "gdtridtr",
		[EXIT_REASON_LDTR_TR] = "ldtrtr",
		[EXIT_REASON_EPT_FAULT] = "eptfault",
		[EXIT_REASON_EPT_MISCONFIG] = "eptmisconfig",
		[EXIT_REASON_INVEPT] = "invept",
		[EXIT_REASON_RDTSCP] = "rdtscp",
		[EXIT_REASON_VMX_PREEMPT] = "vmxpreempt",
		[EXIT_REASON_INVVPID] = "invvpid",
		[EXIT_REASON_WBINVD] = "wbinvd",
		[EXIT_REASON_XSETBV] = "xsetbv",
		[EXIT_REASON_APIC_WRITE] = "apic-write",
	};
	static char reasonbuf[32];

	if (reason >= 0 &&
	    (size_t)reason < sizeof(reason_names) / sizeof(reason_names[0]) &&
	    reason_names[reason] != NULL)
		return (reason_names[reason]);

	/* Unknown or out-of-range reason: fall back to the number. */
	snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
	return (reasonbuf);
}
#endif /* KTR */
+ */ + error += guest_msr_ro(vmx, MSR_APIC_ID); + error += guest_msr_ro(vmx, MSR_APIC_VERSION); + error += guest_msr_ro(vmx, MSR_APIC_LDR); + error += guest_msr_ro(vmx, MSR_APIC_SVR); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); + + for (i = 0; i < 8; i++) + error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); + + error += guest_msr_ro(vmx, MSR_APIC_ESR); + error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); + error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); + error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); + error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); + error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); + error += guest_msr_ro(vmx, MSR_APIC_ICR); + + /* + * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. + * + * These registers get special treatment described in the section + * "Virtualizing MSR-Based APIC Accesses". + */ + error += guest_msr_rw(vmx, MSR_APIC_TPR); + error += guest_msr_rw(vmx, MSR_APIC_EOI); + error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); + + return (error); +} + +u_long +vmx_fix_cr0(u_long cr0) +{ + + return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); +} + +u_long +vmx_fix_cr4(u_long cr4) +{ + + return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); +} + +static void +vpid_free(int vpid) +{ + if (vpid < 0 || vpid > 0xffff) + panic("vpid_free: invalid vpid %d", vpid); + + /* + * VPIDs [0,VM_MAXCPU] are special and are not allocated from + * the unit number allocator. 
+ */ + + if (vpid > VM_MAXCPU) + free_unr(vpid_unr, vpid); +} + +static void +vpid_alloc(uint16_t *vpid, int num) +{ + int i, x; + + if (num <= 0 || num > VM_MAXCPU) + panic("invalid number of vpids requested: %d", num); + + /* + * If the "enable vpid" execution control is not enabled then the + * VPID is required to be 0 for all vcpus. + */ + if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { + for (i = 0; i < num; i++) + vpid[i] = 0; + return; + } + + /* + * Allocate a unique VPID for each vcpu from the unit number allocator. + */ + for (i = 0; i < num; i++) { + x = alloc_unr(vpid_unr); + if (x == -1) + break; + else + vpid[i] = x; + } + + if (i < num) { + atomic_add_int(&vpid_alloc_failed, 1); + + /* + * If the unit number allocator does not have enough unique + * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. + * + * These VPIDs are not be unique across VMs but this does not + * affect correctness because the combined mappings are also + * tagged with the EP4TA which is unique for each VM. + * + * It is still sub-optimal because the invvpid will invalidate + * combined mappings for a particular VPID across all EP4TAs. + */ + while (i-- > 0) + vpid_free(vpid[i]); + + for (i = 0; i < num; i++) + vpid[i] = i + 1; + } +} + +static void +vpid_init(void) +{ + /* + * VPID 0 is required when the "enable VPID" execution control is + * disabled. + * + * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the + * unit number allocator does not have sufficient unique VPIDs to + * satisfy the allocation. + * + * The remaining VPIDs are managed by the unit number allocator. + */ + vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); +} + +static void +vmx_disable(void *arg __unused) +{ + struct invvpid_desc invvpid_desc = { 0 }; + struct invept_desc invept_desc = { 0 }; + + if (vmxon_enabled[curcpu]) { + /* + * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. + * + * VMXON or VMXOFF are not required to invalidate any TLB + * caching structures. 
This prevents potential retention of + * cached information in the TLB between distinct VMX episodes. + */ + invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); + invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); + vmxoff(); + } + load_cr4(rcr4() & ~CR4_VMXE); +} + +static int +vmx_cleanup(void) +{ + + if (pirvec >= 0) + lapic_ipi_free(pirvec); + + if (vpid_unr != NULL) { + delete_unrhdr(vpid_unr); + vpid_unr = NULL; + } + + smp_rendezvous(NULL, vmx_disable, NULL, NULL); + + return (0); +} + +static void +vmx_enable(void *arg __unused) +{ + int error; + uint64_t feature_control; + + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + wrmsr(MSR_IA32_FEATURE_CONTROL, + feature_control | IA32_FEATURE_CONTROL_VMX_EN | + IA32_FEATURE_CONTROL_LOCK); + } + + load_cr4(rcr4() | CR4_VMXE); + + *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); + error = vmxon(vmxon_region[curcpu]); + if (error == 0) + vmxon_enabled[curcpu] = 1; +} + +static void +vmx_restore(void) +{ + + if (vmxon_enabled[curcpu]) + vmxon(vmxon_region[curcpu]); +} + +static int +vmx_init(int ipinum) +{ + int error, use_tpr_shadow; + uint64_t basic, fixed0, fixed1, feature_control; + uint32_t tmp, procbased2_vid_bits; + + /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ + if (!(cpu_feature2 & CPUID2_VMX)) { + printf("vmx_init: processor does not support VMX operation\n"); + return (ENXIO); + } + + /* + * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits + * are set (bits 0 and 2 respectively). 
+ */ + feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && + (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { + printf("vmx_init: VMX operation disabled by BIOS\n"); + return (ENXIO); + } + + /* + * Verify capabilities MSR_VMX_BASIC: + * - bit 54 indicates support for INS/OUTS decoding + */ + basic = rdmsr(MSR_VMX_BASIC); + if ((basic & (1UL << 54)) == 0) { + printf("vmx_init: processor does not support desired basic " + "capabilities\n"); + return (EINVAL); + } + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired primary " + "processor-based controls\n"); + return (error); + } + + /* Clear the processor-based ctl bits that are set on demand */ + procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); + if (error) { + printf("vmx_init: processor does not support desired secondary " + "processor-based controls\n"); + return (error); + } + + /* Check support for VPID */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_ENABLE_VPID, 0, &tmp); + if (error == 0) + procbased_ctls2 |= PROCBASED2_ENABLE_VPID; + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, + PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "pin-based controls\n"); + return (error); + } + + /* Check support for VM-exit controls */ + error = 
vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, + VM_EXIT_CTLS_ZERO_SETTING, + &exit_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "exit controls\n"); + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, + &entry_ctls); + if (error) { + printf("vmx_init: processor does not support desired " + "entry controls\n"); + return (error); + } + + /* + * Check support for optional features by testing them + * as individual bits + */ + cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_HLT_EXITING, 0, + &tmp) == 0); + + cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_PROCBASED_CTLS, + PROCBASED_MTF, 0, + &tmp) == 0); + + cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, + PROCBASED_PAUSE_EXITING, 0, + &tmp) == 0); + + cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, + &tmp) == 0); + + cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, + &tmp) == 0); + + /* + * Check support for virtual interrupt delivery. 
+ */ + procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, + &tmp) == 0); + + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + procbased2_vid_bits, 0, &tmp); + if (error == 0 && use_tpr_shadow) { + virtual_interrupt_delivery = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", + &virtual_interrupt_delivery); + } + + if (virtual_interrupt_delivery) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls2 |= procbased2_vid_bits; + procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; + + /* + * No need to emulate accesses to %CR8 if virtual + * interrupt delivery is enabled. + */ + procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; + + /* + * Check for Posted Interrupts only if Virtual Interrupt + * Delivery is enabled. 
+ */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, + &tmp); + if (error == 0) { + pirvec = lapic_ipi_alloc(&IDTVEC(justreturn)); + if (pirvec < 0) { + if (bootverbose) { + printf("vmx_init: unable to allocate " + "posted interrupt vector\n"); + } + } else { + posted_interrupts = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", + &posted_interrupts); + } + } + } + + if (posted_interrupts) + pinbased_ctls |= PINBASED_POSTED_INTERRUPT; + + /* Initialize EPT */ + error = ept_init(ipinum); + if (error) { + printf("vmx_init: ept initialization failed (%d)\n", error); + return (error); + } + + /* + * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 + */ + fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); + cr0_ones_mask = fixed0 & fixed1; + cr0_zeros_mask = ~fixed0 & ~fixed1; + + /* + * CR0_PE and CR0_PG can be set to zero in VMX non-root operation + * if unrestricted guest execution is allowed. + */ + if (cap_unrestricted_guest) + cr0_ones_mask &= ~(CR0_PG | CR0_PE); + + /* + * Do not allow the guest to set CR0_NW or CR0_CD. 
+ */ + cr0_zeros_mask |= (CR0_NW | CR0_CD); + + fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); + fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); + cr4_ones_mask = fixed0 & fixed1; + cr4_zeros_mask = ~fixed0 & ~fixed1; + + vpid_init(); + + vmx_msr_init(); + + /* enable VMX operation */ + smp_rendezvous(NULL, vmx_enable, NULL, NULL); + + vmx_initialized = 1; + + return (0); +} + +static void +vmx_trigger_hostintr(int vector) +{ + uintptr_t func; + struct gate_descriptor *gd; + + gd = &idt[vector]; + + KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " + "invalid vector %d", vector)); + KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", + vector)); + KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " + "has invalid type %d", vector, gd->gd_type)); + KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " + "has invalid dpl %d", vector, gd->gd_dpl)); + KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " + "for vector %d has invalid selector %d", vector, gd->gd_selector)); + KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " + "IST %d", vector, gd->gd_ist)); + + func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); + vmx_call_isr(func); +} + +static int +vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) +{ + int error, mask_ident, shadow_ident; + uint64_t mask_value; + + if (which != 0 && which != 4) + panic("vmx_setup_cr_shadow: unknown cr%d", which); + + if (which == 0) { + mask_ident = VMCS_CR0_MASK; + mask_value = cr0_ones_mask | cr0_zeros_mask; + shadow_ident = VMCS_CR0_SHADOW; + } else { + mask_ident = VMCS_CR4_MASK; + mask_value = cr4_ones_mask | cr4_zeros_mask; + shadow_ident = VMCS_CR4_SHADOW; + } + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); + if (error) + return (error); + + error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); + if (error) + return (error); + + return (0); +} +#define vmx_setup_cr0_shadow(vmcs,init) 
vmx_setup_cr_shadow(0, (vmcs), (init)) +#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) + +static void * +vmx_vminit(struct vm *vm, pmap_t pmap) +{ + uint16_t vpid[VM_MAXCPU]; + int i, error; + struct vmx *vmx; + struct vmcs *vmcs; + uint32_t exc_bitmap; + + vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); + if ((uintptr_t)vmx & PAGE_MASK) { + panic("malloc of struct vmx not aligned on %d byte boundary", + PAGE_SIZE); + } + vmx->vm = vm; + + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + + /* + * Clean up EPTP-tagged guest physical and combined mappings + * + * VMX transitions are not required to invalidate any guest physical + * mappings. So, it may be possible for stale guest physical mappings + * to be present in the processor TLBs. + * + * Combined mappings for this EP4TA are also invalidated for all VPIDs. + */ + ept_invalidate_mappings(vmx->eptp); + + msr_bitmap_initialize(vmx->msr_bitmap); + + /* + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by the + * guest. + * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. + * + * The TSC MSR is exposed read-only. Writes are disallowed as that + * will impact the host TSC. + * XXX Writes would be implemented with a wrmsr trap, and + * then modifying the TSC offset in the VMCS. 
+ */ + if (guest_msr_rw(vmx, MSR_GSBASE) || + guest_msr_rw(vmx, MSR_FSBASE) || + guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || + guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || + guest_msr_rw(vmx, MSR_EFER) || + guest_msr_ro(vmx, MSR_TSC)) + panic("vmx_vminit: error setting guest msr access"); + + vpid_alloc(vpid, VM_MAXCPU); + + if (virtual_interrupt_delivery) { + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + APIC_ACCESS_ADDRESS); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + + for (i = 0; i < VM_MAXCPU; i++) { + vmcs = &vmx->vmcs[i]; + vmcs->identifier = vmx_revision(); + error = vmclear(vmcs); + if (error != 0) { + panic("vmx_vminit: vmclear error %d on vcpu %d\n", + error, i); + } + + vmx_msr_guest_init(vmx, i); + + error = vmcs_init(vmcs); + KASSERT(error == 0, ("vmcs_init error %d", error)); + + VMPTRLD(vmcs); + error = 0; + error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); + error += vmwrite(VMCS_EPTP, vmx->eptp); + error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); + error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); + error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); + error += vmwrite(VMCS_VPID, vpid[i]); + + /* exception bitmap */ + if (vcpu_trace_exceptions(vm, i)) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); + + if (virtual_interrupt_delivery) { + error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); + error += vmwrite(VMCS_VIRTUAL_APIC, + vtophys(&vmx->apic_page[i])); + error += vmwrite(VMCS_EOI_EXIT0, 0); + error += vmwrite(VMCS_EOI_EXIT1, 0); + error += vmwrite(VMCS_EOI_EXIT2, 0); + error += vmwrite(VMCS_EOI_EXIT3, 0); + } + if (posted_interrupts) { + error += 
vmwrite(VMCS_PIR_VECTOR, pirvec); + error += vmwrite(VMCS_PIR_DESC, + vtophys(&vmx->pir_desc[i])); + } + VMCLEAR(vmcs); + KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); + + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + vmx->cap[i].proc_ctls2 = procbased_ctls2; + + vmx->state[i].nextrip = ~0; + vmx->state[i].lastcpu = NOCPU; + vmx->state[i].vpid = vpid[i]; + + /* + * Set up the CR0/4 shadows, and init the read shadow + * to the power-on register value from the Intel Sys Arch. + * CR0 - 0x60000010 + * CR4 - 0 + */ + error = vmx_setup_cr0_shadow(vmcs, 0x60000010); + if (error != 0) + panic("vmx_setup_cr0_shadow %d", error); + + error = vmx_setup_cr4_shadow(vmcs, 0); + if (error != 0) + panic("vmx_setup_cr4_shadow %d", error); + + vmx->ctx[i].pmap = pmap; + } + + return (vmx); +} + +static int +vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) +{ + int handled, func; + + func = vmxctx->guest_rax; + + handled = x86_emulate_cpuid(vm, vcpu, + (uint32_t*)(&vmxctx->guest_rax), + (uint32_t*)(&vmxctx->guest_rbx), + (uint32_t*)(&vmxctx->guest_rcx), + (uint32_t*)(&vmxctx->guest_rdx)); + return (handled); +} + +static __inline void +vmx_run_trace(struct vmx *vmx, int vcpu) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); +#endif +} + +static __inline void +vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, + int handled) +{ +#ifdef KTR + VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", + handled ? 
"handled" : "unhandled", + exit_reason_to_str(exit_reason), rip); +#endif +} + +static __inline void +vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) +{ +#ifdef KTR + VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); +#endif +} + +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); + +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) +{ + struct vmxstate *vmxstate; + struct invvpid_desc invvpid_desc; + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->vpid == 0) + return; + + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } + + KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " + "critical section", __func__, vcpu)); + + /* + * Invalidate all mappings tagged with 'vpid' + * + * We do this because this vcpu was executing on a different host + * cpu when it last ran. We do not track whether it invalidated + * mappings associated with its 'vpid' during that run. So we must + * assume that the mappings associated with 'vpid' on 'curcpu' are + * stale and invalidate them. + * + * Note that we incur this penalty only when the scheduler chooses to + * move the thread associated with this vcpu between host cpus. + * + * Note also that this will invalidate mappings tagged with 'vpid' + * for "all" EP4TAs. 
+ */ + if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; + invvpid_desc.vpid = vmxstate->vpid; + invvpid_desc.linear_addr = 0; + invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); + } +} + +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) +{ + struct vmxstate *vmxstate; + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, pmap, 1); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 
+ */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static void __inline +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); + } +} + +static void __inline +vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) +{ + + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, + ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); +} + +static void __inline +vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { + vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); + } +} + +static void __inline +vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + + KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, + ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); + vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; + vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); + VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); +} + +#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ + VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) + +static void +vmx_inject_nmi(struct vmx *vmx, int vcpu) +{ + uint32_t gi, info; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT((gi & NMI_BLOCKING) == 0, 
("vmx_inject_nmi: invalid guest " + "interruptibility-state %#x", gi)); + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " + "VM-entry interruption information %#x", info)); + + /* + * Inject the virtual NMI. The vector must be the NMI IDT entry + * or the VMCS entry check will fail. + */ + info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + + /* Clear the request */ + vm_nmi_clear(vmx->vm, vcpu); +} + +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + if (vmx->state[vcpu].nextrip != guestrip) { + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { + KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " + "intinfo is not valid: %#lx", __func__, entryinfo)); + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. 
+ */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + need_nmi_exiting = 1; + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if ((info & VMCS_INTR_VALID) == 0) { + vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) + vmx_set_nmi_window_exiting(vmx, vcpu); + } + + extint_pending = vm_extint_pending(vmx->vm, vcpu); + + if (!extint_pending && virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + return; + } + + /* + * If interrupt-window exiting is already in effect then don't bother + * checking for pending interrupts. This is just an optimization and + * not needed for correctness. 
+ */ + if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { + VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " + "pending int_window_exiting"); + return; + } + + if (!extint_pending) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* Check RFLAGS.IF and the interruptibility state of the guest */ + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + if (info & VMCS_INTR_VALID) { + /* + * This is expected and could happen for multiple reasons: + * - A vectoring VM-entry was aborted due to astpending + * - A VM-exit happened during event injection. + * - An exception was injected above. 
+ * - An NMI was injected above or after "NMI window exiting" + */ + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (!extint_pending) { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. + */ + vmx_set_int_window_exiting(vmx, vcpu); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); +} + +/* + * If the Virtual NMIs execution control is '1' then the logical processor + * tracks virtual-NMI blocking in the Guest Interruptibility-state field of + * the VMCS. An IRET instruction in VMX non-root operation will remove any + * virtual-NMI blocking. + * + * This unblocking occurs even if the IRET causes a fault. In this case the + * hypervisor needs to restore virtual-NMI blocking before resuming the guest. 
+ */ +static void +vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); +} + +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %#x", gi)); +} + +static int +vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + struct vmxctx *vmxctx; + uint64_t xcrval; + const struct xsave_limits *limits; + + vmxctx = &vmx->ctx[vcpu]; + limits = vmm_get_xsave_limits(); + + /* + * Note that the processor raises a GP# fault on its own if + * xsetbv is executed for CPL != 0, so we do not have to + * emulate that fault here. + */ + + /* Only xcr0 is supported. */ + if (vmxctx->guest_rcx != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ + if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { + vm_inject_ud(vmx->vm, vcpu); + return (HANDLED); + } + + xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. 
*/ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. + */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * This runs "inside" vmrun() with the guest's FPU state, so + * modifying xcr0 directly modifies the guest's xcr0, not the + * host's. + */ + load_xcr(0, xcrval); + return (HANDLED); +} + +static uint64_t +vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) +{ + const struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + return (vmxctx->guest_rax); + case 1: + return (vmxctx->guest_rcx); + case 2: + return (vmxctx->guest_rdx); + case 3: + return (vmxctx->guest_rbx); + case 4: + return (vmcs_read(VMCS_GUEST_RSP)); + case 5: + return (vmxctx->guest_rbp); + case 6: + return (vmxctx->guest_rsi); + case 7: + return (vmxctx->guest_rdi); + case 8: + return (vmxctx->guest_r8); + case 9: + return (vmxctx->guest_r9); + case 10: + return (vmxctx->guest_r10); + case 11: + return (vmxctx->guest_r11); + case 12: + return (vmxctx->guest_r12); + case 13: + return (vmxctx->guest_r13); + case 14: + return (vmxctx->guest_r14); + case 15: + return (vmxctx->guest_r15); + default: + panic("invalid vmx register %d", ident); + } +} + +static void +vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) +{ + struct vmxctx *vmxctx; + + vmxctx = &vmx->ctx[vcpu]; + + switch (ident) { + case 0: + vmxctx->guest_rax = regval; + break; + case 1: + vmxctx->guest_rcx = regval; + break; + case 2: 
+ vmxctx->guest_rdx = regval; + break; + case 3: + vmxctx->guest_rbx = regval; + break; + case 4: + vmcs_write(VMCS_GUEST_RSP, regval); + break; + case 5: + vmxctx->guest_rbp = regval; + break; + case 6: + vmxctx->guest_rsi = regval; + break; + case 7: + vmxctx->guest_rdi = regval; + break; + case 8: + vmxctx->guest_r8 = regval; + break; + case 9: + vmxctx->guest_r9 = regval; + break; + case 10: + vmxctx->guest_r10 = regval; + break; + case 11: + vmxctx->guest_r11 = regval; + break; + case 12: + vmxctx->guest_r12 = regval; + break; + case 13: + vmxctx->guest_r13 = regval; + break; + case 14: + vmxctx->guest_r14 = regval; + break; + case 15: + vmxctx->guest_r15 = regval; + break; + default: + panic("invalid vmx register %d", ident); + } +} + +static int +vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr0 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR0_SHADOW, regval); + + crval = regval | cr0_ones_mask; + crval &= ~cr0_zeros_mask; + vmcs_write(VMCS_GUEST_CR0, crval); + + if (regval & CR0_PG) { + uint64_t efer, entry_ctls; + + /* + * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and + * the "IA-32e mode guest" bit in VM-entry control must be + * equal. 
+ */ + efer = vmcs_read(VMCS_GUEST_IA32_EFER); + if (efer & EFER_LME) { + efer |= EFER_LMA; + vmcs_write(VMCS_GUEST_IA32_EFER, efer); + entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); + entry_ctls |= VM_ENTRY_GUEST_LMA; + vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); + } + } + + return (HANDLED); +} + +static int +vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + uint64_t crval, regval; + + /* We only handle mov to %cr4 at this time */ + if ((exitqual & 0xf0) != 0x00) + return (UNHANDLED); + + regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); + + vmcs_write(VMCS_CR4_SHADOW, regval); + + crval = regval | cr4_ones_mask; + crval &= ~cr4_zeros_mask; + vmcs_write(VMCS_GUEST_CR4, crval); + + return (HANDLED); +} + +static int +vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) +{ + struct vlapic *vlapic; + uint64_t cr8; + int regnum; + + /* We only handle mov %cr8 to/from a register at this time. */ + if ((exitqual & 0xe0) != 0x00) { + return (UNHANDLED); + } + + vlapic = vm_lapic(vmx->vm, vcpu); + regnum = (exitqual >> 8) & 0xf; + if (exitqual & 0x10) { + cr8 = vlapic_get_cr8(vlapic); + vmx_set_guest_reg(vmx, vcpu, regnum, cr8); + } else { + cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); + vlapic_set_cr8(vlapic, cr8); + } + + return (HANDLED); +} + +/* + * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL + */ +static int +vmx_cpl(void) +{ + uint32_t ssar; + + ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); + return ((ssar >> 5) & 0x3); +} + +static enum vm_cpu_mode +vmx_cpu_mode(void) +{ + uint32_t csar; + + if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + if (csar & 0x2000) + return (CPU_MODE_64BIT); /* CS.L = 1 */ + else + return (CPU_MODE_COMPATIBILITY); + } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +vmx_paging_mode(void) +{ + + if 
/*
 * Decode the address size of a string I/O instruction from the VM-exit
 * instruction information.
 *
 * The 3-bit field at bits 9:7 of 'inst_info' is the size encoding:
 * 0, 1 and 2 yield 2, 4 and 8 bytes respectively.  Any other encoding
 * panics.
 */
static int
inout_str_addrsize(uint32_t inst_info)
{
	uint32_t encoding;

	encoding = (inst_info >> 7) & 0x7;
	if (encoding > 2)
		panic("%s: invalid size encoding %d", __func__, encoding);

	/* 0 -> 2 (16 bit), 1 -> 4 (32 bit), 2 -> 8 (64 bit) */
	return (2 << encoding);
}
vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = gla; + vmx_paging_info(paging); + switch (paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); + vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + vie_init(&vmexit->u.inst_emul.vie, NULL, 0); +} + +static int +ept_fault_type(uint64_t ept_qual) +{ + int fault_type; + + if (ept_qual & EPT_VIOLATION_DATA_WRITE) + fault_type = VM_PROT_WRITE; + else if (ept_qual & EPT_VIOLATION_INST_FETCH) + fault_type = VM_PROT_EXECUTE; + else + fault_type= VM_PROT_READ; + + return (fault_type); +} + +static boolean_t +ept_emulation_fault(uint64_t ept_qual) +{ + int read, write; + + /* EPT fault on an instruction fetch doesn't make sense here */ + if (ept_qual & EPT_VIOLATION_INST_FETCH) + return (FALSE); + + /* EPT fault must be a read fault or a write fault */ + read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; + write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; + if ((read | write) == 0) + return (FALSE); + + /* + * The EPT violation must have been caused by accessing a + * guest-physical address that is a translation of a guest-linear + * address. + */ + if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || + (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { + return (FALSE); + } + + return (TRUE); +} + +static __inline int +apic_access_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 
1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, + uint64_t qual) +{ + int error, handled, offset; + uint32_t *apic_regs, vector; + bool retu; + + handled = HANDLED; + offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vmx, vcpuid)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. + */ + if (x2apic_virtualization(vmx, vcpuid) && + offset == APIC_OFFSET_SELF_IPI) { + apic_regs = (uint32_t *)(vlapic->apic_page); + vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = UNHANDLED; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = UNHANDLED; + break; + } + return (handled); +} + +static bool +apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) +{ + + if (apic_access_virtualization(vmx, vcpuid) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!apic_access_virtualization(vmx, vcpuid)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... 
/*
 * Process the VM exit recorded in 'vmexit' for 'vcpu'.
 *
 * Reads vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_qualification,
 * vmexit->rip and vmexit->inst_length, and starts by setting
 * vmexit->exitcode to VM_EXITCODE_BOGUS.
 *
 * Return value and side effects:
 *  - Several cases 'return (1)' directly.  On those paths the guest %rip
 *    is NOT advanced by this function.
 *  - Otherwise the local 'handled' is returned.  When it is non-zero,
 *    vmexit->rip is advanced by vmexit->inst_length, inst_length is
 *    cleared, and the new %rip is written to the VMCS.
 *  - When 'handled' is zero and no case claimed the exit (exitcode is
 *    still VM_EXITCODE_BOGUS), the exit is converted to a generic
 *    VM_EXITCODE_VMX exit.
 *
 * NOTE(review): the literal 1 is presumably the same value as HANDLED and
 * UNHANDLED is presumably 0; neither definition is visible here - confirm.
 */
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
	int error, errcode, errcode_valid, handled, in;
	struct vmxctx *vmxctx;
	struct vlapic *vlapic;
	struct vm_inout_str *vis;
	struct vm_task_switch *ts;
	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
	uint32_t intr_type, intr_vec, reason;
	uint64_t exitintinfo, qual, gpa;
	bool retu;

	/* Compile-time checks on the pin-based controls this code relies on. */
	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
	CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);

	handled = UNHANDLED;
	vmxctx = &vmx->ctx[vcpu];

	qual = vmexit->u.vmx.exit_qualification;
	reason = vmexit->u.vmx.exit_reason;
	vmexit->exitcode = VM_EXITCODE_BOGUS;

	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	 * VM-entry failures during or after loading guest state.
	 *
	 * These VM-exits are uncommon but must be handled specially
	 * as most VM-exit fields are not populated as usual.
	 */
	if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
		VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
		__asm __volatile("int $18");
		return (1);
	}

	/*
	 * VM exits that can be triggered during event delivery need to
	 * be handled specially by re-injecting the event if the IDT
	 * vectoring information field's valid bit is set.
	 *
	 * See "Information for VM Exits During Event Delivery" in Intel SDM
	 * for details.
	 */
	idtvec_info = vmcs_idt_vectoring_info();
	if (idtvec_info & VMCS_IDT_VEC_VALID) {
		idtvec_info &= ~(1 << 12); /* clear undefined bit */
		/* Low 32 bits: vectoring info; high 32 bits: error code. */
		exitintinfo = idtvec_info;
		if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
			idtvec_err = vmcs_idt_vectoring_err();
			exitintinfo |= (uint64_t)idtvec_err << 32;
		}
		error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
		KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
		    __func__, error));

		/*
		 * If 'virtual NMIs' are being used and the VM-exit
		 * happened while injecting an NMI during the previous
		 * VM-entry, then clear "blocking by NMI" in the
		 * Guest Interruptibility-State so the NMI can be
		 * reinjected on the subsequent VM-entry.
		 *
		 * However, if the NMI was being delivered through a task
		 * gate, then the new task must start execution with NMIs
		 * blocked so don't clear NMI blocking in this case.
		 */
		intr_type = idtvec_info & VMCS_INTR_T_MASK;
		if (intr_type == VMCS_INTR_T_NMI) {
			if (reason != EXIT_REASON_TASK_SWITCH)
				vmx_clear_nmi_blocking(vmx, vcpu);
			else
				vmx_assert_nmi_blocking(vmx, vcpu);
		}

		/*
		 * Update VM-entry instruction length if the event being
		 * delivered was a software interrupt or software exception.
		 */
		if (intr_type == VMCS_INTR_T_SWINTR ||
		    intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
		    intr_type == VMCS_INTR_T_SWEXCEPTION) {
			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
		}
	}

	switch (reason) {
	case EXIT_REASON_TASK_SWITCH:
		/* Not emulated here: describe the task switch in 'vmexit'. */
		ts = &vmexit->u.task_switch;
		ts->tsssel = qual & 0xffff;
		ts->reason = vmx_task_switch_reason(qual);
		ts->ext = 0;
		ts->errcode_valid = 0;
		vmx_paging_info(&ts->paging);
		/*
		 * If the task switch was due to a CALL, JMP, IRET, software
		 * interrupt (INT n) or software exception (INT3, INTO),
		 * then the saved %rip references the instruction that caused
		 * the task switch. The instruction length field in the VMCS
		 * is valid in this case.
		 *
		 * In all other cases (e.g., NMI, hardware exception) the
		 * saved %rip is one that would have been saved in the old TSS
		 * had the task switch completed normally so the instruction
		 * length field is not needed in this case and is explicitly
		 * set to 0.
		 */
		if (ts->reason == TSR_IDT_GATE) {
			KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
			    ("invalid idtvec_info %#x for IDT task switch",
			    idtvec_info));
			intr_type = idtvec_info & VMCS_INTR_T_MASK;
			if (intr_type != VMCS_INTR_T_SWINTR &&
			    intr_type != VMCS_INTR_T_SWEXCEPTION &&
			    intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
				/* Task switch triggered by external event */
				ts->ext = 1;
				vmexit->inst_length = 0;
				if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
					ts->errcode_valid = 1;
					ts->errcode = vmcs_idt_vectoring_err();
				}
			}
		}
		vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
		/*
		 * NOTE(review): ts->errcode is only assigned above when an
		 * error code is valid, yet it is always passed to the trace
		 * call below.
		 */
		VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
		    "%s errcode 0x%016lx", ts->reason, ts->tsssel,
		    ts->ext ? "external" : "internal",
		    ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
		break;
	case EXIT_REASON_CR_ACCESS:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
		/*
		 * The low four bits of the qualification select the control
		 * register; anything other than %cr0, %cr4 or %cr8 is left
		 * unhandled.
		 */
		switch (qual & 0xf) {
		case 0:
			handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
			break;
		case 4:
			handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
			break;
		case 8:
			handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
			break;
		}
		break;
	case EXIT_REASON_RDMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
		retu = false;
		/* The MSR number is taken from the guest's %rcx. */
		ecx = vmxctx->guest_rcx;
		VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
		error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_RDMSR;
			vmexit->u.msr.code = ecx;
		} else if (!retu) {
			handled = HANDLED;
		} else {
			/* Return to userspace with a valid exitcode */
			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
			    ("emulate_rdmsr retu with bogus exitcode"));
		}
		break;
	case EXIT_REASON_WRMSR:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
		retu = false;
		/* MSR number from %rcx; value is %rdx:%rax (high:low). */
		eax = vmxctx->guest_rax;
		ecx = vmxctx->guest_rcx;
		edx = vmxctx->guest_rdx;
		VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
		    ecx, (uint64_t)edx << 32 | eax);
		error = emulate_wrmsr(vmx, vcpu, ecx,
		    (uint64_t)edx << 32 | eax, &retu);
		if (error) {
			vmexit->exitcode = VM_EXITCODE_WRMSR;
			vmexit->u.msr.code = ecx;
			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
		} else if (!retu) {
			handled = HANDLED;
		} else {
			/* Return to userspace with a valid exitcode */
			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
			    ("emulate_wrmsr retu with bogus exitcode"));
		}
		break;
	case EXIT_REASON_HLT:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
		vmexit->exitcode = VM_EXITCODE_HLT;
		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
		break;
	case EXIT_REASON_MTF:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
		vmexit->exitcode = VM_EXITCODE_MTRAP;
		vmexit->inst_length = 0;
		break;
	case EXIT_REASON_PAUSE:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		break;
	case EXIT_REASON_INTR_WINDOW:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
		vmx_clear_int_window_exiting(vmx, vcpu);
		return (1);
	case EXIT_REASON_EXT_INTR:
		/*
		 * External interrupts serve only to cause VM exits and allow
		 * the host interrupt handler to run.
		 *
		 * If this external interrupt triggers a virtual interrupt
		 * to a VM, then that state will be recorded by the
		 * host interrupt handler in the VM's softc. We will inject
		 * this virtual interrupt during the subsequent VM enter.
		 */
		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);

		/*
		 * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
		 * This appears to be a bug in VMware Fusion?
		 */
		if (!(intr_info & VMCS_INTR_VALID))
			return (1);
		KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
		    (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
		    ("VM exit interruption info invalid: %#x", intr_info));
		/* The low 8 bits of the interruption info are the vector. */
		vmx_trigger_hostintr(intr_info & 0xff);

		/*
		 * This is special. We want to treat this as a 'handled'
		 * VM-exit but not increment the instruction pointer.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
		return (1);
	case EXIT_REASON_NMI_WINDOW:
		/* Exit to allow the pending virtual NMI to be injected */
		if (vm_nmi_pending(vmx->vm, vcpu))
			vmx_inject_nmi(vmx, vcpu);
		vmx_clear_nmi_window_exiting(vmx, vcpu);
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
		return (1);
	case EXIT_REASON_INOUT:
		/*
		 * Decode the I/O instruction from the qualification:
		 * bits 2:0 = size - 1, bit 3 = direction (1 is "in"),
		 * bit 4 = string instruction, bit 5 = rep prefix,
		 * bits 31:16 = port number.
		 */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
		vmexit->exitcode = VM_EXITCODE_INOUT;
		vmexit->u.inout.bytes = (qual & 0x7) + 1;
		vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
		vmexit->u.inout.port = (uint16_t)(qual >> 16);
		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
		if (vmexit->u.inout.string) {
			/* String I/O: also capture paging and segment state. */
			inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
			vmexit->exitcode = VM_EXITCODE_INOUT_STR;
			vis = &vmexit->u.inout_str;
			vmx_paging_info(&vis->paging);
			vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
			vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
			vis->index = inout_str_index(vmx, vcpu, in);
			vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
			vis->addrsize = inout_str_addrsize(inst_info);
			inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
		}
		break;
	case EXIT_REASON_CPUID:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
		break;
	case EXIT_REASON_EXCEPTION:
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
		intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
		KASSERT((intr_info & VMCS_INTR_VALID) != 0,
		    ("VM exit interruption info invalid: %#x", intr_info));

		intr_vec = intr_info & 0xff;
		intr_type = intr_info & VMCS_INTR_T_MASK;

		/*
		 * If Virtual NMIs control is 1 and the VM-exit is due to a
		 * fault encountered during the execution of IRET then we must
		 * restore the state of "virtual-NMI blocking" before resuming
		 * the guest.
		 *
		 * See "Resuming Guest Software after Handling an Exception".
		 * See "Information for VM Exits Due to Vectored Events".
		 */
		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
		    (intr_vec != IDT_DF) &&
		    (intr_info & EXIT_QUAL_NMIUDTI) != 0)
			vmx_restore_nmi_blocking(vmx, vcpu);

		/*
		 * The NMI has already been handled in vmx_exit_handle_nmi().
		 */
		if (intr_type == VMCS_INTR_T_NMI)
			return (1);

		/*
		 * Call the machine check handler by hand. Also don't reflect
		 * the machine check back into the guest.
		 */
		if (intr_vec == IDT_MC) {
			VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
			__asm __volatile("int $18");
			return (1);
		}

		/* For a page fault the qualification is stored as guest %cr2. */
		if (intr_vec == IDT_PF) {
			error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
			KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
			    __func__, error));
		}

		/*
		 * Software exceptions exhibit trap-like behavior. This in
		 * turn requires populating the VM-entry instruction length
		 * so that the %rip in the trap frame is past the INT3/INTO
		 * instruction.
		 */
		if (intr_type == VMCS_INTR_T_SWEXCEPTION)
			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);

		/* Reflect all other exceptions back into the guest */
		errcode_valid = errcode = 0;
		if (intr_info & VMCS_INTR_DEL_ERRCODE) {
			errcode_valid = 1;
			errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
		}
		VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
		    "the guest", intr_vec, errcode);
		error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
		    errcode_valid, errcode, 0);
		KASSERT(error == 0, ("%s: vm_inject_exception error %d",
		    __func__, error));
		return (1);

	case EXIT_REASON_EPT_FAULT:
		/*
		 * If 'gpa' lies within the address space allocated to
		 * memory then this must be a nested page fault otherwise
		 * this must be an instruction that accesses MMIO space.
		 */
		gpa = vmcs_gpa();
		if (vm_mem_allocated(vmx->vm, gpa) ||
		    apic_access_fault(vmx, vcpu, gpa)) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->inst_length = 0;
			vmexit->u.paging.gpa = gpa;
			vmexit->u.paging.fault_type = ept_fault_type(qual);
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
		} else if (ept_emulation_fault(qual)) {
			vmexit_inst_emul(vmexit, gpa, vmcs_gla());
			vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
		}
		/*
		 * If Virtual NMIs control is 1 and the VM-exit is due to an
		 * EPT fault during the execution of IRET then we must restore
		 * the state of "virtual-NMI blocking" before resuming.
		 *
		 * See description of "NMI unblocking due to IRET" in
		 * "Exit Qualification for EPT Violations".
		 */
		if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
		    (qual & EXIT_QUAL_NMIUDTI) != 0)
			vmx_restore_nmi_blocking(vmx, vcpu);
		break;
	case EXIT_REASON_VIRTUALIZED_EOI:
		vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
		vmexit->u.ioapic_eoi.vector = qual & 0xFF;
		vmexit->inst_length = 0;	/* trap-like */
		break;
	case EXIT_REASON_APIC_ACCESS:
		handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
		break;
	case EXIT_REASON_APIC_WRITE:
		/*
		 * APIC-write VM exit is trap-like so the %rip is already
		 * pointing to the next instruction.
		 */
		vmexit->inst_length = 0;
		vlapic = vm_lapic(vmx->vm, vcpu);
		handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
		break;
	case EXIT_REASON_XSETBV:
		handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
		break;
	case EXIT_REASON_MONITOR:
		vmexit->exitcode = VM_EXITCODE_MONITOR;
		break;
	case EXIT_REASON_MWAIT:
		vmexit->exitcode = VM_EXITCODE_MWAIT;
		break;
	default:
		/* No case claimed this reason; it becomes a generic VMX exit. */
		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}

	if (handled) {
		/*
		 * It is possible that control is returned to userland
		 * even though we were able to handle the VM exit in the
		 * kernel.
		 *
		 * In such a case we want to make sure that the userland
		 * restarts guest execution at the instruction *after*
		 * the one we just processed. Therefore we update the
		 * guest rip in the VMCS and in 'vmexit'.
		 */
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;
		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic VMX exit.
			 */
			vmexit->exitcode = VM_EXITCODE_VMX;
			vmexit->u.vmx.status = VM_SUCCESS;
			vmexit->u.vmx.inst_type = 0;
			vmexit->u.vmx.inst_error = 0;
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}
/*
 * Run 'vcpu' of the VM described by 'arg' (a struct vmx *), starting at
 * guest 'rip', until a VM exit occurs that is not fully handled here or a
 * suspend / rendezvous / yield request is observed.
 *
 * The vcpu's VMCS is loaded with VMPTRLD on entry and released with VMCLEAR
 * before returning.  Each loop iteration, with interrupts disabled, injects
 * pending events, checks the suspend and rendezvous cookies and the yield
 * condition, enters the guest, and then records the exit information in the
 * struct vm_exit obtained from vm_exitinfo() before processing the exit.
 * The loop repeats for as long as the exit was handled.
 *
 * 'pmap' must be the pmap already stored in the vcpu context (asserted).
 *
 * Always returns 0; the reason for leaving the loop is reported through
 * the struct vm_exit.
 */
static int
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
    void *rendezvous_cookie, void *suspend_cookie)
{
	int rc, handled, launched;
	struct vmx *vmx;
	struct vm *vm;
	struct vmxctx *vmxctx;
	struct vmcs *vmcs;
	struct vm_exit *vmexit;
	struct vlapic *vlapic;
	uint32_t exit_reason;

	vmx = arg;
	vm = vmx->vm;
	vmcs = &vmx->vmcs[vcpu];
	vmxctx = &vmx->ctx[vcpu];
	vlapic = vm_lapic(vm, vcpu);
	vmexit = vm_exitinfo(vm, vcpu);
	/* Passed to vmx_enter_guest(); set to 1 after the first entry below. */
	launched = 0;

	KASSERT(vmxctx->pmap == pmap,
	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));

	vmx_msr_guest_enter(vmx, vcpu);

	VMPTRLD(vmcs);

	/*
	 * XXX
	 * We do this every time because we may setup the virtual machine
	 * from a different process than the one that actually runs it.
	 *
	 * If the life of a virtual machine was spent entirely in the context
	 * of a single process we could do this once in vmx_vminit().
	 */
	vmcs_write(VMCS_HOST_CR3, rcr3());

	vmcs_write(VMCS_GUEST_RIP, rip);
	vmx_set_pcpu_defaults(vmx, vcpu, pmap);
	do {
		KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
		    "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));

		handled = UNHANDLED;
		/*
		 * Interrupts are disabled from this point on until the
		 * guest starts executing. This is done for the following
		 * reasons:
		 *
		 * If an AST is asserted on this thread after the check below,
		 * then the IPI_AST notification will not be lost, because it
		 * will cause a VM exit due to external interrupt as soon as
		 * the guest state is loaded.
		 *
		 * A posted interrupt after 'vmx_inject_interrupts()' will
		 * not be "lost" because it will be held pending in the host
		 * APIC because interrupts are disabled. The pending interrupt
		 * will be recognized as soon as the guest state is loaded.
		 *
		 * The same reasoning applies to the IPI generated by
		 * pmap_invalidate_ept().
		 */
		disable_intr();
		vmx_inject_interrupts(vmx, vcpu, vlapic, rip);

		/*
		 * Check for vcpu suspension after injecting events because
		 * vmx_inject_interrupts() can suspend the vcpu due to a
		 * triple fault.
		 */
		if (vcpu_suspended(suspend_cookie)) {
			enable_intr();
			vm_exit_suspended(vmx->vm, vcpu, rip);
			break;
		}

		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
			enable_intr();
			vm_exit_rendezvous(vmx->vm, vcpu, rip);
			break;
		}

		/*
		 * Unlike the two breaks above, this one leaves the loop with
		 * 'handled' set to HANDLED.
		 */
		if (vcpu_should_yield(vm, vcpu)) {
			enable_intr();
			vm_exit_astpending(vmx->vm, vcpu, rip);
			vmx_astpending_trace(vmx, vcpu, rip);
			handled = HANDLED;
			break;
		}

		vmx_run_trace(vmx, vcpu);
		rc = vmx_enter_guest(vmxctx, vmx, launched);

		/* Collect some information for VM exit processing */
		vmexit->rip = rip = vmcs_guest_rip();
		vmexit->inst_length = vmexit_instruction_length();
		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();

		/* Update 'nextrip' */
		vmx->state[vcpu].nextrip = rip;

		if (rc == VMX_GUEST_VMEXIT) {
			/* Must run before interrupts are re-enabled. */
			vmx_exit_handle_nmi(vmx, vcpu, vmexit);
			enable_intr();
			handled = vmx_exit_process(vmx, vcpu, vmexit);
		} else {
			/* The VMX instruction itself failed. */
			enable_intr();
			vmx_exit_inst_error(vmxctx, rc, vmexit);
		}
		launched = 1;
		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
		/* Exit processing may have advanced vmexit->rip. */
		rip = vmexit->rip;
	} while (handled);

	/*
	 * If a VM exit has been handled then the exitcode must be BOGUS
	 * If a VM exit is not handled then the exitcode must not be BOGUS
	 */
	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
		panic("Mismatch between handled (%d) and exitcode (%d)",
		      handled, vmexit->exitcode);
	}

	if (!handled)
		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);

	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
	    vmexit->exitcode);

	VMCLEAR(vmcs);
	vmx_msr_guest_exit(vmx, vcpu);

	return (0);
}
vmxctx_regptr(vmxctx, reg)) != NULL) {
+		*retval = *regp;
+		return (0);
+	} else
+		return (EINVAL);
+}
+
+/*
+ * Store 'val' in the software-saved guest register identified by 'reg'.
+ *
+ * Returns 0 on success, or EINVAL when vmxctx_regptr() has no slot for
+ * 'reg' in the vmxctx.
+ */
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+	register_t *regp;
+
+	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+		*regp = val;
+		return (0);
+	} else
+		return (EINVAL);
+}
+
+/*
+ * Report whether HWINTR_BLOCKING is set in the guest interruptibility
+ * state of 'vcpu'.
+ *
+ * On success '*retval' is set to 1 or 0 and 0 is returned.  On failure the
+ * error from vmcs_getreg() is returned and '*retval' is left untouched.
+ */
+static int
+vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
+{
+	uint64_t gi;
+	int error;
+
+	error = vmcs_getreg(&vmx->vmcs[vcpu], running,
+	    VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
+	/* 'gi' is only meaningful if the VMCS read succeeded. */
+	if (error == 0)
+		*retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
+	return (error);
+}
+
+/*
+ * Clear the interrupt shadow of 'vcpu'.
+ *
+ * Only 'val' == 0 is accepted; any other value yields EINVAL.  Otherwise
+ * the result of the VMCS read-modify-write is returned.  The outcome is
+ * always logged through VCPU_CTR2().
+ */
+static int
+vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
+{
+	struct vmcs *vmcs;
+	uint64_t gi;
+	int error, ident;
+
+	/*
+	 * Forcing the vcpu into an interrupt shadow is not supported.
+	 */
+	if (val) {
+		error = EINVAL;
+		goto done;
+	}
+
+	vmcs = &vmx->vmcs[vcpu];
+	ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
+	error = vmcs_getreg(vmcs, running, ident, &gi);
+	if (error == 0) {
+		gi &= ~HWINTR_BLOCKING;
+		error = vmcs_setreg(vmcs, running, ident, gi);
+	}
+done:
+	VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
+	    error ? "failed" : "succeeded");
+	return (error);
+}
+
+/*
+ * Map a guest register identifier to the VMCS field holding its shadow.
+ *
+ * Returns VMCS_CR0_SHADOW or VMCS_CR4_SHADOW for %cr0/%cr4 and -1 for every
+ * other register.
+ */
+static int
+vmx_shadow_reg(int reg)
+{
+	int shreg;
+
+	shreg = -1;
+
+	switch (reg) {
+	case VM_REG_GUEST_CR0:
+		shreg = VMCS_CR0_SHADOW;
+		break;
+	case VM_REG_GUEST_CR4:
+		shreg = VMCS_CR4_SHADOW;
+		break;
+	default:
+		break;
+	}
+
+	return (shreg);
+}
+
+/*
+ * Read guest register 'reg' of 'vcpu' into '*retval'.
+ *
+ * The interrupt-shadow pseudo register is handled first, then the
+ * software-saved context, and finally the VMCS.  Panics if the vcpu is
+ * running on a host cpu other than the current one.
+ */
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+	int running, hostcpu;
+	struct vmx *vmx = arg;
+
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu)
+		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	if (reg == VM_REG_GUEST_INTR_SHADOW)
+		return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
+
+	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+		return (0);
+
+	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
+}
+
+/*
+ * Write 'val' to guest register 'reg' of 'vcpu'.
+ *
+ * The interrupt-shadow pseudo register and registers kept in the vmxctx
+ * are handled directly; everything else is written to the VMCS.  After a
+ * successful VMCS write:
+ *  - an EFER write re-syncs the "IA-32e mode guest" VM-entry control,
+ *  - a %cr0/%cr4 write also stores the value in the matching shadow field,
+ *  - a %cr3 write invalidates the vcpu's TLB mappings via vmx_invvpid().
+ *
+ * Returns 0 on success or an error number.  Panics if the vcpu is running
+ * on a host cpu other than the current one.
+ */
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+	int error, hostcpu, running, shadow;
+	uint64_t ctls;
+	pmap_t pmap;
+	struct vmx *vmx = arg;
+
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu)
+		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	if (reg == VM_REG_GUEST_INTR_SHADOW)
+		return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
+
+	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+		return (0);
+
+	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
+
+	if (error == 0) {
+		/*
+		 * If the "load EFER" VM-entry control is 1 then the
+		 * value of EFER.LMA must be identical to "IA-32e mode guest"
+		 * bit in the VM-entry control.
+		 */
+		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+		    (reg == VM_REG_GUEST_EFER)) {
+			/*
+			 * Do not modify and write back 'ctls' unless the
+			 * read succeeded; it would be uninitialized.
+			 */
+			error = vmcs_getreg(&vmx->vmcs[vcpu], running,
+				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+			if (error != 0)
+				return (error);
+			if (val & EFER_LMA)
+				ctls |= VM_ENTRY_GUEST_LMA;
+			else
+				ctls &= ~VM_ENTRY_GUEST_LMA;
+			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
+				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+			if (error != 0)
+				return (error);
+		}
+
+		shadow = vmx_shadow_reg(reg);
+		if (shadow > 0) {
+			/*
+			 * Store the unmodified value in the shadow
+			 */
+			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
+				    VMCS_IDENT(shadow), val);
+		}
+
+		if (reg == VM_REG_GUEST_CR3) {
+			/*
+			 * Invalidate the guest vcpu's TLB mappings to emulate
+			 * the behavior of updating %cr3.
+			 *
+			 * XXX the processor retains global mappings when %cr3
+			 * is updated but vmx_invvpid() does not.
+			 */
+			pmap = vmx->ctx[vcpu].pmap;
+			vmx_invvpid(vmx, vcpu, pmap, running);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Fetch segment descriptor 'reg' of 'vcpu' from the VMCS into '*desc'.
+ * Panics if the vcpu is running on a host cpu other than the current one.
+ */
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+	int hostcpu, running;
+	struct vmx *vmx = arg;
+
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu)
+		panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
+}
+
+/*
+ * Store '*desc' as segment descriptor 'reg' of 'vcpu' in the VMCS.
+ * Panics if the vcpu is running on a host cpu other than the current one.
+ */
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+	int hostcpu, running;
+	struct vmx *vmx = arg;
+
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu)
+		panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
+
+	return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+	struct vmx *vmx = arg;
+	int vcap;
+	int ret;
+
+	ret = ENOENT;
+
+	vcap = vmx->cap[vcpu].set;
+
+	switch (type) {
+	case VM_CAP_HALT_EXIT:
+		if (cap_halt_exit)
+			ret = 0;
+		break;
+	case VM_CAP_PAUSE_EXIT:
+		if (cap_pause_exit)
+			ret = 0;
+		break;
+	case VM_CAP_MTRAP_EXIT:
+		if (cap_monitor_trap)
+			ret = 0;
+		break;
+	case
VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) + ret = 0; + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) + ret = 0; + break; + default: + break; + } + + if (ret == 0) + *retval = (vcap & (1 << type)) ? 1 : 0; + + return (ret); +} + +static int +vmx_setcap(void *arg, int vcpu, int type, int val) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + uint32_t baseval; + uint32_t *pptr; + int error; + int flag; + int reg; + int retval; + + retval = ENOENT; + pptr = NULL; + + switch (type) { + case VM_CAP_HALT_EXIT: + if (cap_halt_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_HLT_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_MTRAP_EXIT: + if (cap_monitor_trap) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_MTF; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_PAUSE_EXIT: + if (cap_pause_exit) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls; + baseval = *pptr; + flag = PROCBASED_PAUSE_EXITING; + reg = VMCS_PRI_PROC_BASED_CTLS; + } + break; + case VM_CAP_UNRESTRICTED_GUEST: + if (cap_unrestricted_guest) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_UNRESTRICTED_GUEST; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_ENABLE_INVPCID; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; + default: + break; + } + + if (retval == 0) { + if (val) { + baseval |= flag; + } else { + baseval &= ~flag; + } + VMPTRLD(vmcs); + error = vmwrite(reg, baseval); + VMCLEAR(vmcs); + + if (error) { + retval = error; + } else { + /* + * Update optional stored flags, and record + * setting + */ + if (pptr != NULL) { + *pptr = baseval; + } + + if (val) { + vmx->cap[vcpu].set |= (1 << type); + } else { + vmx->cap[vcpu].set &= ~(1 << type); + } + } + } + + 
return (retval); +} + +struct vlapic_vtx { + struct vlapic vlapic; + struct pir_desc *pir_desc; + struct vmx *vmx; +}; + +#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ +do { \ + VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ + level ? "level" : "edge", vector); \ + VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ + VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ + VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ + VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ + VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ +} while (0) + +/* + * vlapic->ops handlers that utilize the APICv hardware assist described in + * Chapter 29 of the Intel SDM. + */ +static int +vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + uint64_t mask; + int idx, notify; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + /* + * Keep track of interrupt requests in the PIR descriptor. This is + * because the virtual APIC page pointed to by the VMCS cannot be + * modified if the vcpu is running. + */ + idx = vector / 64; + mask = 1UL << (vector % 64); + atomic_set_long(&pir_desc->pir[idx], mask); + notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + + VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, + level, "vmx_set_intr_ready"); + return (notify); +} + +static int +vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t pending, pirval; + uint32_t ppr, vpr; + int i; + + /* + * This function is only expected to be called from the 'HLT' exit + * handler which does not care about the vector that is pending. 
+ */ + KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + + pending = atomic_load_acq_long(&pir_desc->pending); + if (!pending) + return (0); /* common case */ + + /* + * If there is an interrupt pending then it will be recognized only + * if its priority is greater than the processor priority. + * + * Special case: if the processor priority is zero then any pending + * interrupt will be recognized. + */ + lapic = vlapic->apic_page; + ppr = lapic->ppr & 0xf0; + if (ppr == 0) + return (1); + + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", + lapic->ppr); + + for (i = 3; i >= 0; i--) { + pirval = pir_desc->pir[i]; + if (pirval != 0) { + vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; + return (vpr > ppr); + } + } + return (0); +} + +static void +vmx_intr_accepted(struct vlapic *vlapic, int vector) +{ + + panic("vmx_intr_accepted: not expected to be called"); +} + +static void +vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ + struct vlapic_vtx *vlapic_vtx; + struct vmx *vmx; + struct vmcs *vmcs; + uint64_t mask, val; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), + ("vmx_set_tmr: vcpu cannot be running")); + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vmx = vlapic_vtx->vmx; + vmcs = &vmx->vmcs[vlapic->vcpuid]; + mask = 1UL << (vector % 64); + + VMPTRLD(vmcs); + val = vmcs_read(VMCS_EOI_EXIT(vector)); + if (level) + val |= mask; + else + val &= ~mask; + vmcs_write(VMCS_EOI_EXIT(vector), val); + VMCLEAR(vmcs); +} + +static void +vmx_enable_x2apic_mode(struct vlapic *vlapic) +{ + struct vmx *vmx; + struct vmcs *vmcs; + uint32_t proc_ctls2; + int vcpuid, error; + + vcpuid = vlapic->vcpuid; + vmx = ((struct vlapic_vtx *)vlapic)->vmx; + vmcs = &vmx->vmcs[vcpuid]; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + KASSERT((proc_ctls2 & 
PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, + ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); + + proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; + proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; + vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; + + VMPTRLD(vmcs); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); + VMCLEAR(vmcs); + + if (vlapic->vcpuid == 0) { + /* + * The nested page table mappings are shared by all vcpus + * so unmap the APIC access page just once. + */ + error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", + __func__, error)); + + /* + * The MSR bitmap is shared by all vcpus so modify it only + * once in the context of vcpu 0. + */ + error = vmx_allow_x2apic_msrs(vmx); + KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", + __func__, error)); + } +} + +static void +vmx_post_intr(struct vlapic *vlapic, int hostcpu) +{ + + ipi_cpu(hostcpu, pirvec); +} + +/* + * Transfer the pending interrupts in the PIR descriptor to the IRR + * in the virtual APIC page. 
+ */ +static void +vmx_inject_pir(struct vlapic *vlapic) +{ + struct vlapic_vtx *vlapic_vtx; + struct pir_desc *pir_desc; + struct LAPIC *lapic; + uint64_t val, pirval; + int rvi, pirbase = -1; + uint16_t intr_status_old, intr_status_new; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + pir_desc = vlapic_vtx->pir_desc; + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "no posted interrupt pending"); + return; + } + + pirval = 0; + pirbase = -1; + lapic = vlapic->apic_page; + + val = atomic_readandclear_long(&pir_desc->pir[0]); + if (val != 0) { + lapic->irr0 |= val; + lapic->irr1 |= val >> 32; + pirbase = 0; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[1]); + if (val != 0) { + lapic->irr2 |= val; + lapic->irr3 |= val >> 32; + pirbase = 64; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[2]); + if (val != 0) { + lapic->irr4 |= val; + lapic->irr5 |= val >> 32; + pirbase = 128; + pirval = val; + } + + val = atomic_readandclear_long(&pir_desc->pir[3]); + if (val != 0) { + lapic->irr6 |= val; + lapic->irr7 |= val >> 32; + pirbase = 192; + pirval = val; + } + + VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); + + /* + * Update RVI so the processor can evaluate pending virtual + * interrupts on VM-entry. + * + * It is possible for pirval to be 0 here, even though the + * pending bit has been set. The scenario is: + * CPU-Y is sending a posted interrupt to CPU-X, which + * is running a guest and processing posted interrupts in h/w. + * CPU-X will eventually exit and the state seen in s/w is + * the pending bit set, but no PIR bits set. 
+ * + * CPU-X CPU-Y + * (vm running) (host running) + * rx posted interrupt + * CLEAR pending bit + * SET PIR bit + * READ/CLEAR PIR bits + * SET pending bit + * (vm exit) + * pending bit set, PIR 0 + */ + if (pirval != 0) { + rvi = pirbase + flsl(pirval) - 1; + intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + intr_status_new = (intr_status_old & 0xFF00) | rvi; + if (intr_status_new > intr_status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " + "guest_intr_status changed from 0x%04x to 0x%04x", + intr_status_old, intr_status_new); + } + } +} + +static struct vlapic * +vmx_vlapic_init(void *arg, int vcpuid) +{ + struct vmx *vmx; + struct vlapic *vlapic; + struct vlapic_vtx *vlapic_vtx; + + vmx = arg; + + vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = vmx->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; + + vlapic_vtx = (struct vlapic_vtx *)vlapic; + vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; + vlapic_vtx->vmx = vmx; + + if (virtual_interrupt_delivery) { + vlapic->ops.set_intr_ready = vmx_set_intr_ready; + vlapic->ops.pending_intr = vmx_pending_intr; + vlapic->ops.intr_accepted = vmx_intr_accepted; + vlapic->ops.set_tmr = vmx_set_tmr; + vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; + } + + if (posted_interrupts) + vlapic->ops.post_intr = vmx_post_intr; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_VLAPIC); +} + +struct vmm_ops vmm_ops_intel = { + vmx_init, + vmx_cleanup, + vmx_restore, + vmx_vminit, + vmx_run, + vmx_vmcleanup, + vmx_getreg, + vmx_setreg, + vmx_getdesc, + vmx_setdesc, + vmx_getcap, + vmx_setcap, + ept_vmspace_alloc, + ept_vmspace_free, + vmx_vlapic_init, + vmx_vlapic_cleanup, +}; diff --git a/vmm/intel/vmx.h b/vmm/intel/vmx.h new file mode 100644 index 
0000000..bc48861 --- /dev/null +++ b/vmm/intel/vmx.h @@ -0,0 +1,140 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_H_ +#define _VMX_H_ + +#include "vmcs.h" + +struct pmap; + +struct vmxctx { + register_t guest_rdi; /* Guest state */ + register_t guest_rsi; + register_t guest_rdx; + register_t guest_rcx; + register_t guest_r8; + register_t guest_r9; + register_t guest_rax; + register_t guest_rbx; + register_t guest_rbp; + register_t guest_r10; + register_t guest_r11; + register_t guest_r12; + register_t guest_r13; + register_t guest_r14; + register_t guest_r15; + register_t guest_cr2; + + register_t host_r15; /* Host state */ + register_t host_r14; + register_t host_r13; + register_t host_r12; + register_t host_rbp; + register_t host_rsp; + register_t host_rbx; + /* + * XXX todo debug registers and fpu state + */ + + int inst_fail_status; + + /* + * The pmap needs to be deactivated in vmx_enter_guest() + * so keep a copy of the 'pmap' in each vmxctx. + */ + struct pmap *pmap; +}; + +struct vmxcap { + int set; + uint32_t proc_ctls; + uint32_t proc_ctls2; +}; + +struct vmxstate { + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that this 'vcpu' last ran on */ + uint16_t vpid; +}; + +struct apic_page { + uint32_t reg[PAGE_SIZE / 4]; +}; +CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); + +/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ +struct pir_desc { + uint64_t pir[4]; + uint64_t pending; + uint64_t unused[3]; +} __aligned(64); +CTASSERT(sizeof(struct pir_desc) == 64); + +/* Index into the 'guest_msrs[]' array */ +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + IDX_MSR_KGSBASE, + IDX_MSR_PAT, + GUEST_MSR_NUM /* must be the last enumeration */ +}; + +/* virtual machine softc */ +struct vmx { + struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ + struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ + char msr_bitmap[PAGE_SIZE]; + struct pir_desc pir_desc[VM_MAXCPU]; + uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; 
+ struct vmxctx ctx[VM_MAXCPU]; + struct vmxcap cap[VM_MAXCPU]; + struct vmxstate state[VM_MAXCPU]; + uint64_t eptp; + struct vm *vm; + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ +}; +CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); + +#define VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +#define VMX_INVEPT_ERROR 3 +int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched); +void vmx_call_isr(uintptr_t entry); + +u_long vmx_fix_cr0(u_long cr0); +u_long vmx_fix_cr4(u_long cr4); + +extern char vmx_exit_guest[]; + +#endif diff --git a/vmm/intel/vmx_controls.h b/vmm/intel/vmx_controls.h new file mode 100644 index 0000000..2b117ae --- /dev/null +++ b/vmm/intel/vmx_controls.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CONTROLS_H_ +#define _VMX_CONTROLS_H_ + +/* Pin-Based VM-Execution Controls */ +#define PINBASED_EXTINT_EXITING (1 << 0) +#define PINBASED_NMI_EXITING (1 << 3) +#define PINBASED_VIRTUAL_NMI (1 << 5) +#define PINBASED_PREMPTION_TIMER (1 << 6) +#define PINBASED_POSTED_INTERRUPT (1 << 7) + +/* Primary Processor-Based VM-Execution Controls */ +#define PROCBASED_INT_WINDOW_EXITING (1 << 2) +#define PROCBASED_TSC_OFFSET (1 << 3) +#define PROCBASED_HLT_EXITING (1 << 7) +#define PROCBASED_INVLPG_EXITING (1 << 9) +#define PROCBASED_MWAIT_EXITING (1 << 10) +#define PROCBASED_RDPMC_EXITING (1 << 11) +#define PROCBASED_RDTSC_EXITING (1 << 12) +#define PROCBASED_CR3_LOAD_EXITING (1 << 15) +#define PROCBASED_CR3_STORE_EXITING (1 << 16) +#define PROCBASED_CR8_LOAD_EXITING (1 << 19) +#define PROCBASED_CR8_STORE_EXITING (1 << 20) +#define PROCBASED_USE_TPR_SHADOW (1 << 21) +#define PROCBASED_NMI_WINDOW_EXITING (1 << 22) +#define PROCBASED_MOV_DR_EXITING (1 << 23) +#define PROCBASED_IO_EXITING (1 << 24) +#define PROCBASED_IO_BITMAPS (1 << 25) +#define PROCBASED_MTF (1 << 27) +#define PROCBASED_MSR_BITMAPS (1 << 28) +#define PROCBASED_MONITOR_EXITING (1 << 29) +#define PROCBASED_PAUSE_EXITING (1 << 30) +#define PROCBASED_SECONDARY_CONTROLS (1U << 31) + +/* Secondary Processor-Based VM-Execution Controls */ +#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0) +#define PROCBASED2_ENABLE_EPT (1 << 1) +#define 
PROCBASED2_DESC_TABLE_EXITING (1 << 2) +#define PROCBASED2_ENABLE_RDTSCP (1 << 3) +#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4) +#define PROCBASED2_ENABLE_VPID (1 << 5) +#define PROCBASED2_WBINVD_EXITING (1 << 6) +#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7) +#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8) +#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9) +#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10) +#define PROCBASED2_ENABLE_INVPCID (1 << 12) + +/* VM Exit Controls */ +#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2) +#define VM_EXIT_HOST_LMA (1 << 9) +#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12) +#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15) +#define VM_EXIT_SAVE_PAT (1 << 18) +#define VM_EXIT_LOAD_PAT (1 << 19) +#define VM_EXIT_SAVE_EFER (1 << 20) +#define VM_EXIT_LOAD_EFER (1 << 21) +#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22) + +/* VM Entry Controls */ +#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2) +#define VM_ENTRY_GUEST_LMA (1 << 9) +#define VM_ENTRY_INTO_SMM (1 << 10) +#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11) +#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13) +#define VM_ENTRY_LOAD_PAT (1 << 14) +#define VM_ENTRY_LOAD_EFER (1 << 15) + +#endif diff --git a/vmm/intel/vmx_cpufunc.h b/vmm/intel/vmx_cpufunc.h new file mode 100644 index 0000000..2e66443 --- /dev/null +++ b/vmm/intel/vmx_cpufunc.h @@ -0,0 +1,218 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMX_CPUFUNC_H_ +#define _VMX_CPUFUNC_H_ + +struct vmcs; + +/* + * Section 5.2 "Conventions" from Intel Architecture Manual 2B. + * + * error + * VMsucceed 0 + * VMFailInvalid 1 + * VMFailValid 2 see also VMCS VM-Instruction Error Field + */ +#define VM_SUCCESS 0 +#define VM_FAIL_INVALID 1 +#define VM_FAIL_VALID 2 +#define VMX_SET_ERROR_CODE \ + " jnc 1f;" \ + " mov $1, %[error];" /* CF: error = 1 */ \ + " jmp 3f;" \ + "1: jnz 2f;" \ + " mov $2, %[error];" /* ZF: error = 2 */ \ + " jmp 3f;" \ + "2: mov $0, %[error];" \ + "3:" + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmxon(char *region) +{ + int error; + uint64_t addr; + + addr = vtophys(region); + __asm __volatile("vmxon %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + + return (error); +} + +/* returns 0 on success and non-zero on failure */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr; + + addr = vtophys(vmcs); + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void 
+vmxoff(void)
+{
+
+	__asm __volatile("vmxoff");
+}
+
+/*
+ * Execute VMPTRST with '*addr' as its memory operand.
+ * NOTE(review): per the Intel SDM this stores the current-VMCS pointer
+ * into '*addr'; no error status is captured here.
+ */
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+	__asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+/*
+ * Execute VMPTRLD on the physical address of 'vmcs' (from vtophys()).
+ * Returns the status produced by VMX_SET_ERROR_CODE: 0 on success and
+ * non-zero on failure.
+ */
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+	int error;
+	uint64_t addr;
+
+	addr = vtophys(vmcs);
+	__asm __volatile("vmptrld %[addr];"
+			 VMX_SET_ERROR_CODE
+			 : [error] "=r" (error)
+			 : [addr] "m" (*(uint64_t *)&addr)
+			 : "memory");
+	return (error);
+}
+
+/*
+ * Execute VMWRITE of 'val' to the field encoding 'reg'.
+ * Returns the status produced by VMX_SET_ERROR_CODE (0 on success).
+ */
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+	int error;
+
+	__asm __volatile("vmwrite %[val], %[reg];"
+			 VMX_SET_ERROR_CODE
+			 : [error] "=r" (error)
+			 : [val] "r" (val), [reg] "r" (reg)
+			 : "memory");
+
+	return (error);
+}
+
+/*
+ * Execute VMREAD of the field encoding 'r' with '*addr' as destination.
+ * Returns the status produced by VMX_SET_ERROR_CODE (0 on success).
+ * Although '*addr' is listed as an input operand, the "memory" clobber
+ * tells the compiler that memory may have been modified.
+ */
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+	int error;
+
+	__asm __volatile("vmread %[r], %[addr];"
+			 VMX_SET_ERROR_CODE
+			 : [error] "=r" (error)
+			 : [r] "r" (r), [addr] "m" (*addr)
+			 : "memory");
+
+	return (error);
+}
+
+/*
+ * vmclear() wrapper that panics on failure and then calls
+ * critical_exit(); it closes the critical section opened by VMPTRLD().
+ */
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+	int err;
+
+	err = vmclear(vmcs);
+	if (err != 0)
+		panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+	critical_exit();
+}
+
+/*
+ * vmptrld() wrapper that calls critical_enter() first and panics on
+ * failure.  The critical section stays open on return; VMCLEAR() is the
+ * matching call that leaves it.
+ */
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+	int err;
+
+	critical_enter();
+
+	err = vmptrld(vmcs);
+	if (err != 0)
+		panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+/* Values for the 'type' argument of invvpid(). */
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+/* 16-byte memory operand handed to the INVVPID instruction. */
+struct invvpid_desc {
+	uint16_t vpid;
+	uint16_t _res1;
+	uint32_t _res2;
+	uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+/*
+ * Execute INVVPID with the given 'type' and descriptor.
+ * Panics if the instruction reports an error.
+ */
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+	int error;
+
+	__asm __volatile("invvpid %[desc], %[type];"
+			 VMX_SET_ERROR_CODE
+			 : [error] "=r" (error)
+			 : [desc] "m" (desc), [type] "r" (type)
+			 : "memory");
+
+	if (error)
+		panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+	uint64_t eptp;
+	
uint64_t _res; +}; +CTASSERT(sizeof(struct invept_desc) == 16); + +static void __inline +invept(uint64_t type, struct invept_desc desc) +{ + int error; + + __asm __volatile("invept %[desc], %[type];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [desc] "m" (desc), [type] "r" (type) + : "memory"); + + if (error) + panic("invept error %d", error); +} +#endif diff --git a/vmm/intel/vmx_genassym.c b/vmm/intel/vmx_genassym.c new file mode 100644 index 0000000..e1b98d6 --- /dev/null +++ b/vmm/intel/vmx_genassym.c @@ -0,0 +1,88 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include +#include "vmx_cpufunc.h" +#include "vmx.h" + +ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi)); +ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi)); +ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx)); +ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx)); +ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8)); +ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9)); +ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax)); +ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx)); +ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp)); +ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10)); +ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11)); +ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12)); +ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13)); +ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14)); +ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15)); +ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2)); + +ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15)); +ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14)); +ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13)); +ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12)); +ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp)); +ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp)); +ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx)); + +ASSYM(VMXCTX_INST_FAIL_STATUS, offsetof(struct vmxctx, inst_fail_status)); +ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap)); + +ASSYM(VMX_EPTGEN, offsetof(struct vmx, eptgen)); +ASSYM(VMX_EPTP, offsetof(struct vmx, eptp)); + +ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID); +ASSYM(VM_FAIL_VALID, VM_FAIL_VALID); +ASSYM(VMX_GUEST_VMEXIT, VMX_GUEST_VMEXIT); +ASSYM(VMX_VMRESUME_ERROR, VMX_VMRESUME_ERROR); 
+ASSYM(VMX_VMLAUNCH_ERROR, VMX_VMLAUNCH_ERROR); +ASSYM(VMX_INVEPT_ERROR, VMX_INVEPT_ERROR); + +ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); + +ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); +ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen)); + +ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL)); +ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL)); diff --git a/vmm/intel/vmx_msr.c b/vmm/intel/vmx_msr.c new file mode 100644 index 0000000..3091f68 --- /dev/null +++ b/vmm/intel/vmx_msr.c @@ -0,0 +1,483 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include + +#include "vmx.h" +#include "vmx_msr.h" + +static boolean_t +vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) +{ + + if (msr_val & (1UL << (bitpos + 32))) + return (TRUE); + else + return (FALSE); +} + +static boolean_t +vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) +{ + + if ((msr_val & (1UL << bitpos)) == 0) + return (TRUE); + else + return (FALSE); +} + +uint32_t +vmx_revision(void) +{ + + return (rdmsr(MSR_VMX_BASIC) & 0xffffffff); +} + +/* + * Generate a bitmask to be used for the VMCS execution control fields. + * + * The caller specifies what bits should be set to one in 'ones_mask' + * and what bits should be set to zero in 'zeros_mask'. The don't-care + * bits are set to the default value. The default values are obtained + * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining + * VMX Capabilities". + * + * Returns zero on success and non-zero on error. 
+ */ +int +vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval) +{ + int i; + uint64_t val, trueval; + boolean_t true_ctls_avail, one_allowed, zero_allowed; + + /* We cannot ask the same bit to be set to both '1' and '0' */ + if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) + return (EINVAL); + + if (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) + true_ctls_avail = TRUE; + else + true_ctls_avail = FALSE; + + val = rdmsr(ctl_reg); + if (true_ctls_avail) + trueval = rdmsr(true_ctl_reg); /* step c */ + else + trueval = val; /* step a */ + + for (i = 0; i < 32; i++) { + one_allowed = vmx_ctl_allows_one_setting(trueval, i); + zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); + + KASSERT(one_allowed || zero_allowed, + ("invalid zero/one setting for bit %d of ctl 0x%0x, " + "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); + + if (zero_allowed && !one_allowed) { /* b(i),c(i) */ + if (ones_mask & (1 << i)) + return (EINVAL); + *retval &= ~(1 << i); + } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ + if (zeros_mask & (1 << i)) + return (EINVAL); + *retval |= 1 << i; + } else { + if (zeros_mask & (1 << i)) /* b(ii),c(ii) */ + *retval &= ~(1 << i); + else if (ones_mask & (1 << i)) /* b(ii), c(ii) */ + *retval |= 1 << i; + else if (!true_ctls_avail) + *retval &= ~(1 << i); /* b(iii) */ + else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/ + *retval &= ~(1 << i); + else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */ + *retval |= 1 << i; + else { + panic("vmx_set_ctlreg: unable to determine " + "correct value of ctl bit %d for msr " + "0x%0x and true msr 0x%0x", i, ctl_reg, + true_ctl_reg); + } + } + } + + return (0); +} + +void +msr_bitmap_initialize(char *bitmap) +{ + + memset(bitmap, 0xff, PAGE_SIZE); +} + +int +msr_bitmap_change_access(char *bitmap, u_int msr, int access) +{ + int byte, bit; + + if (msr <= 0x00001FFF) + byte = msr / 8; + else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) + byte = 1024 
+ (msr - 0xC0000000) / 8; + else + return (EINVAL); + + bit = msr & 0x7; + + if (access & MSR_BITMAP_ACCESS_READ) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + byte += 2048; + if (access & MSR_BITMAP_ACCESS_WRITE) + bitmap[byte] &= ~(1 << bit); + else + bitmap[byte] |= 1 << bit; + + return (0); +} + +static uint64_t misc_enable; +static uint64_t platform_info; +static uint64_t turbo_ratio_limit; +static uint64_t host_msrs[GUEST_MSR_NUM]; + +static bool +nehalem_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Nehalem microarchitecture + * are documented in Section 35.5, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x1A: + case 0x1E: + case 0x1F: + case 0x2E: + return (true); + default: + break; + } + } + return (false); +} + +static bool +westmere_cpu(void) +{ + u_int family, model; + + /* + * The family:model numbers belonging to the Westmere microarchitecture + * are documented in Section 35.6, Intel SDM dated Feb 2014. + */ + family = CPUID_TO_FAMILY(cpu_id); + model = CPUID_TO_MODEL(cpu_id); + if (family == 0x6) { + switch (model) { + case 0x25: + case 0x2C: + return (true); + default: + break; + } + } + return (false); +} + +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + +void +vmx_msr_init(void) +{ + uint64_t bus_freq, ratio; + int i; + + /* + * It is safe to cache the values of the following MSRs because + * they don't change based on curcpu, curproc or curthread. 
+ */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + + /* + * Initialize emulated MSRs + */ + misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); + /* + * Set mandatory bits + * 11: branch trace disabled + * 12: PEBS unavailable + * Clear unsupported features + * 16: SpeedStep enable + * 18: enable MONITOR FSM + */ + misc_enable |= (1 << 12) | (1 << 11); + misc_enable &= ~((1 << 18) | (1 << 16)); + + if (nehalem_cpu() || westmere_cpu()) + bus_freq = 133330000; /* 133Mhz */ + else + bus_freq = 100000000; /* 100Mhz */ + + /* + * XXXtime + * The ratio should really be based on the virtual TSC frequency as + * opposed to the host TSC. + */ + ratio = (tsc_freq / bus_freq) & 0xff; + + /* + * The register definition is based on the micro-architecture + * but the following bits are always the same: + * [15:8] Maximum Non-Turbo Ratio + * [28] Programmable Ratio Limit for Turbo Mode + * [29] Programmable TDC-TDP Limit for Turbo Mode + * [47:40] Maximum Efficiency Ratio + * + * The other bits can be safely set to 0 on all + * micro-architectures up to Haswell. + */ + platform_info = (ratio << 8) | (ratio << 40); + + /* + * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is + * dependent on the maximum cores per package supported by the micro- + * architecture. For e.g., Westmere supports 6 cores per package and + * uses the low 48 bits. Sandybridge support 8 cores per package and + * uses up all 64 bits. + * + * However, the unused bits are reserved so we pretend that all bits + * in this MSR are valid. 
+ */ + for (i = 0; i < 8; i++) + turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; +} + +void +vmx_msr_guest_init(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs; + + guest_msrs = vmx->guest_msrs[vcpuid]; + + /* + * The permissions bitmap is shared between all vcpus so initialize it + * once when initializing the vBSP. + */ + if (vcpuid == 0) { + guest_msr_rw(vmx, MSR_LSTAR); + guest_msr_rw(vmx, MSR_CSTAR); + guest_msr_rw(vmx, MSR_STAR); + guest_msr_rw(vmx, MSR_SF_MASK); + guest_msr_rw(vmx, MSR_KGSBASE); + } + + /* + * Initialize guest IA32_PAT MSR with default value after reset. + */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + return; +} + +void +vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + /* Save host MSRs (if any) and restore guest MSRs */ + wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); + wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); +} + +void +vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) +{ + uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; + + /* Save guest MSRs */ + guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); + guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); + + /* Restore host MSRs */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +int 
+vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) +{ + const uint64_t *guest_msrs; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *val = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + *val = 0; + break; + case MSR_IA32_MISC_ENABLE: + *val = misc_enable; + break; + case MSR_PLATFORM_INFO: + *val = platform_info; + break; + case MSR_TURBO_RATIO_LIMIT: + case MSR_TURBO_RATIO_LIMIT1: + *val = turbo_ratio_limit; + break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) +{ + uint64_t *guest_msrs; + uint64_t changed; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(vmx->vm, vcpuid); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + break; /* Ignore writes */ + case MSR_IA32_MISC_ENABLE: + changed = val ^ misc_enable; + /* + * If the host has disabled the NX feature then the guest + * also cannot use it. However, a Linux guest will try to + * enable the NX feature by writing to the MISC_ENABLE MSR. + * + * This can be safely ignored because the memory management + * code looks at CPUID.80000001H:EDX.NX to check if the + * functionality is actually enabled. + */ + changed &= ~(1UL << 34); + + /* + * Punt to userspace if any other bits are being modified. 
+ */ + if (changed) + error = EINVAL; + + break; + case MSR_PAT: + if (pat_valid(val)) + guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vmx->vm, vcpuid); + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/vmm/intel/vmx_msr.h b/vmm/intel/vmx_msr.h new file mode 100644 index 0000000..e77881c --- /dev/null +++ b/vmm/intel/vmx_msr.h @@ -0,0 +1,70 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMX_MSR_H_ +#define _VMX_MSR_H_ + +struct vmx; + +void vmx_msr_init(void); +void vmx_msr_guest_init(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid); +void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid); +int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu); +int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu); + +uint32_t vmx_revision(void); + +int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, + uint32_t zeros_mask, uint32_t *retval); + +/* + * According to Section 21.10.4 "Software Access to Related Structures", + * changes to data structures pointed to by the VMCS must be made only when + * there is no logical processor with a current VMCS that points to the + * data structure. + * + * This pretty much limits us to configuring the MSR bitmap before VMCS + * initialization for SMP VMs. Unless of course we do it the hard way - which + * would involve some form of synchronization between the vcpus to vmclear + * all VMCSs' that point to the bitmap. + */ +#define MSR_BITMAP_ACCESS_NONE 0x0 +#define MSR_BITMAP_ACCESS_READ 0x1 +#define MSR_BITMAP_ACCESS_WRITE 0x2 +#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE) +void msr_bitmap_initialize(char *bitmap); +int msr_bitmap_change_access(char *bitmap, u_int msr, int access); + +#define guest_msr_rw(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) + +#define guest_msr_ro(vmx, msr) \ + msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ) + +#endif diff --git a/vmm/intel/vmx_support.S b/vmm/intel/vmx_support.S new file mode 100644 index 0000000..84fb5b0 --- /dev/null +++ b/vmm/intel/vmx_support.S @@ -0,0 +1,262 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Neel Natu + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include + +#include "vmx_assym.h" + +#ifdef SMP +#define LK lock ; +#else +#define LK +#endif + +/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */ +#define VENTER push %rbp ; mov %rsp,%rbp +#define VLEAVE pop %rbp + +/* + * Assumes that %rdi holds a pointer to the 'vmxctx'. + * + * On "return" all registers are updated to reflect guest state. The two + * exceptions are %rip and %rsp. These registers are atomically switched + * by hardware from the guest area of the vmcs. + * + * We modify %rsp to point to the 'vmxctx' so we can use it to restore + * host context in case of an error with 'vmlaunch' or 'vmresume'. 
+ */ +#define VMX_GUEST_RESTORE \ + movq %rdi,%rsp; \ + movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ + movq %rsi,%cr2; \ + movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ + movq VMXCTX_GUEST_RDX(%rdi),%rdx; \ + movq VMXCTX_GUEST_RCX(%rdi),%rcx; \ + movq VMXCTX_GUEST_R8(%rdi),%r8; \ + movq VMXCTX_GUEST_R9(%rdi),%r9; \ + movq VMXCTX_GUEST_RAX(%rdi),%rax; \ + movq VMXCTX_GUEST_RBX(%rdi),%rbx; \ + movq VMXCTX_GUEST_RBP(%rdi),%rbp; \ + movq VMXCTX_GUEST_R10(%rdi),%r10; \ + movq VMXCTX_GUEST_R11(%rdi),%r11; \ + movq VMXCTX_GUEST_R12(%rdi),%r12; \ + movq VMXCTX_GUEST_R13(%rdi),%r13; \ + movq VMXCTX_GUEST_R14(%rdi),%r14; \ + movq VMXCTX_GUEST_R15(%rdi),%r15; \ + movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ + +/* + * Save and restore the host context. + * + * Assumes that %rdi holds a pointer to the 'vmxctx'. + */ +#define VMX_HOST_SAVE \ + movq %r15, VMXCTX_HOST_R15(%rdi); \ + movq %r14, VMXCTX_HOST_R14(%rdi); \ + movq %r13, VMXCTX_HOST_R13(%rdi); \ + movq %r12, VMXCTX_HOST_R12(%rdi); \ + movq %rbp, VMXCTX_HOST_RBP(%rdi); \ + movq %rsp, VMXCTX_HOST_RSP(%rdi); \ + movq %rbx, VMXCTX_HOST_RBX(%rdi); \ + +#define VMX_HOST_RESTORE \ + movq VMXCTX_HOST_R15(%rdi), %r15; \ + movq VMXCTX_HOST_R14(%rdi), %r14; \ + movq VMXCTX_HOST_R13(%rdi), %r13; \ + movq VMXCTX_HOST_R12(%rdi), %r12; \ + movq VMXCTX_HOST_RBP(%rdi), %rbp; \ + movq VMXCTX_HOST_RSP(%rdi), %rsp; \ + movq VMXCTX_HOST_RBX(%rdi), %rbx; \ + +/* + * vmx_enter_guest(struct vmxctx *vmxctx, int launched) + * %rdi: pointer to the 'vmxctx' + * %rsi: pointer to the 'vmx' + * %edx: launch state of the VMCS + * Interrupts must be disabled on entry. + */ +ENTRY(vmx_enter_guest) + VENTER + /* + * Save host state before doing anything else. + */ + VMX_HOST_SAVE + + /* + * Activate guest pmap on this cpu. + */ + movq VMXCTX_PMAP(%rdi), %r11 + movl PCPU(CPUID), %eax + LK btsl %eax, PM_ACTIVE(%r11) + + /* + * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' + * then we must invalidate all mappings associated with this EPTP. 
+ */ + movq PM_EPTGEN(%r11), %r10 + cmpq %r10, VMX_EPTGEN(%rsi, %rax, 8) + je guest_restore + + /* Refresh 'vmx->eptgen[curcpu]' */ + movq %r10, VMX_EPTGEN(%rsi, %rax, 8) + + /* Setup the invept descriptor on the host stack */ + mov %rsp, %r11 + movq VMX_EPTP(%rsi), %rax + movq %rax, -16(%r11) + movq $0x0, -8(%r11) + mov $0x1, %eax /* Single context invalidate */ + invept -16(%r11), %rax + jbe invept_error /* Check invept instruction error */ + +guest_restore: + cmpl $0, %edx + je do_launch + + VMX_GUEST_RESTORE + vmresume + /* + * In the common case 'vmresume' returns back to the host through + * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'. + * + * If there is an error we return VMX_VMRESUME_ERROR to the caller. + */ + movq %rsp, %rdi /* point %rdi back to 'vmxctx' */ + movl $VMX_VMRESUME_ERROR, %eax + jmp decode_inst_error + +do_launch: + VMX_GUEST_RESTORE + vmlaunch + /* + * In the common case 'vmlaunch' returns back to the host through + * 'vmx_exit_guest' with %rsp pointing to 'vmxctx'. + * + * If there is an error we return VMX_VMLAUNCH_ERROR to the caller. + */ + movq %rsp, %rdi /* point %rdi back to 'vmxctx' */ + movl $VMX_VMLAUNCH_ERROR, %eax + jmp decode_inst_error + +invept_error: + movl $VMX_INVEPT_ERROR, %eax + jmp decode_inst_error + +decode_inst_error: + movl $VM_FAIL_VALID, %r11d + jz inst_error + movl $VM_FAIL_INVALID, %r11d +inst_error: + movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) + + /* + * The return value is already populated in %eax so we cannot use + * it as a scratch register beyond this point. + */ + + /* + * Deactivate guest pmap from this cpu. + */ + movq VMXCTX_PMAP(%rdi), %r11 + movl PCPU(CPUID), %r10d + LK btrl %r10d, PM_ACTIVE(%r11) + + VMX_HOST_RESTORE + VLEAVE + ret + +/* + * Non-error VM-exit from the guest. Make this a label so it can + * be used by C code when setting up the VMCS. 
+ * The VMCS-restored %rsp points to the struct vmxctx + */ + ALIGN_TEXT + .globl vmx_exit_guest +vmx_exit_guest: + /* + * Save guest state that is not automatically saved in the vmcs. + */ + movq %rdi,VMXCTX_GUEST_RDI(%rsp) + movq %rsi,VMXCTX_GUEST_RSI(%rsp) + movq %rdx,VMXCTX_GUEST_RDX(%rsp) + movq %rcx,VMXCTX_GUEST_RCX(%rsp) + movq %r8,VMXCTX_GUEST_R8(%rsp) + movq %r9,VMXCTX_GUEST_R9(%rsp) + movq %rax,VMXCTX_GUEST_RAX(%rsp) + movq %rbx,VMXCTX_GUEST_RBX(%rsp) + movq %rbp,VMXCTX_GUEST_RBP(%rsp) + movq %r10,VMXCTX_GUEST_R10(%rsp) + movq %r11,VMXCTX_GUEST_R11(%rsp) + movq %r12,VMXCTX_GUEST_R12(%rsp) + movq %r13,VMXCTX_GUEST_R13(%rsp) + movq %r14,VMXCTX_GUEST_R14(%rsp) + movq %r15,VMXCTX_GUEST_R15(%rsp) + + movq %cr2,%rdi + movq %rdi,VMXCTX_GUEST_CR2(%rsp) + + movq %rsp,%rdi + + /* + * Deactivate guest pmap from this cpu. + */ + movq VMXCTX_PMAP(%rdi), %r11 + movl PCPU(CPUID), %r10d + LK btrl %r10d, PM_ACTIVE(%r11) + + VMX_HOST_RESTORE + + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + VLEAVE + ret +END(vmx_enter_guest) + +/* + * %rdi = interrupt handler entry point + * + * Calling sequence described in the "Instruction Set Reference" for the "INT" + * instruction in Intel SDM, Vol 2. + */ +ENTRY(vmx_call_isr) + VENTER + mov %rsp, %r11 /* save %rsp */ + and $~0xf, %rsp /* align on 16-byte boundary */ + pushq $KERNEL_SS /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KERNEL_CS /* %cs */ + cli /* disable interrupts */ + callq *%rdi /* push %rip and call isr */ + VLEAVE + ret +END(vmx_call_isr) diff --git a/vmm/intel/vtd.c b/vmm/intel/vtd.c new file mode 100644 index 0000000..be57aff --- /dev/null +++ b/vmm/intel/vtd.c @@ -0,0 +1,688 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. 
+ */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1U << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1U << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + 
vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void +vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | 
VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, units, remaining; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + char *end, envname[32]; + unsigned long mapaddr; + ACPI_STATUS status; + ACPI_TABLE_DMAR *dmar; + ACPI_DMAR_HEADER *hdr; + ACPI_DMAR_HARDWARE_UNIT *drhd; + + /* + * Allow the user to override the ACPI DMAR table by specifying the + * physical address of each remapping unit. + * + * The following example specifies two remapping units at + * physical addresses 0xfed90000 and 0xfeda0000 respectively. + * set vtd.regmap.0.addr=0xfed90000 + * set vtd.regmap.1.addr=0xfeda0000 + */ + for (units = 0; units < DRHD_MAX_UNITS; units++) { + snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); + if (getenv_ulong(envname, &mapaddr) == 0) + break; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); + } + + if (units > 0) + goto skip_dmar; + + /* Search for DMAR table. */ + status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); + if (ACPI_FAILURE(status)) + return (ENXIO); + + end = (char *)dmar + dmar->Header.Length; + remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); + while (remaining > sizeof(ACPI_DMAR_HEADER)) { + hdr = (ACPI_DMAR_HEADER *)(end - remaining); + if (hdr->Length > remaining) + break; + /* + * From Intel VT-d arch spec, version 1.3: + * BIOS implementations must report mapping structures + * in numerical order, i.e. All remapping structures of + * type 0 (DRHD) enumerated before remapping structures of + * type 1 (RMRR) and so forth. 
+ */ + if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) + break; + + drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; + vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); + if (units >= DRHD_MAX_UNITS) + break; + remaining -= hdr->Length; + } + + if (units <= 0) + return (ENXIO); + +skip_dmar: + drhd_num = units; + vtdmap = vtdmaps[0]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + max_domains = vtd_max_domains(vtdmap); + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); +} + +static void +vtd_cleanup(void) +{ +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, uint16_t rid) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + uint8_t bus; + + vtdmap = vtdmaps[0]; + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = VTD_RID2IDX(rid); + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %x is already owned by " + "domain %d", rid, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + /* + * Order is important. 
The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, uint16_t rid) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + uint8_t bus; + + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + idx = VTD_RID2IDX(rid); + + /* + * Order is important. The 'present' bit is must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB. + * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + +static uint64_t +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " + "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accomodate. 
+ * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Update the 'gpa' -> 'hpa' mapping + */ + if (remove) { + ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } + + return (1UL << ptpshift); +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. 
+ * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. + */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. + */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + +#ifdef notyet + /* + * XXX superpage mappings for the iommu do not work correctly. + * + * By default all physical memory is mapped into the host_domain. + * When a VM is allocated wired memory the pages belonging to it + * are removed from the host_domain and added to the vm's domain. 
+ * + * If the page being removed was mapped using a superpage mapping + * in the host_domain then we need to demote the mapping before + * removing the page. + * + * There is not any code to deal with the demotion at the moment + * so we disable superpage mappings altogether. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_remove_mapping, + vtd_add_device, + vtd_remove_device, + vtd_invalidate_tlb, +}; diff --git a/vmm/io/iommu.c b/vmm/io/iommu.c new file mode 100644 index 0000000..9cfc4c2 --- /dev/null +++ b/vmm/io/iommu.c @@ -0,0 +1,285 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_util.h" +#include "vmm_mem.h" +#include "iommu.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW, 0, "bhyve iommu parameters"); + +static int iommu_avail; +SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail, + 0, "bhyve iommu initialized?"); + +static struct iommu_ops *ops; +static void *host_domain; + +static __inline int +IOMMU_INIT(void) +{ + if (ops != NULL) + return ((*ops->init)()); + else + return (ENXIO); +} + +static __inline void +IOMMU_CLEANUP(void) +{ + if (ops != NULL && iommu_avail) + (*ops->cleanup)(); +} + +static __inline void * +IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->create_domain)(maxaddr)); + else + return (NULL); +} + +static __inline void +IOMMU_DESTROY_DOMAIN(void *dom) +{ + + if (ops != NULL && iommu_avail) + (*ops->destroy_domain)(dom); +} + +static __inline uint64_t +IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return 
((*ops->create_mapping)(domain, gpa, hpa, len)); + else + return (len); /* XXX */ +} + +static __inline uint64_t +IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len) +{ + + if (ops != NULL && iommu_avail) + return ((*ops->remove_mapping)(domain, gpa, len)); + else + return (len); /* XXX */ +} + +static __inline void +IOMMU_ADD_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->add_device)(domain, rid); +} + +static __inline void +IOMMU_REMOVE_DEVICE(void *domain, uint16_t rid) +{ + + if (ops != NULL && iommu_avail) + (*ops->remove_device)(domain, rid); +} + +static __inline void +IOMMU_INVALIDATE_TLB(void *domain) +{ + + if (ops != NULL && iommu_avail) + (*ops->invalidate_tlb)(domain); +} + +static __inline void +IOMMU_ENABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->enable)(); +} + +static __inline void +IOMMU_DISABLE(void) +{ + + if (ops != NULL && iommu_avail) + (*ops->disable)(); +} + +void +iommu_init(void) +{ + int error, bus, slot, func; + vm_paddr_t maxaddr; + const char *name; + device_t dev; + + if (vmm_is_intel()) + ops = &iommu_ops_intel; + else if (vmm_is_amd()) + ops = &iommu_ops_amd; + else + ops = NULL; + + error = IOMMU_INIT(); + if (error) + return; + + iommu_avail = 1; + + /* + * Create a domain for the devices owned by the host + */ + maxaddr = vmm_mem_maxaddr(); + host_domain = IOMMU_CREATE_DOMAIN(maxaddr); + if (host_domain == NULL) + panic("iommu_init: unable to create a host domain"); + + /* + * Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to + * the host + */ + iommu_create_mapping(host_domain, 0, 0, maxaddr); + + for (bus = 0; bus <= PCI_BUSMAX; bus++) { + for (slot = 0; slot <= PCI_SLOTMAX; slot++) { + for (func = 0; func <= PCI_FUNCMAX; func++) { + dev = pci_find_dbsf(0, bus, slot, func); + if (dev == NULL) + continue; + + /* skip passthrough devices */ + name = device_get_name(dev); + if (name != NULL && strcmp(name, "ppt") == 0) + continue; + + /* everything 
else belongs to the host domain */ + iommu_add_device(host_domain, + pci_get_rid(dev)); + } + } + } + IOMMU_ENABLE(); + +} + +void +iommu_cleanup(void) +{ + IOMMU_DISABLE(); + IOMMU_DESTROY_DOMAIN(host_domain); + IOMMU_CLEANUP(); +} + +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + + return (IOMMU_CREATE_DOMAIN(maxaddr)); +} + +void +iommu_destroy_domain(void *dom) +{ + + IOMMU_DESTROY_DOMAIN(dom); +} + +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + uint64_t mapped, remaining; + + remaining = len; + + while (remaining > 0) { + mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining); + gpa += mapped; + hpa += mapped; + remaining -= mapped; + } +} + +void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + uint64_t unmapped, remaining; + + remaining = len; + + while (remaining > 0) { + unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining); + gpa += unmapped; + remaining -= unmapped; + } +} + +void * +iommu_host_domain(void) +{ + + return (host_domain); +} + +void +iommu_add_device(void *dom, uint16_t rid) +{ + + IOMMU_ADD_DEVICE(dom, rid); +} + +void +iommu_remove_device(void *dom, uint16_t rid) +{ + + IOMMU_REMOVE_DEVICE(dom, rid); +} + +void +iommu_invalidate_tlb(void *domain) +{ + + IOMMU_INVALIDATE_TLB(domain); +} diff --git a/vmm/io/iommu.h b/vmm/io/iommu.h new file mode 100644 index 0000000..36b44fa --- /dev/null +++ b/vmm/io/iommu.h @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, uint16_t rid); +typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid); +typedef void (*iommu_invalidate_tlb_t)(void *dom); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + 
iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_init(void); +void iommu_cleanup(void); +void *iommu_host_domain(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +void iommu_add_device(void *dom, uint16_t rid); +void iommu_remove_device(void *dom, uint16_t rid); +void iommu_invalidate_tlb(void *domain); +#endif diff --git a/vmm/io/ppt.c b/vmm/io/ppt.c new file mode 100644 index 0000000..b789f77 --- /dev/null +++ b/vmm/io/ppt.c @@ -0,0 +1,651 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "vmm_lapic.h" +#include "vmm_ktr.h" + +#include "iommu.h" +#include "ppt.h" + +/* XXX locking */ + +#define MAX_MSIMSGS 32 + +/* + * If the MSI-X table is located in the middle of a BAR then that MMIO + * region gets split into two segments - one segment above the MSI-X table + * and the other segment below the MSI-X table - with a hole in place of + * the MSI-X table so accesses to it can be trapped and emulated. + * + * So, allocate a MMIO segment for each BAR register + 1 additional segment. 
+ */ +#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1) + +MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources"); + +struct pptintr_arg { /* pptintr(pptintr_arg) */ + struct pptdev *pptdev; + uint64_t addr; + uint64_t msg_data; +}; + +struct pptdev { + device_t dev; + struct vm *vm; /* owner of this device */ + TAILQ_ENTRY(pptdev) next; + struct vm_memory_segment mmio[MAX_MMIOSEGS]; + struct { + int num_msgs; /* guest state */ + + int startrid; /* host state */ + struct resource *res[MAX_MSIMSGS]; + void *cookie[MAX_MSIMSGS]; + struct pptintr_arg arg[MAX_MSIMSGS]; + } msi; + + struct { + int num_msgs; + int startrid; + int msix_table_rid; + struct resource *msix_table_res; + struct resource **res; + void **cookie; + struct pptintr_arg *arg; + } msix; +}; + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices"); + +static int num_pptdevs; +SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0, + "number of pci passthru devices"); + +static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list); + +static int +ppt_probe(device_t dev) +{ + int bus, slot, func; + struct pci_devinfo *dinfo; + + dinfo = (struct pci_devinfo *)device_get_ivars(dev); + + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + + /* + * To qualify as a pci passthrough device a device must: + * - be allowed by administrator to be used in this role + * - be an endpoint device + */ + if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL) + return (ENXIO); + else if (vmm_is_pptdev(bus, slot, func)) + return (0); + else + /* + * Returning BUS_PROBE_NOWILDCARD here matches devices that the + * SR-IOV infrastructure specified as "ppt" passthrough devices. + * All normal devices that did not have "ppt" specified as their + * driver will not be matched by this. 
+ */ + return (BUS_PROBE_NOWILDCARD); +} + +static int +ppt_attach(device_t dev) +{ + struct pptdev *ppt; + + ppt = device_get_softc(dev); + + num_pptdevs++; + TAILQ_INSERT_TAIL(&pptdev_list, ppt, next); + ppt->dev = dev; + + if (bootverbose) + device_printf(dev, "attached\n"); + + return (0); +} + +static int +ppt_detach(device_t dev) +{ + struct pptdev *ppt; + + ppt = device_get_softc(dev); + + if (ppt->vm != NULL) + return (EBUSY); + num_pptdevs--; + TAILQ_REMOVE(&pptdev_list, ppt, next); + + return (0); +} + +static device_method_t ppt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, ppt_probe), + DEVMETHOD(device_attach, ppt_attach), + DEVMETHOD(device_detach, ppt_detach), + {0, 0} +}; + +static devclass_t ppt_devclass; +DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev)); +DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL); + +static struct pptdev * +ppt_find(int bus, int slot, int func) +{ + device_t dev; + struct pptdev *ppt; + int b, s, f; + + TAILQ_FOREACH(ppt, &pptdev_list, next) { + dev = ppt->dev; + b = pci_get_bus(dev); + s = pci_get_slot(dev); + f = pci_get_function(dev); + if (bus == b && slot == s && func == f) + return (ppt); + } + return (NULL); +} + +static void +ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt) +{ + int i; + struct vm_memory_segment *seg; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + (void)vm_unmap_mmio(vm, seg->gpa, seg->len); + bzero(seg, sizeof(struct vm_memory_segment)); + } +} + +static void +ppt_teardown_msi(struct pptdev *ppt) +{ + int i, rid; + void *cookie; + struct resource *res; + + if (ppt->msi.num_msgs == 0) + return; + + for (i = 0; i < ppt->msi.num_msgs; i++) { + rid = ppt->msi.startrid + i; + res = ppt->msi.res[i]; + cookie = ppt->msi.cookie[i]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msi.res[i] = NULL; + 
ppt->msi.cookie[i] = NULL; + } + + if (ppt->msi.startrid == 1) + pci_release_msi(ppt->dev); + + ppt->msi.num_msgs = 0; +} + +static void +ppt_teardown_msix_intr(struct pptdev *ppt, int idx) +{ + int rid; + struct resource *res; + void *cookie; + + rid = ppt->msix.startrid + idx; + res = ppt->msix.res[idx]; + cookie = ppt->msix.cookie[idx]; + + if (cookie != NULL) + bus_teardown_intr(ppt->dev, res, cookie); + + if (res != NULL) + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res); + + ppt->msix.res[idx] = NULL; + ppt->msix.cookie[idx] = NULL; +} + +static void +ppt_teardown_msix(struct pptdev *ppt) +{ + int i; + + if (ppt->msix.num_msgs == 0) + return; + + for (i = 0; i < ppt->msix.num_msgs; i++) + ppt_teardown_msix_intr(ppt, i); + + if (ppt->msix.msix_table_res) { + bus_release_resource(ppt->dev, SYS_RES_MEMORY, + ppt->msix.msix_table_rid, + ppt->msix.msix_table_res); + ppt->msix.msix_table_res = NULL; + ppt->msix.msix_table_rid = 0; + } + + free(ppt->msix.res, M_PPTMSIX); + free(ppt->msix.cookie, M_PPTMSIX); + free(ppt->msix.arg, M_PPTMSIX); + + pci_release_msi(ppt->dev); + + ppt->msix.num_msgs = 0; +} + +int +ppt_avail_devices(void) +{ + + return (num_pptdevs); +} + +int +ppt_assigned_devices(struct vm *vm) +{ + struct pptdev *ppt; + int num; + + num = 0; + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) + num++; + } + return (num); +} + +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + int i; + struct pptdev *ppt; + struct vm_memory_segment *seg; + + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm != vm) + continue; + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) + continue; + if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) + return (TRUE); + } + } + + return (FALSE); +} + +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is owned by a different VM then we + * cannot 
change its owner. + */ + if (ppt->vm != NULL && ppt->vm != vm) + return (EBUSY); + + ppt->vm = vm; + iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + /* + * If this device is not owned by this 'vm' then bail out. + */ + if (ppt->vm != vm) + return (EBUSY); + ppt_unmap_mmio(vm, ppt); + ppt_teardown_msi(ppt); + ppt_teardown_msix(ppt); + iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev)); + ppt->vm = NULL; + return (0); + } + return (ENOENT); +} + +int +ppt_unassign_all(struct vm *vm) +{ + struct pptdev *ppt; + int bus, slot, func; + device_t dev; + + TAILQ_FOREACH(ppt, &pptdev_list, next) { + if (ppt->vm == vm) { + dev = ppt->dev; + bus = pci_get_bus(dev); + slot = pci_get_slot(dev); + func = pci_get_function(dev); + vm_unassign_pptdev(vm, bus, slot, func); + } + } + + return (0); +} + +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + int i, error; + struct vm_memory_segment *seg; + struct pptdev *ppt; + + ppt = ppt_find(bus, slot, func); + if (ppt != NULL) { + if (ppt->vm != vm) + return (EBUSY); + + for (i = 0; i < MAX_MMIOSEGS; i++) { + seg = &ppt->mmio[i]; + if (seg->len == 0) { + error = vm_map_mmio(vm, gpa, len, hpa); + if (error == 0) { + seg->gpa = gpa; + seg->len = len; + } + return (error); + } + } + return (ENOSPC); + } + return (ENOENT); +} + +static int +pptintr(void *arg) +{ + struct pptdev *ppt; + struct pptintr_arg *pptarg; + + pptarg = arg; + ppt = pptarg->pptdev; + + if (ppt->vm != NULL) + lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data); + else { + /* + * XXX + * This is not expected to happen - panic? + */ + } + + /* + * For legacy interrupts give other filters a chance in case + * the interrupt was not generated by the passthrough device. 
+ */ + if (ppt->msi.startrid == 0) + return (FILTER_STRAY); + else + return (FILTER_HANDLED); +} + +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + int i, rid, flags; + int msi_count, startrid, error, tmp; + struct pptdev *ppt; + + if (numvec < 0 || numvec > MAX_MSIMSGS) + return (EINVAL); + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + /* Free any allocated resources */ + ppt_teardown_msi(ppt); + + if (numvec == 0) /* nothing more to do */ + return (0); + + flags = RF_ACTIVE; + msi_count = pci_msi_count(ppt->dev); + if (msi_count == 0) { + startrid = 0; /* legacy interrupt */ + msi_count = 1; + flags |= RF_SHAREABLE; + } else + startrid = 1; /* MSI */ + + /* + * The device must be capable of supporting the number of vectors + * the guest wants to allocate. + */ + if (numvec > msi_count) + return (EINVAL); + + /* + * Make sure that we can allocate all the MSI vectors that are needed + * by the guest. + */ + if (startrid == 1) { + tmp = numvec; + error = pci_alloc_msi(ppt->dev, &tmp); + if (error) + return (error); + else if (tmp != numvec) { + pci_release_msi(ppt->dev); + return (ENOSPC); + } else { + /* success */ + } + } + + ppt->msi.startrid = startrid; + + /* + * Allocate the irq resource and attach it to the interrupt handler. 
+ */ + for (i = 0; i < numvec; i++) { + ppt->msi.num_msgs = i + 1; + ppt->msi.cookie[i] = NULL; + + rid = startrid + i; + ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, flags); + if (ppt->msi.res[i] == NULL) + break; + + ppt->msi.arg[i].pptdev = ppt; + ppt->msi.arg[i].addr = addr; + ppt->msi.arg[i].msg_data = msg + i; + + error = bus_setup_intr(ppt->dev, ppt->msi.res[i], + INTR_TYPE_NET | INTR_MPSAFE, + pptintr, NULL, &ppt->msi.arg[i], + &ppt->msi.cookie[i]); + if (error != 0) + break; + } + + if (i < numvec) { + ppt_teardown_msi(ppt); + return (ENXIO); + } + + return (0); +} + +int +ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + struct pptdev *ppt; + struct pci_devinfo *dinfo; + int numvec, alloced, rid, error; + size_t res_size, cookie_size, arg_size; + + ppt = ppt_find(bus, slot, func); + if (ppt == NULL) + return (ENOENT); + if (ppt->vm != vm) /* Make sure we own this device */ + return (EBUSY); + + dinfo = device_get_ivars(ppt->dev); + if (!dinfo) + return (ENXIO); + + /* + * First-time configuration: + * Allocate the MSI-X table + * Allocate the IRQ resources + * Set up some variables in ppt->msix + */ + if (ppt->msix.num_msgs == 0) { + numvec = pci_msix_count(ppt->dev); + if (numvec <= 0) + return (EINVAL); + + ppt->msix.startrid = 1; + ppt->msix.num_msgs = numvec; + + res_size = numvec * sizeof(ppt->msix.res[0]); + cookie_size = numvec * sizeof(ppt->msix.cookie[0]); + arg_size = numvec * sizeof(ppt->msix.arg[0]); + + ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO); + ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX, + M_WAITOK | M_ZERO); + ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO); + + rid = dinfo->cfg.msix.msix_table_bar; + ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev, + SYS_RES_MEMORY, &rid, RF_ACTIVE); + + if (ppt->msix.msix_table_res == NULL) { + ppt_teardown_msix(ppt); + return (ENOSPC); 
+ } + ppt->msix.msix_table_rid = rid; + + alloced = numvec; + error = pci_alloc_msix(ppt->dev, &alloced); + if (error || alloced != numvec) { + ppt_teardown_msix(ppt); + return (error == 0 ? ENOSPC: error); + } + } + + if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + /* Tear down the IRQ if it's already set up */ + ppt_teardown_msix_intr(ppt, idx); + + /* Allocate the IRQ resource */ + ppt->msix.cookie[idx] = NULL; + rid = ppt->msix.startrid + idx; + ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (ppt->msix.res[idx] == NULL) + return (ENXIO); + + ppt->msix.arg[idx].pptdev = ppt; + ppt->msix.arg[idx].addr = addr; + ppt->msix.arg[idx].msg_data = msg; + + /* Setup the MSI-X interrupt */ + error = bus_setup_intr(ppt->dev, ppt->msix.res[idx], + INTR_TYPE_NET | INTR_MPSAFE, + pptintr, NULL, &ppt->msix.arg[idx], + &ppt->msix.cookie[idx]); + + if (error != 0) { + bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]); + bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]); + ppt->msix.cookie[idx] = NULL; + ppt->msix.res[idx] = NULL; + return (ENXIO); + } + } else { + /* Masked, tear it down if it's already been set up */ + ppt_teardown_msix_intr(ppt, idx); + } + + return (0); +} diff --git a/vmm/io/ppt.h b/vmm/io/ppt.h new file mode 100644 index 0000000..8078896 --- /dev/null +++ b/vmm/io/ppt.h @@ -0,0 +1,54 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int ppt_assigned_devices(struct vm *vm); +boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); + +/* + * Returns the number of devices sequestered by the ppt driver for assignment + * to virtual machines. + */ +int ppt_avail_devices(void); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. + */ +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +#endif diff --git a/vmm/io/vatpic.c b/vmm/io/vatpic.c new file mode 100644 index 0000000..6e94f5b --- /dev/null +++ b/vmm/io/vatpic.c @@ -0,0 +1,808 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vioapic.h" +#include "vatpic.h" + +static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)"); + +#define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx)) +#define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx)) +#define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx)) + +enum irqstate { + IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +struct atpic { + bool ready; /* set when ICW4 is accepted, cleared again by ICW1 */ + int icw_num; /* next ICW expected on the IMR port; 0 when no init sequence is in progress */ + int rd_cmd_reg; /* OCW3_RIS: command port reads return the ISR, otherwise the IRR */ + + bool aeoi; /* automatic EOI, requested through ICW4_AEOI */ + bool poll; /* OCW3 poll command outstanding; consumed by the next read */ + bool rotate; /* rotate priorities (OCW2_R) */ + bool sfn; /* special fully-nested mode */ + + int irq_base; /* vector base from ICW2 (low three bits clear) */ + uint8_t request; /* Interrupt Request Register (IRR) */ + uint8_t service; /* In-Service Register (ISR) */ + uint8_t mask; /* Interrupt Mask Register (IMR) */ + uint8_t smm; /* special mask mode */ + + int acnt[8]; /* sum of pin asserts and deasserts */ + int lowprio; /* lowest priority irq */ + + bool intr_raised; /* interrupt signalled upstream and not yet accepted */ +}; + +struct vatpic { + struct vm *vm; + struct mtx mtx; + struct atpic atpic[2]; /* [0] is the master, [1] is the slave */ + uint8_t elc[2]; /* edge/level control; a set bit marks the pin level-triggered */ +}; + +#define VATPIC_CTR0(vatpic, fmt) \ + VM_CTR0((vatpic)->vm, fmt) + +#define VATPIC_CTR1(vatpic, fmt, a1) \ + VM_CTR1((vatpic)->vm, fmt, a1) + +#define VATPIC_CTR2(vatpic, fmt, a1, a2) \ + VM_CTR2((vatpic)->vm, fmt, a1, a2) + +#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \ + VM_CTR3((vatpic)->vm, fmt, a1, a2, a3) + +#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4) + +/* + * Loop over all the pins in priority order from highest to lowest.
+ */ +#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \ + for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \ + tmpvar < 8; \ + tmpvar++, pinvar = (pinvar + 1) & 0x7) + +static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate); + +static __inline bool +master_atpic(struct vatpic *vatpic, struct atpic *atpic) +{ + + if (atpic == &vatpic->atpic[0]) + return (true); + else + return (false); +} + +static __inline int +vatpic_get_highest_isrpin(struct atpic *atpic) +{ + int bit, pin; + int i; + + ATPIC_PIN_FOREACH(pin, atpic, i) { + bit = (1 << pin); + + if (atpic->service & bit) { + /* + * An IS bit that is masked by an IMR bit will not be + * cleared by a non-specific EOI in Special Mask Mode. + */ + if (atpic->smm && (atpic->mask & bit) != 0) + continue; + else + return (pin); + } + } + + return (-1); +} + +static __inline int +vatpic_get_highest_irrpin(struct atpic *atpic) +{ + int serviced; + int bit, pin, tmp; + + /* + * In 'Special Fully-Nested Mode' when an interrupt request from + * a slave is in service, the slave is not locked out from the + * master's priority logic. + */ + serviced = atpic->service; + if (atpic->sfn) + serviced &= ~(1 << 2); + + /* + * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits + * further interrupts at that level and enables interrupts from all + * other levels that are not masked. In other words the ISR has no + * bearing on the levels that can generate interrupts. + */ + if (atpic->smm) + serviced = 0; + + ATPIC_PIN_FOREACH(pin, atpic, tmp) { + bit = 1 << pin; + + /* + * If there is already an interrupt in service at the same + * or higher priority then bail. + */ + if ((serviced & bit) != 0) + break; + + /* + * If an interrupt is asserted and not masked then return + * the corresponding 'pin' to the caller. 
+ */ + if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0) + return (pin); + } + + return (-1); +} + +static void +vatpic_notify_intr(struct vatpic *vatpic) +{ + struct atpic *atpic; + int pin; + + KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked")); + + /* + * First check the slave. + */ + atpic = &vatpic->atpic[1]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic slave notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * Cascade the request from the slave to the master. + */ + atpic->intr_raised = true; + vatpic_set_pinstate(vatpic, 2, true); + vatpic_set_pinstate(vatpic, 2, false); + } else { + VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } + + /* + * Then check the master. + */ + atpic = &vatpic->atpic[0]; + if (!atpic->intr_raised && + (pin = vatpic_get_highest_irrpin(atpic)) != -1) { + VATPIC_CTR4(vatpic, "atpic master notify pin = %d " + "(imr 0x%x irr 0x%x isr 0x%x)", pin, + atpic->mask, atpic->request, atpic->service); + + /* + * From Section 3.6.2, "Interrupt Modes", in the + * MPtable Specification, Version 1.4 + * + * PIC interrupts are routed to both the Local APIC + * and the I/O APIC to support operation in 1 of 3 + * modes. + * + * 1. Legacy PIC Mode: the PIC effectively bypasses + * all APIC components. In this mode the local APIC is + * disabled and LINT0 is reconfigured as INTR to + * deliver the PIC interrupt directly to the CPU. + * + * 2. Virtual Wire Mode: the APIC is treated as a + * virtual wire which delivers interrupts from the PIC + * to the CPU. In this mode LINT0 is programmed as + * ExtINT to indicate that the PIC is the source of + * the interrupt. + * + * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are + * fielded by the I/O APIC and delivered to the appropriate + * CPU. 
In this mode the I/O APIC input 0 is programmed + * as ExtINT to indicate that the PIC is the source of the + * interrupt. + */ + atpic->intr_raised = true; + lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0); + vioapic_pulse_irq(vatpic->vm, 0); + } else { + VATPIC_CTR3(vatpic, "atpic master no eligible interrupts " + "(imr 0x%x irr 0x%x isr 0x%x)", + atpic->mask, atpic->request, atpic->service); + } +} + +static int +vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val); + + atpic->ready = false; + + atpic->icw_num = 1; + atpic->request = 0; + atpic->mask = 0; + atpic->lowprio = 7; + atpic->rd_cmd_reg = 0; + atpic->poll = 0; + atpic->smm = 0; + + if ((val & ICW1_SNGL) != 0) { + VATPIC_CTR0(vatpic, "vatpic cascade mode required"); + return (-1); + } + + if ((val & ICW1_IC4) == 0) { + VATPIC_CTR0(vatpic, "vatpic icw4 required"); + return (-1); + } + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val); + + atpic->irq_base = val & 0xf8; + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val); + + atpic->icw_num++; + + return (0); +} + +static int +vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val); + + if ((val & ICW4_8086) == 0) { + VATPIC_CTR0(vatpic, "vatpic microprocessor mode required"); + return (-1); /* ICW4 rejected; 'ready' is left false */ + } + + atpic->aeoi = ((val & ICW4_AEOI) != 0); /* ICW4 rewrites the mode, so a clear bit must turn AEOI off */ + atpic->sfn = false; /* likewise for SFNM; re-enabled below on the master only */ + + if ((val & ICW4_SFNM) != 0) { + if (master_atpic(vatpic, atpic)) { + atpic->sfn = true; + } else { + VATPIC_CTR1(vatpic, "Ignoring special fully nested " + "mode on slave atpic: %#x", val); + } + } + + atpic->icw_num = 0; /* initialization sequence finished */ + atpic->ready = true; + + return (0); +} + +static int +vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, 
uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val); + + atpic->mask = val & 0xff; + + return (0); +} + +static int +vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val); + + atpic->rotate = ((val & OCW2_R) != 0); + + if ((val & OCW2_EOI) != 0) { + int isr_bit; + + if ((val & OCW2_SL) != 0) { + /* specific EOI */ + isr_bit = val & 0x7; + } else { + /* non-specific EOI */ + isr_bit = vatpic_get_highest_isrpin(atpic); + } + + if (isr_bit != -1) { + atpic->service &= ~(1 << isr_bit); + + if (atpic->rotate) + atpic->lowprio = isr_bit; + } + } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) { + /* specific priority */ + atpic->lowprio = val & 0x7; + } + + return (0); +} + +static int +vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val) +{ + VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val); + + if (val & OCW3_ESMM) { + atpic->smm = val & OCW3_SMM ? 1 : 0; + VATPIC_CTR2(vatpic, "%s atpic special mask mode %s", + master_atpic(vatpic, atpic) ? "master" : "slave", + atpic->smm ? 
"enabled" : "disabled"); + } + + if (val & OCW3_RR) { + /* read register command */ + atpic->rd_cmd_reg = val & OCW3_RIS; + + /* Polling mode */ + atpic->poll = ((val & OCW3_P) != 0); + } + + return (0); +} + +static void +vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate) +{ + struct atpic *atpic; + int oldcnt, newcnt; + bool level; + + KASSERT(pin >= 0 && pin < 16, + ("vatpic_set_pinstate: invalid pin number %d", pin)); + KASSERT(VATPIC_LOCKED(vatpic), + ("vatpic_set_pinstate: vatpic is not locked")); + + atpic = &vatpic->atpic[pin >> 3]; + + oldcnt = atpic->acnt[pin & 0x7]; + if (newstate) + atpic->acnt[pin & 0x7]++; + else + atpic->acnt[pin & 0x7]--; + newcnt = atpic->acnt[pin & 0x7]; + + if (newcnt < 0) { + VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt); + } + + level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0); + + if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) { + /* rising edge or level */ + VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin); + atpic->request |= (1 << (pin & 0x7)); + } else if (oldcnt == 1 && newcnt == 0) { + /* falling edge */ + VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin); + if (level) + atpic->request &= ~(1 << (pin & 0x7)); + } else { + VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d", + pin, newstate ? 
"asserted" : "deasserted", newcnt); + } + + vatpic_notify_intr(vatpic); +} + +static int +vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[irq >> 3]; + + if (atpic->ready == false) + return (0); + + VATPIC_LOCK(vatpic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vatpic_set_pinstate(vatpic, irq, true); + break; + case IRQSTATE_DEASSERT: + vatpic_set_pinstate(vatpic, irq, false); + break; + case IRQSTATE_PULSE: + vatpic_set_pinstate(vatpic, irq, true); + vatpic_set_pinstate(vatpic, irq, false); + break; + default: + panic("vatpic_set_irqstate: invalid irqstate %d", irqstate); + } + VATPIC_UNLOCK(vatpic); + + return (0); +} + +int +vatpic_assert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vatpic_deassert_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vatpic_pulse_irq(struct vm *vm, int irq) +{ + return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +int +vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger) +{ + struct vatpic *vatpic; + + if (irq < 0 || irq > 15) + return (EINVAL); + + /* + * See comment in vatpic_elc_handler. These IRQs must be + * edge triggered. 
+ */ + if (trigger == LEVEL_TRIGGER) { + switch (irq) { + case 0: + case 1: + case 2: + case 8: + case 13: + return (EINVAL); + } + } + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + if (trigger == LEVEL_TRIGGER) + vatpic->elc[irq >> 3] |= 1 << (irq & 0x7); + else + vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7)); + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +void +vatpic_pending_intr(struct vm *vm, int *vecptr) +{ + struct vatpic *vatpic; + struct atpic *atpic; + int pin; + + vatpic = vm_atpic(vm); + + atpic = &vatpic->atpic[0]; + + VATPIC_LOCK(vatpic); + + pin = vatpic_get_highest_irrpin(atpic); + if (pin == 2) { + atpic = &vatpic->atpic[1]; + pin = vatpic_get_highest_irrpin(atpic); + } + + /* + * If there are no pins active at this moment then return the spurious + * interrupt vector instead. + */ + if (pin == -1) + pin = 7; + + KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin)); + *vecptr = atpic->irq_base + pin; + + VATPIC_UNLOCK(vatpic); +} + +static void +vatpic_pin_accepted(struct atpic *atpic, int pin) +{ + atpic->intr_raised = false; + + if (atpic->acnt[pin] == 0) + atpic->request &= ~(1 << pin); + + if (atpic->aeoi == true) { + if (atpic->rotate == true) + atpic->lowprio = pin; + } else { + atpic->service |= (1 << pin); + } +} + +void +vatpic_intr_accepted(struct vm *vm, int vector) +{ + struct vatpic *vatpic; + int pin; + + vatpic = vm_atpic(vm); + + VATPIC_LOCK(vatpic); + + pin = vector & 0x7; + + if ((vector & ~0x7) == vatpic->atpic[1].irq_base) { + vatpic_pin_accepted(&vatpic->atpic[1], pin); + /* + * If this vector originated from the slave, + * accept the cascaded interrupt too. 
+ */ + vatpic_pin_accepted(&vatpic->atpic[0], 2); + } else { + vatpic_pin_accepted(&vatpic->atpic[0], pin); + } + + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); +} + +static int +vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int pin; + + VATPIC_LOCK(vatpic); + + if (atpic->poll) { /* a poll command was latched by OCW3; this read consumes it */ + atpic->poll = 0; + pin = vatpic_get_highest_irrpin(atpic); + if (pin >= 0) { + vatpic_pin_accepted(atpic, pin); + *eax = 0x80 | pin; /* bit 7 set when a pin was found; low bits carry the pin */ + } else { + *eax = 0; + } + } else { + if (port & ICU_IMR_OFFSET) { + /* read interrupt mask register */ + *eax = atpic->mask; + } else { + if (atpic->rd_cmd_reg == OCW3_RIS) { + /* read in-service register */ + *eax = atpic->service; + } else { + /* read interrupt request register */ + *eax = atpic->request; + } + } + } + + VATPIC_UNLOCK(vatpic); + + return (0); + +} + +static int +vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, + int bytes, uint32_t *eax) +{ + int error; + uint8_t val; + + error = 0; + val = *eax; + + VATPIC_LOCK(vatpic); + + if (port & ICU_IMR_OFFSET) { + switch (atpic->icw_num) { /* mid-initialization the write is the next ICW, otherwise OCW1 */ + case 2: + error = vatpic_icw2(vatpic, atpic, val); + break; + case 3: + error = vatpic_icw3(vatpic, atpic, val); + break; + case 4: + error = vatpic_icw4(vatpic, atpic, val); + break; + default: + error = vatpic_ocw1(vatpic, atpic, val); + break; + } + } else { + if (val & (1 << 4)) /* bit 4 set: ICW1, which restarts initialization */ + error = vatpic_icw1(vatpic, atpic, val); + + if (atpic->ready) { + if (val & (1 << 3)) /* bit 3 separates OCW3 from OCW2 */ + error = vatpic_ocw3(vatpic, atpic, val); + else + error = vatpic_ocw2(vatpic, atpic, val); + } + } + + if (atpic->ready) + vatpic_notify_intr(vatpic); + + VATPIC_UNLOCK(vatpic); + + return (error); +} + +int +vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[0]; + + if (bytes != 1) + return (-1); + + if (in) { + return 
(vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + struct atpic *atpic; + + vatpic = vm_atpic(vm); + atpic = &vatpic->atpic[1]; + + if (bytes != 1) + return (-1); + + if (in) { + return (vatpic_read(vatpic, atpic, in, port, bytes, eax)); + } + + return (vatpic_write(vatpic, atpic, in, port, bytes, eax)); +} + +int +vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpic *vatpic; + bool is_master; + + vatpic = vm_atpic(vm); + is_master = (port == IO_ELCR1); + + if (bytes != 1) + return (-1); + + VATPIC_LOCK(vatpic); + + if (in) { + if (is_master) + *eax = vatpic->elc[0]; + else + *eax = vatpic->elc[1]; + } else { + /* + * For the master PIC the cascade channel (IRQ2), the + * heart beat timer (IRQ0), and the keyboard + * controller (IRQ1) cannot be programmed for level + * mode. + * + * For the slave PIC the real time clock (IRQ8) and + * the floating point error interrupt (IRQ13) cannot + * be programmed for level mode. + */ + if (is_master) + vatpic->elc[0] = (*eax & 0xf8); + else + vatpic->elc[1] = (*eax & 0xde); + } + + VATPIC_UNLOCK(vatpic); + + return (0); +} + +struct vatpic * +vatpic_init(struct vm *vm) +{ + struct vatpic *vatpic; + + vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO); + vatpic->vm = vm; + + mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN); + + return (vatpic); +} + +void +vatpic_cleanup(struct vatpic *vatpic) +{ + free(vatpic, M_VATPIC); +} diff --git a/vmm/io/vatpic.h b/vmm/io/vatpic.h new file mode 100644 index 0000000..d4a1be1 --- /dev/null +++ b/vmm/io/vatpic.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VATPIC_H_ +#define _VATPIC_H_ + +#include + +#define ICU_IMR_OFFSET 1 + +#define IO_ELCR1 0x4d0 +#define IO_ELCR2 0x4d1 + +struct vatpic *vatpic_init(struct vm *vm); +void vatpic_cleanup(struct vatpic *vatpic); + +int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); +int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); + +int vatpic_assert_irq(struct vm *vm, int irq); +int vatpic_deassert_irq(struct vm *vm, int irq); +int vatpic_pulse_irq(struct vm *vm, int irq); +int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger); + +void vatpic_pending_intr(struct vm *vm, int *vecptr); +void vatpic_intr_accepted(struct vm *vm, int vector); + +#endif /* _VATPIC_H_ */ diff --git a/vmm/io/vatpit.c b/vmm/io/vatpit.c new file mode 100644 index 0000000..173ef1f --- /dev/null +++ b/vmm/io/vatpit.c @@ -0,0 +1,457 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vatpit.h" + +static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)"); + +#define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx)) +#define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx)) +#define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx)) + +#define TIMER_SEL_MASK 0xc0 +#define TIMER_RW_MASK 0x30 +#define TIMER_MODE_MASK 0x0f +#define TIMER_SEL_READBACK 0xc0 + +#define TIMER_STS_OUT 0x80 +#define TIMER_STS_NULLCNT 0x40 + +#define TIMER_RB_LCTR 0x20 +#define TIMER_RB_LSTATUS 0x10 +#define TIMER_RB_CTR_2 0x08 +#define TIMER_RB_CTR_1 0x04 +#define TIMER_RB_CTR_0 0x02 + +#define TMR2_OUT_STS 0x20 + +#define PIT_8254_FREQ 1193182 +#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz)) + +struct vatpit_callout_arg { + struct vatpit *vatpit; + int channel_num; +}; + + +struct channel { + int mode; + uint16_t initial; /* initial counter value */ + sbintime_t now_sbt; /* uptime when counter was loaded */ + uint8_t cr[2]; + uint8_t ol[2]; + bool slatched; /* status latched */ + uint8_t status; + int crbyte; + int olbyte; + int frbyte; + struct callout callout; + sbintime_t callout_sbt; /* target time */ + struct vatpit_callout_arg callout_arg; +}; + +struct vatpit { + struct vm *vm; + struct mtx mtx; + + 
sbintime_t freq_sbt; + + struct channel channel[3]; +}; + +static void pit_timer_start_cntr0(struct vatpit *vatpit); + +static int +vatpit_get_out(struct vatpit *vatpit, int channel) +{ + struct channel *c; + sbintime_t delta_ticks; + int out; + + c = &vatpit->channel[channel]; + + switch (c->mode) { + case TIMER_INTTC: + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + out = ((c->initial - delta_ticks) <= 0); + break; + default: + out = 0; + break; + } + + return (out); +} + +static void +vatpit_callout_handler(void *a) +{ + struct vatpit_callout_arg *arg = a; + struct vatpit *vatpit; + struct callout *callout; + struct channel *c; + + vatpit = arg->vatpit; + c = &vatpit->channel[arg->channel_num]; + callout = &c->callout; + + VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num); + + VATPIT_LOCK(vatpit); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (c->mode == TIMER_RATEGEN) { + pit_timer_start_cntr0(vatpit); + } + + vatpic_pulse_irq(vatpit->vm, 0); + vioapic_pulse_irq(vatpit->vm, 2); + +done: + VATPIT_UNLOCK(vatpit); + return; +} + +static void +pit_timer_start_cntr0(struct vatpit *vatpit) +{ + struct channel *c; + sbintime_t now, delta, precision; + + c = &vatpit->channel[0]; + if (c->initial != 0) { + delta = c->initial * vatpit->freq_sbt; + precision = delta >> tc_precexp; + c->callout_sbt = c->callout_sbt + delta; + + /* + * Reset 'callout_sbt' if the time that the callout + * was supposed to fire is more than 'c->initial' + * ticks in the past. 
+ */ + now = sbinuptime(); + if (c->callout_sbt < now) + c->callout_sbt = now + delta; + + callout_reset_sbt(&c->callout, c->callout_sbt, + precision, vatpit_callout_handler, &c->callout_arg, + C_ABSOLUTE); + } +} + +static uint16_t +pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) +{ + uint16_t lval; + sbintime_t delta_ticks; + + /* cannot latch a new value until the old one has been consumed */ + if (latch && c->olbyte != 0) + return (0); + + if (c->initial == 0) { + /* + * This is possibly an o/s bug - reading the value of + * the timer without having set up the initial value. + * + * The original user-space version of this code set + * the timer to 100hz in this condition; do the same + * here. + */ + c->initial = TIMER_DIV(PIT_8254_FREQ, 100); + c->now_sbt = sbinuptime(); + c->status &= ~TIMER_STS_NULLCNT; + } + + delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; + + lval = c->initial - delta_ticks % c->initial; + + if (latch) { + c->olbyte = 2; + c->ol[1] = lval; /* LSB */ + c->ol[0] = lval >> 8; /* MSB */ + } + + return (lval); +} + +static int +pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd) +{ + struct channel *c; + + c = &vatpit->channel[channel]; + + /* + * Latch the count/status of the timer if not already latched. + * N.B. that the count/status latch-select bits are active-low. + */ + if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) { + (void) pit_update_counter(vatpit, c, true); + } + + if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) { + c->slatched = true; + /* + * For mode 0, see if the elapsed time is greater + * than the initial value - this results in the + * output pin being set to 1 in the status byte. + */ + if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel)) + c->status |= TIMER_STS_OUT; + else + c->status &= ~TIMER_STS_OUT; + } + + return (0); +} + +static int +pit_readback(struct vatpit *vatpit, uint8_t cmd) +{ + int error; + + /* + * The readback command can apply to all timers. 
+ */ + error = 0; + if (cmd & TIMER_RB_CTR_0) + error = pit_readback1(vatpit, 0, cmd); + if (!error && cmd & TIMER_RB_CTR_1) + error = pit_readback1(vatpit, 1, cmd); + if (!error && cmd & TIMER_RB_CTR_2) + error = pit_readback1(vatpit, 2, cmd); + + return (error); +} + + +static int +vatpit_update_mode(struct vatpit *vatpit, uint8_t val) +{ + struct channel *c; + int sel, rw, mode; + + sel = val & TIMER_SEL_MASK; + rw = val & TIMER_RW_MASK; + mode = val & TIMER_MODE_MASK; + + if (sel == TIMER_SEL_READBACK) + return (pit_readback(vatpit, val)); + + if (rw != TIMER_LATCH && rw != TIMER_16BIT) + return (-1); + + if (rw != TIMER_LATCH) { + /* + * Counter mode is not affected when issuing a + * latch command. + */ + if (mode != TIMER_INTTC && + mode != TIMER_RATEGEN && + mode != TIMER_SQWAVE && + mode != TIMER_SWSTROBE) + return (-1); + } + + c = &vatpit->channel[sel >> 6]; + if (rw == TIMER_LATCH) + pit_update_counter(vatpit, c, true); + else { + c->mode = mode; + c->olbyte = 0; /* reset latch after reprogramming */ + c->status |= TIMER_STS_NULLCNT; + } + + return (0); +} + +int +vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + struct channel *c; + uint8_t val; + int error; + + vatpit = vm_atpit(vm); + + if (bytes != 1) + return (-1); + + val = *eax; + + if (port == TIMER_MODE) { + if (in) { + VM_CTR0(vatpit->vm, "vatpit attempt to read mode"); + return (-1); + } + + VATPIT_LOCK(vatpit); + error = vatpit_update_mode(vatpit, val); + VATPIT_UNLOCK(vatpit); + + return (error); + } + + /* counter ports */ + KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2, + ("invalid port 0x%x", port)); + c = &vatpit->channel[port - TIMER_CNTR0]; + + VATPIT_LOCK(vatpit); + if (in && c->slatched) { + /* + * Return the status byte if latched + */ + *eax = c->status; + c->slatched = false; + c->status = 0; + } else if (in) { + /* + * The spec says that once the output latch is completely + * read it should revert to 
"following" the counter. Use + * the free running counter for this case (i.e. Linux + * TSC calibration). Assuming the access mode is 16-bit, + * toggle the MSB/LSB bit on each read. + */ + if (c->olbyte == 0) { + uint16_t tmp; + + tmp = pit_update_counter(vatpit, c, false); + if (c->frbyte) + tmp >>= 8; + tmp &= 0xff; + *eax = tmp; + c->frbyte ^= 1; + } else + *eax = c->ol[--c->olbyte]; + } else { + c->cr[c->crbyte++] = *eax; + if (c->crbyte == 2) { + c->status &= ~TIMER_STS_NULLCNT; + c->frbyte = 0; + c->crbyte = 0; + c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; + c->now_sbt = sbinuptime(); + /* Start an interval timer for channel 0 */ + if (port == TIMER_CNTR0) { + c->callout_sbt = c->now_sbt; + pit_timer_start_cntr0(vatpit); + } + if (c->initial == 0) + c->initial = 0xffff; + } + } + VATPIT_UNLOCK(vatpit); + + return (0); +} + +int +vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax) +{ + struct vatpit *vatpit; + + vatpit = vm_atpit(vm); + + if (in) { + VATPIT_LOCK(vatpit); + if (vatpit_get_out(vatpit, 2)) + *eax = TMR2_OUT_STS; + else + *eax = 0; + + VATPIT_UNLOCK(vatpit); + } + + return (0); +} + +struct vatpit * +vatpit_init(struct vm *vm) +{ + struct vatpit *vatpit; + struct bintime bt; + struct vatpit_callout_arg *arg; + int i; + + vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO); + vatpit->vm = vm; + + mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); + + FREQ2BT(PIT_8254_FREQ, &bt); + vatpit->freq_sbt = bttosbt(bt); + + for (i = 0; i < 3; i++) { + callout_init(&vatpit->channel[i].callout, true); + arg = &vatpit->channel[i].callout_arg; + arg->vatpit = vatpit; + arg->channel_num = i; + } + + return (vatpit); +} + +void +vatpit_cleanup(struct vatpit *vatpit) +{ + int i; + + for (i = 0; i < 3; i++) + callout_drain(&vatpit->channel[i].callout); + + free(vatpit, M_VATPIT); +} diff --git a/vmm/io/vatpit.h b/vmm/io/vatpit.h new file mode 100644 index 0000000..5719c9c --- /dev/null +++ 
b/vmm/io/vatpit.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VATPIT_H_ +#define _VATPIT_H_ + +#include + +#define NMISC_PORT 0x61 + +struct vatpit *vatpit_init(struct vm *vm); +void vatpit_cleanup(struct vatpit *vatpit); + +int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *eax); +int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, + int bytes, uint32_t *eax); + +#endif /* _VATPIT_H_ */ diff --git a/vmm/io/vhpet.c b/vmm/io/vhpet.c new file mode 100644 index 0000000..1db1c51 --- /dev/null +++ b/vmm/io/vhpet.c @@ -0,0 +1,759 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "vmm_lapic.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vhpet.h" + +#include "vmm_ktr.h" + +static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); + +#define HPET_FREQ 10000000 /* 10.0 Mhz */ +#define FS_PER_S 1000000000000000ul + +/* Timer N Configuration and Capabilities Register */ +#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \ + HPET_TCAP_FSB_INT_DEL | \ + HPET_TCAP_SIZE | \ + HPET_TCAP_PER_INT) +/* + * HPET requires at least 3 timers and up to 32 timers per block. + */ +#define VHPET_NUM_TIMERS 8 +CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32); + +struct vhpet_callout_arg { + struct vhpet *vhpet; + int timer_num; +}; + +struct vhpet { + struct vm *vm; + struct mtx mtx; + sbintime_t freq_sbt; + + uint64_t config; /* Configuration */ + uint64_t isr; /* Interrupt Status */ + uint32_t countbase; /* HPET counter base value */ + sbintime_t countbase_sbt; /* uptime corresponding to base value */ + + struct { + uint64_t cap_config; /* Configuration */ + uint64_t msireg; /* FSB interrupt routing */ + uint32_t compval; /* Comparator */ + uint32_t comprate; + struct callout callout; + sbintime_t callout_sbt; /* time when counter==compval */ + struct vhpet_callout_arg arg; + } timer[VHPET_NUM_TIMERS]; +}; + +#define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx)) +#define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx)) + +static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, + sbintime_t now); + +static uint64_t +vhpet_capabilities(void) +{ + uint64_t cap = 0; + + cap |= 0x8086 << 16; /* vendor id */ + cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ + cap |= 1; /* revision */ + cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ + + cap &= 0xffffffff; + cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */ + + return (cap); +} + +static __inline bool 
+vhpet_counter_enabled(struct vhpet *vhpet) +{ + + return ((vhpet->config & HPET_CNF_ENABLE) ? true : false); +} + +static __inline bool +vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) +{ + const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; + + if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) + return (true); + else + return (false); +} + +static __inline int +vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) +{ + /* + * If the timer is configured to use MSI then treat it as if the + * timer is not connected to the ioapic. + */ + if (vhpet_timer_msi_enabled(vhpet, n)) + return (0); + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); +} + +static uint32_t +vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) +{ + uint32_t val; + sbintime_t now, delta; + + val = vhpet->countbase; + if (vhpet_counter_enabled(vhpet)) { + now = sbinuptime(); + delta = now - vhpet->countbase_sbt; + KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: " + "%#lx to %#lx", vhpet->countbase_sbt, now)); + val += delta / vhpet->freq_sbt; + if (nowptr != NULL) + *nowptr = now; + } else { + /* + * The sbinuptime corresponding to the 'countbase' is + * meaningless when the counter is disabled. Make sure + * that the the caller doesn't want to use it. 
+ */ + KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); + } + return (val); +} + +static void +vhpet_timer_clear_isr(struct vhpet *vhpet, int n) +{ + int pin; + + if (vhpet->isr & (1 << n)) { + pin = vhpet_timer_ioapic_pin(vhpet, n); + KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); + vioapic_deassert_irq(vhpet->vm, pin); + vhpet->isr &= ~(1 << n); + } +} + +static __inline bool +vhpet_periodic_timer(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0); +} + +static __inline bool +vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n) +{ + + return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0); +} + +static __inline bool +vhpet_timer_edge_trig(struct vhpet *vhpet, int n) +{ + + KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " + "timer %d is using MSI", n)); + + if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) + return (true); + else + return (false); +} + +static void +vhpet_timer_interrupt(struct vhpet *vhpet, int n) +{ + int pin; + + /* If interrupts are not enabled for this timer then just return. */ + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + return; + + /* + * If a level triggered interrupt is already asserted then just return. 
+ */ + if ((vhpet->isr & (1 << n)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n); + return; + } + + if (vhpet_timer_msi_enabled(vhpet, n)) { + lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32, + vhpet->timer[n].msireg & 0xffffffff); + return; + } + + pin = vhpet_timer_ioapic_pin(vhpet, n); + if (pin == 0) { + VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n); + return; + } + + if (vhpet_timer_edge_trig(vhpet, n)) { + vioapic_pulse_irq(vhpet->vm, pin); + } else { + vhpet->isr |= 1 << n; + vioapic_assert_irq(vhpet->vm, pin); + } +} + +static void +vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter) +{ + uint32_t compval, comprate, compnext; + + KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n)); + + compval = vhpet->timer[n].compval; + comprate = vhpet->timer[n].comprate; + + /* + * Calculate the comparator value to be used for the next periodic + * interrupt. + * + * This function is commonly called from the callout handler. + * In this scenario the 'counter' is ahead of 'compval'. To find + * the next value to program into the accumulator we divide the + * number space between 'compval' and 'counter' into 'comprate' + * sized units. The 'compval' is rounded up such that is "ahead" + * of 'counter'. 
+ */ + compnext = compval + ((counter - compval) / comprate + 1) * comprate; + + vhpet->timer[n].compval = compnext; +} + +static void +vhpet_handler(void *a) +{ + int n; + uint32_t counter; + sbintime_t now; + struct vhpet *vhpet; + struct callout *callout; + struct vhpet_callout_arg *arg; + + arg = a; + vhpet = arg->vhpet; + n = arg->timer_num; + callout = &vhpet->timer[n].callout; + + VM_CTR1(vhpet->vm, "hpet t%d fired", n); + + VHPET_LOCK(vhpet); + + if (callout_pending(callout)) /* callout was reset */ + goto done; + + if (!callout_active(callout)) /* callout was stopped */ + goto done; + + callout_deactivate(callout); + + if (!vhpet_counter_enabled(vhpet)) + panic("vhpet(%p) callout with counter disabled", vhpet); + + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, n, counter, now); + vhpet_timer_interrupt(vhpet, n); +done: + VHPET_UNLOCK(vhpet); + return; +} + +static void +vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now) +{ + + VM_CTR1(vhpet->vm, "hpet t%d stopped", n); + callout_stop(&vhpet->timer[n].callout); + + /* + * If the callout was scheduled to expire in the past but hasn't + * had a chance to execute yet then trigger the timer interrupt + * here. Failing to do so will result in a missed timer interrupt + * in the guest. This is especially bad in one-shot mode because + * the next interrupt has to wait for the counter to wrap around. + */ + if (vhpet->timer[n].callout_sbt < now) { + VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after " + "stopping timer", n); + vhpet_timer_interrupt(vhpet, n); + } +} + +static void +vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) +{ + sbintime_t delta, precision; + + if (vhpet->timer[n].comprate != 0) + vhpet_adjust_compval(vhpet, n, counter); + else { + /* + * In one-shot mode it is the guest's responsibility to make + * sure that the comparator value is not in the "past". 
The + * hardware doesn't have any belt-and-suspenders to deal with + * this so we don't either. + */ + } + + delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt; + precision = delta >> tc_precexp; + vhpet->timer[n].callout_sbt = now + delta; + callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt, + precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE); +} + +static void +vhpet_start_counting(struct vhpet *vhpet) +{ + int i; + + vhpet->countbase_sbt = sbinuptime(); + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + /* + * Restart the timers based on the value of the main counter + * when it stopped counting. + */ + vhpet_start_timer(vhpet, i, vhpet->countbase, + vhpet->countbase_sbt); + } +} + +static void +vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now) +{ + int i; + + vhpet->countbase = counter; + for (i = 0; i < VHPET_NUM_TIMERS; i++) + vhpet_stop_timer(vhpet, i, now); +} + +static __inline void +update_register(uint64_t *regptr, uint64_t data, uint64_t mask) +{ + + *regptr &= ~mask; + *regptr |= (data & mask); +} + +static void +vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data, + uint64_t mask) +{ + bool clear_isr; + int old_pin, new_pin; + uint32_t allowed_irqs; + uint64_t oldval, newval; + + if (vhpet_timer_msi_enabled(vhpet, n) || + vhpet_timer_edge_trig(vhpet, n)) { + if (vhpet->isr & (1 << n)) + panic("vhpet timer %d isr should not be asserted", n); + } + old_pin = vhpet_timer_ioapic_pin(vhpet, n); + oldval = vhpet->timer[n].cap_config; + + newval = oldval; + update_register(&newval, data, mask); + newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE); + newval |= oldval & HPET_TCAP_RO_MASK; + + if (newval == oldval) + return; + + vhpet->timer[n].cap_config = newval; + VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval); + + /* + * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field. 
+ * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set + * it to the default value of 0. + */ + allowed_irqs = vhpet->timer[n].cap_config >> 32; + new_pin = vhpet_timer_ioapic_pin(vhpet, n); + if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) { + VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, " + "allowed_irqs 0x%08x", n, new_pin, allowed_irqs); + new_pin = 0; + vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE; + } + + if (!vhpet_periodic_timer(vhpet, n)) + vhpet->timer[n].comprate = 0; + + /* + * If the timer's ISR bit is set then clear it in the following cases: + * - interrupt is disabled + * - interrupt type is changed from level to edge or fsb. + * - interrupt routing is changed + * + * This is to ensure that this timer's level triggered interrupt does + * not remain asserted forever. + */ + if (vhpet->isr & (1 << n)) { + KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d", + n, old_pin)); + if (!vhpet_timer_interrupt_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_msi_enabled(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_edge_trig(vhpet, n)) + clear_isr = true; + else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin) + clear_isr = true; + else + clear_isr = false; + + if (clear_isr) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to " + "configuration change", n); + vioapic_deassert_irq(vhpet->vm, old_pin); + vhpet->isr &= ~(1 << n); + } + } +} + +int +vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, + void *arg) +{ + struct vhpet *vhpet; + uint64_t data, mask, oldval, val64; + uint32_t isr_clear_mask, old_compval, old_comprate, counter; + sbintime_t now, *nowptr; + int i, offset; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + switch (size) { + case 8: + mask = 0xffffffffffffffff; + data = val; + break; + case 4: + mask = 0xffffffff; + data = val; + if ((offset & 0x4) != 0) { 
+ mask <<= 32; + data <<= 32; + } + break; + default: + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio write: " + "offset 0x%08x, size %d", offset, size); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + /* + * Get the most recent value of the counter before updating + * the 'config' register. If the HPET is going to be disabled + * then we need to update 'countbase' with the value right + * before it is disabled. + */ + nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL; + counter = vhpet_counter(vhpet, nowptr); + oldval = vhpet->config; + update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. + */ + vhpet->config &= ~HPET_CNF_LEG_RT; + + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { + if (vhpet_counter_enabled(vhpet)) { + vhpet_start_counting(vhpet); + VM_CTR0(vhpet->vm, "hpet enabled"); + } else { + vhpet_stop_counting(vhpet, counter, now); + VM_CTR0(vhpet->vm, "hpet disabled"); + } + } + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + isr_clear_mask = vhpet->isr & data; + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if ((isr_clear_mask & (1 << i)) != 0) { + VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i); + vhpet_timer_clear_isr(vhpet, i); + } + } + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + /* Zero-extend the counter to 64-bits before updating it */ + val64 = vhpet_counter(vhpet, NULL); + update_register(&val64, data, mask); + vhpet->countbase = val64; + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + 
vhpet_timer_update_config(vhpet, i, data, mask); + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + old_compval = vhpet->timer[i].compval; + old_comprate = vhpet->timer[i].comprate; + if (vhpet_periodic_timer(vhpet, i)) { + /* + * In periodic mode writes to the comparator + * change the 'compval' register only if the + * HPET_TCNF_VAL_SET bit is set in the config + * register. + */ + val64 = vhpet->timer[i].comprate; + update_register(&val64, data, mask); + vhpet->timer[i].comprate = val64; + if ((vhpet->timer[i].cap_config & + HPET_TCNF_VAL_SET) != 0) { + vhpet->timer[i].compval = val64; + } + } else { + KASSERT(vhpet->timer[i].comprate == 0, + ("vhpet one-shot timer %d has invalid " + "rate %u", i, vhpet->timer[i].comprate)); + val64 = vhpet->timer[i].compval; + update_register(&val64, data, mask); + vhpet->timer[i].compval = val64; + } + vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET; + + if (vhpet->timer[i].compval != old_compval || + vhpet->timer[i].comprate != old_comprate) { + if (vhpet_counter_enabled(vhpet)) { + counter = vhpet_counter(vhpet, &now); + vhpet_start_timer(vhpet, i, counter, + now); + } + } + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + update_register(&vhpet->timer[i].msireg, data, mask); + break; + } + } +done: + VHPET_UNLOCK(vhpet); + return (0); +} + +int +vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int i, offset; + struct vhpet *vhpet; + uint64_t data; + + vhpet = vm_hpet(vm); + offset = gpa - VHPET_BASE; + + VHPET_LOCK(vhpet); + + /* Accesses to the HPET should be 4 or 8 bytes wide */ + if (size != 4 && size != 8) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + "offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + /* Access to the HPET should be naturally aligned to its width */ + if (offset & (size - 1)) { + VM_CTR2(vhpet->vm, "hpet invalid mmio read: " + 
"offset 0x%08x, size %d", offset, size); + data = 0; + goto done; + } + + if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) { + data = vhpet_capabilities(); + goto done; + } + + if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) { + data = vhpet->config; + goto done; + } + + if (offset == HPET_ISR || offset == HPET_ISR + 4) { + data = vhpet->isr; + goto done; + } + + if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) { + data = vhpet_counter(vhpet, NULL); + goto done; + } + + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + if (offset == HPET_TIMER_CAP_CNF(i) || + offset == HPET_TIMER_CAP_CNF(i) + 4) { + data = vhpet->timer[i].cap_config; + break; + } + + if (offset == HPET_TIMER_COMPARATOR(i) || + offset == HPET_TIMER_COMPARATOR(i) + 4) { + data = vhpet->timer[i].compval; + break; + } + + if (offset == HPET_TIMER_FSB_VAL(i) || + offset == HPET_TIMER_FSB_ADDR(i)) { + data = vhpet->timer[i].msireg; + break; + } + } + + if (i >= VHPET_NUM_TIMERS) + data = 0; +done: + VHPET_UNLOCK(vhpet); + + if (size == 4) { + if (offset & 0x4) + data >>= 32; + } + *rval = data; + return (0); +} + +struct vhpet * +vhpet_init(struct vm *vm) +{ + int i, pincount; + struct vhpet *vhpet; + uint64_t allowed_irqs; + struct vhpet_callout_arg *arg; + struct bintime bt; + + vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO); + vhpet->vm = vm; + mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF); + + FREQ2BT(HPET_FREQ, &bt); + vhpet->freq_sbt = bttosbt(bt); + + pincount = vioapic_pincount(vm); + if (pincount >= 24) + allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */ + else + allowed_irqs = 0; + + /* + * Initialize HPET timer hardware state. 
+ */ + for (i = 0; i < VHPET_NUM_TIMERS; i++) { + vhpet->timer[i].cap_config = allowed_irqs << 32; + vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT; + vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL; + + vhpet->timer[i].compval = 0xffffffff; + callout_init(&vhpet->timer[i].callout, 1); + + arg = &vhpet->timer[i].arg; + arg->vhpet = vhpet; + arg->timer_num = i; + } + + return (vhpet); +} + +void +vhpet_cleanup(struct vhpet *vhpet) +{ + int i; + + for (i = 0; i < VHPET_NUM_TIMERS; i++) + callout_drain(&vhpet->timer[i].callout); + + free(vhpet, M_VHPET); +} + +int +vhpet_getcap(struct vm_hpet_cap *cap) +{ + + cap->capabilities = vhpet_capabilities(); + return (0); +} diff --git a/vmm/io/vhpet.h b/vmm/io/vhpet.h new file mode 100644 index 0000000..330e017 --- /dev/null +++ b/vmm/io/vhpet.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VHPET_H_ +#define _VHPET_H_ + +#define VHPET_BASE 0xfed00000 +#define VHPET_SIZE 1024 + +struct vhpet *vhpet_init(struct vm *vm); +void vhpet_cleanup(struct vhpet *vhpet); +int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, + int size, void *arg); +int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, + int size, void *arg); +int vhpet_getcap(struct vm_hpet_cap *cap); + +#endif /* _VHPET_H_ */ diff --git a/vmm/io/vioapic.c b/vmm/io/vioapic.c new file mode 100644 index 0000000..e6b8b5a --- /dev/null +++ b/vmm/io/vioapic.c @@ -0,0 +1,499 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" +#include "vioapic.h" + +#define IOREGSEL 0x00 +#define IOWIN 0x10 + +#define REDIR_ENTRIES 24 +#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) + +struct vioapic { + struct vm *vm; + struct mtx mtx; + uint32_t id; + uint32_t ioregsel; + struct { + uint64_t reg; + int acnt; /* sum of pin asserts (+1) and deasserts (-1) */ + } rtbl[REDIR_ENTRIES]; +}; + +#define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx)) +#define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx)) +#define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx)) + +static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic"); + +#define VIOAPIC_CTR1(vioapic, fmt, a1) \ + VM_CTR1((vioapic)->vm, fmt, a1) + +#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \ + VM_CTR2((vioapic)->vm, fmt, a1, a2) + +#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \ + VM_CTR3((vioapic)->vm, fmt, a1, a2, a3) + +#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \ + VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4) + +#ifdef 
KTR +static const char * +pinstate_str(bool asserted) +{ + + if (asserted) + return ("asserted"); + else + return ("deasserted"); +} +#endif + +static void +vioapic_send_intr(struct vioapic *vioapic, int pin) +{ + int vector, delmode; + uint32_t low, high, dest; + bool level, phys; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + if ((low & IOART_INTMASK) == IOART_INTMSET) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin); + return; + } + + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + level = low & IOART_TRGRLVL ? true : false; + if (level) + vioapic->rtbl[pin].reg |= IOART_REM_IRR; + + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector); +} + +static void +vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate) +{ + int oldcnt, newcnt; + bool needintr; + + KASSERT(pin >= 0 && pin < REDIR_ENTRIES, + ("vioapic_set_pinstate: invalid pin number %d", pin)); + + KASSERT(VIOAPIC_LOCKED(vioapic), + ("vioapic_set_pinstate: vioapic is not locked")); + + oldcnt = vioapic->rtbl[pin].acnt; + if (newstate) + vioapic->rtbl[pin].acnt++; + else + vioapic->rtbl[pin].acnt--; + newcnt = vioapic->rtbl[pin].acnt; + + if (newcnt < 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d", + pin, newcnt); + } + + needintr = false; + if (oldcnt == 0 && newcnt == 1) { + needintr = true; + VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin); + } else if (oldcnt == 1 && newcnt == 0) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin); + } else { + VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d", + pin, pinstate_str(newstate), newcnt); + } + + if (needintr) + vioapic_send_intr(vioapic, pin); +} + +enum irqstate { + 
IRQSTATE_ASSERT, + IRQSTATE_DEASSERT, + IRQSTATE_PULSE +}; + +static int +vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate) +{ + struct vioapic *vioapic; + + if (irq < 0 || irq >= REDIR_ENTRIES) + return (EINVAL); + + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + switch (irqstate) { + case IRQSTATE_ASSERT: + vioapic_set_pinstate(vioapic, irq, true); + break; + case IRQSTATE_DEASSERT: + vioapic_set_pinstate(vioapic, irq, false); + break; + case IRQSTATE_PULSE: + vioapic_set_pinstate(vioapic, irq, true); + vioapic_set_pinstate(vioapic, irq, false); + break; + default: + panic("vioapic_set_irqstate: invalid irqstate %d", irqstate); + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_assert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT)); +} + +int +vioapic_deassert_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT)); +} + +int +vioapic_pulse_irq(struct vm *vm, int irq) +{ + + return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); +} + +/* + * Reset the vlapic's trigger-mode register to reflect the ioapic pin + * configuration. + */ +static void +vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +{ + struct vioapic *vioapic; + struct vlapic *vlapic; + uint32_t low, high, dest; + int delmode, pin, vector; + bool level, phys; + + vlapic = vm_lapic(vm, vcpuid); + vioapic = vm_ioapic(vm); + + VIOAPIC_LOCK(vioapic); + /* + * Reset all vectors to be edge-triggered. + */ + vlapic_reset_tmr(vlapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + low = vioapic->rtbl[pin].reg; + high = vioapic->rtbl[pin].reg >> 32; + + level = low & IOART_TRGRLVL ? true : false; + if (!level) + continue; + + /* + * For a level-triggered 'pin' let the vlapic figure out if + * an assertion on this 'pin' would result in an interrupt + * being delivered to it. If yes, then it will modify the + * TMR bit associated with this vector to level-triggered. 
+ */ + phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); + delmode = low & IOART_DELMOD; + vector = low & IOART_INTVEC; + dest = high >> APIC_ID_SHIFT; + vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + } + VIOAPIC_UNLOCK(vioapic); +} + +static uint32_t +vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) +{ + int regnum, pin, rshift; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + return (vioapic->id); + break; + case IOAPIC_VER: + return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11); + break; + case IOAPIC_ARB: + return (vioapic->id); + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + rshift = 32; + else + rshift = 0; + + return (vioapic->rtbl[pin].reg >> rshift); + } + + return (0); +} + +static void +vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) +{ + uint64_t data64, mask64; + uint64_t last, changed; + int regnum, pin, lshift; + cpuset_t allvcpus; + + regnum = addr & 0xff; + switch (regnum) { + case IOAPIC_ID: + vioapic->id = data & APIC_ID_MASK; + break; + case IOAPIC_VER: + case IOAPIC_ARB: + /* readonly */ + break; + default: + break; + } + + /* redirection table entries */ + if (regnum >= IOAPIC_REDTBL && + regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) { + pin = (regnum - IOAPIC_REDTBL) / 2; + if ((regnum - IOAPIC_REDTBL) % 2) + lshift = 32; + else + lshift = 0; + + last = vioapic->rtbl[pin].reg; + + data64 = (uint64_t)data << lshift; + mask64 = (uint64_t)0xffffffff << lshift; + vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; + vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS; + + VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx", + pin, vioapic->rtbl[pin].reg); + + /* + * If any fields in the redirection table entry (except mask + * or polarity) have changed then rendezvous all the vcpus + * to 
update their vlapic trigger-mode registers. + */ + changed = last ^ vioapic->rtbl[pin].reg; + if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { + VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " + "vlapic trigger-mode register", pin); + VIOAPIC_UNLOCK(vioapic); + allvcpus = vm_active_cpus(vioapic->vm); + vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, + vioapic_update_tmr, NULL); + VIOAPIC_LOCK(vioapic); + } + + /* + * Generate an interrupt if the following conditions are met: + * - pin is not masked + * - previous interrupt has been EOIed + * - pin level is asserted + */ + if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR && + (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 && + (vioapic->rtbl[pin].acnt > 0)) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl " + "write, acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } +} + +static int +vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa, + uint64_t *data, int size, bool doread) +{ + uint64_t offset; + + offset = gpa - VIOAPIC_BASE; + + /* + * The IOAPIC specification allows 32-bit wide accesses to the + * IOREGSEL (offset 0) and IOWIN (offset 16) registers. 
+ */ + if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) { + if (doread) + *data = 0; + return (0); + } + + VIOAPIC_LOCK(vioapic); + if (offset == IOREGSEL) { + if (doread) + *data = vioapic->ioregsel; + else + vioapic->ioregsel = *data; + } else { + if (doread) { + *data = vioapic_read(vioapic, vcpuid, + vioapic->ioregsel); + } else { + vioapic_write(vioapic, vcpuid, vioapic->ioregsel, + *data); + } + } + VIOAPIC_UNLOCK(vioapic); + + return (0); +} + +int +vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true); + return (error); +} + +int +vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval, + int size, void *arg) +{ + int error; + struct vioapic *vioapic; + + vioapic = vm_ioapic(vm); + error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false); + return (error); +} + +void +vioapic_process_eoi(struct vm *vm, int vcpuid, int vector) +{ + struct vioapic *vioapic; + int pin; + + KASSERT(vector >= 0 && vector < 256, + ("vioapic_process_eoi: invalid vector %d", vector)); + + vioapic = vm_ioapic(vm); + VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector); + + /* + * XXX keep track of the pins associated with this vector instead + * of iterating on every single pin each time. 
+ */ + VIOAPIC_LOCK(vioapic); + for (pin = 0; pin < REDIR_ENTRIES; pin++) { + if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0) + continue; + if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector) + continue; + vioapic->rtbl[pin].reg &= ~IOART_REM_IRR; + if (vioapic->rtbl[pin].acnt > 0) { + VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, " + "acnt %d", pin, vioapic->rtbl[pin].acnt); + vioapic_send_intr(vioapic, pin); + } + } + VIOAPIC_UNLOCK(vioapic); +} + +struct vioapic * +vioapic_init(struct vm *vm) +{ + int i; + struct vioapic *vioapic; + + vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO); + + vioapic->vm = vm; + mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN); + + /* Initialize all redirection entries to mask all interrupts */ + for (i = 0; i < REDIR_ENTRIES; i++) + vioapic->rtbl[i].reg = 0x0001000000010000UL; + + return (vioapic); +} + +void +vioapic_cleanup(struct vioapic *vioapic) +{ + + free(vioapic, M_VIOAPIC); +} + +int +vioapic_pincount(struct vm *vm) +{ + + return (REDIR_ENTRIES); +} diff --git a/vmm/io/vioapic.h b/vmm/io/vioapic.h new file mode 100644 index 0000000..65176b3 --- /dev/null +++ b/vmm/io/vioapic.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2013 Tycho Nightingale + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIOAPIC_H_
+#define _VIOAPIC_H_
+
+/*
+ * NOTE(review): presumably the guest-physical base address and size of the
+ * virtual I/O APIC MMIO window -- confirm against the code that registers
+ * the MMIO range.
+ */
+#define VIOAPIC_BASE 0xFEC00000
+#define VIOAPIC_SIZE 4096
+
+/*
+ * Lifecycle: vioapic_init() allocates a vioapic for 'vm' with every
+ * redirection entry masked; vioapic_cleanup() frees it.
+ */
+struct vioapic *vioapic_init(struct vm *vm);
+void vioapic_cleanup(struct vioapic *vioapic);
+
+/* Interrupt pin interface; 'irq' selects the pin. */
+int vioapic_assert_irq(struct vm *vm, int irq);
+int vioapic_deassert_irq(struct vm *vm, int irq);
+int vioapic_pulse_irq(struct vm *vm, int irq);
+
+/*
+ * MMIO access handlers.  'vm' is passed as a void pointer and 'arg' is
+ * unused by the implementation.
+ */
+int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa,
+    uint64_t wval, int size, void *arg);
+int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa,
+    uint64_t *rval, int size, void *arg);
+
+/* Number of interrupt pins on the virtual I/O APIC. */
+int vioapic_pincount(struct vm *vm);
+/*
+ * Called on EOI of 'vector': clears Remote IRR on the matching pins and
+ * resends the interrupt for pins that are still asserted.
+ */
+void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector);
+#endif
diff --git a/vmm/io/vlapic.c b/vmm/io/vlapic.c
new file mode 100644
index 0000000..3451e1e
--- /dev/null
+++ b/vmm/io/vlapic.c
@@ -0,0 +1,1654 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * NOTE(review): the <...> header names of the bare #include lines below
+ * appear to have been stripped from this copy of the patch; restore them
+ * from the upstream file before applying.
+ */
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vlapic.h"
+#include "vlapic_priv.h"
+#include "vioapic.h"
+
+/* Priority class of a vector or TPR/PPR value: bits 7:4. */
+#define PRIO(x) ((x) >> 4)
+
+/* Version number reported in the low bits of the APIC version register. */
+#define VLAPIC_VERSION (16)
+
+/* Evaluates to 1 when the x2APIC bit is set in the vlapic's APIC base MSR. */
+#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
+
+/*
+ * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
+ * vlapic_callout_handler() and vcpu accesses to:
+ * - timer_freq_bt, timer_period_bt, timer_fire_bt
+ * - timer LVT register
+ */
+#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx))
+#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx))
+#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx))
+
+/*
+ * APIC timer frequency:
+ * - arbitrary but chosen to be in the ballpark of contemporary hardware.
+ * - power-of-two to avoid loss of precision when converted to a bintime.
+ */
+#define VLAPIC_BUS_FREQ (128 * 1024 * 1024)
+
+/*
+ * Value of the APIC ID register: the vcpuid itself in x2APIC mode,
+ * otherwise the vcpuid placed in bits 31:24.
+ */
+static __inline uint32_t
+vlapic_get_id(struct vlapic *vlapic)
+{
+
+	if (x2apic(vlapic))
+		return (vlapic->vcpuid);
+	else
+		return (vlapic->vcpuid << 24);
+}
+
+/*
+ * Derive the x2APIC logical destination register from the APIC ID:
+ * one bit selected by the low 4 ID bits, remaining ID bits in the upper half.
+ */
+static uint32_t
+x2apic_ldr(struct vlapic *vlapic)
+{
+	int apicid;
+	uint32_t ldr;
+
+	apicid = vlapic_get_id(vlapic);
+	ldr = 1 << (apicid & 0xf);
+	ldr |= (apicid & 0xffff0) << 12;
+	return (ldr);
+}
+
+/*
+ * Fix up the DFR after the guest wrote it: forced to 0 in x2APIC mode,
+ * otherwise only the model bits are kept and the reserved bits are set.
+ */
+void
+vlapic_dfr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+
+	lapic = vlapic->apic_page;
+	if (x2apic(vlapic)) {
+		VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x",
+		    lapic->dfr);
+		lapic->dfr = 0;
+		return;
+	}
+
+	lapic->dfr &= APIC_DFR_MODEL_MASK;
+	lapic->dfr |= APIC_DFR_RESERVED;
+
+	if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
+		VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
+	else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
+		VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
+	else
+		VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr);
+}
+
+/*
+ * Fix up the LDR after the guest wrote it: restored to the computed value
+ * in x2APIC mode, otherwise the reserved bits are cleared.
+ */
+void
+vlapic_ldr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+
+	lapic = vlapic->apic_page;
+
+	/* LDR is read-only in x2apic mode */
+	if (x2apic(vlapic)) {
+		VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x",
+		    lapic->ldr);
+		lapic->ldr = x2apic_ldr(vlapic);
+	} else {
+		lapic->ldr &= ~APIC_LDR_RESERVED;
+		VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
+	}
+}
+
+void
+vlapic_id_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+
+	/*
+	 * We don't allow the ID register to be modified so reset it back to
+	 * its default value.
+	 */
+	lapic = vlapic->apic_page;
+	lapic->id = vlapic_get_id(vlapic);
+}
+
+/*
+ * Translate the divide configuration register into the timer divisor.
+ * Only bits 0, 1 and 3 of 'dcr' are examined.
+ */
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+	switch (dcr & 0xB) {
+	case APIC_TDCR_1:
+		return (1);
+	case APIC_TDCR_2:
+		return (2);
+	case APIC_TDCR_4:
+		return (4);
+	case APIC_TDCR_8:
+		return (8);
+	case APIC_TDCR_16:
+		return (16);
+	case APIC_TDCR_32:
+		return (32);
+	case APIC_TDCR_64:
+		return (64);
+	case APIC_TDCR_128:
+		return (128);
+	default:
+		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+	}
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+	    *lvt & APIC_LVTT_M);
+}
+#endif
+
+/*
+ * Compute the timer current-count register from the time remaining until
+ * 'timer_fire_bt'.  Returns 0 when the callout is not active or has
+ * already reached its expiry time.
+ */
+static uint32_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+	struct bintime bt_now, bt_rem;
+	struct LAPIC *lapic;
+	uint32_t ccr;
+
+	ccr = 0;
+	lapic = vlapic->apic_page;
+
+	VLAPIC_TIMER_LOCK(vlapic);
+	if (callout_active(&vlapic->callout)) {
+		/*
+		 * If the timer is scheduled to expire in the future then
+		 * compute the value of 'ccr' based on the remaining time.
+		 */
+		binuptime(&bt_now);
+		if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
+			bt_rem = vlapic->timer_fire_bt;
+			bintime_sub(&bt_rem, &bt_now);
+			ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
+			ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
+		}
+	}
+	KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
+	    "icr_timer is %#x", ccr, lapic->icr_timer));
+	VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
+	    ccr, lapic->icr_timer);
+	VLAPIC_TIMER_UNLOCK(vlapic);
+	return (ccr);
+}
+
+/*
+ * Recompute the timer frequency and period after the guest wrote the
+ * divide configuration register.
+ */
+void
+vlapic_dcr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+	int divisor;
+
+	lapic = vlapic->apic_page;
+	VLAPIC_TIMER_LOCK(vlapic);
+
+	divisor = vlapic_timer_divisor(lapic->dcr_timer);
+	VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d",
+	    lapic->dcr_timer, divisor);
+
+	/*
+	 * Update the timer frequency and the timer period.
+	 *
+	 * XXX changes to the frequency divider will not take effect until
+	 * the timer is reloaded.
+	 */
+	FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt);
+	vlapic->timer_period_bt = vlapic->timer_freq_bt;
+	bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
+
+	VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * A write to the ESR latches the accumulated pending errors into the
+ * register and clears the pending set.
+ */
+void
+vlapic_esr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+
+	lapic = vlapic->apic_page;
+	lapic->esr = vlapic->esr_pending;
+	vlapic->esr_pending = 0;
+}
+
+/*
+ * Mark 'vector' as pending in the IRR.
+ *
+ * Returns 0 if the vlapic is software disabled and the interrupt was
+ * dropped.  Returns 1 otherwise, including the case where 'vector' is
+ * below 16 and only an illegal-vector error was recorded.  When the
+ * backend supplies a 'set_intr_ready' op its return value is used instead.
+ */
+int
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
+{
+	struct LAPIC *lapic;
+	uint32_t *irrptr, *tmrptr, mask;
+	int idx;
+
+	KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));
+
+	lapic = vlapic->apic_page;
+	if (!(lapic->svr & APIC_SVR_ENABLE)) {
+		VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
+		    "interrupt %d", vector);
+		return (0);
+	}
+
+	if (vector < 16) {
+		vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR);
+		VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d",
+		    vector);
+		return (1);
+	}
+
+	if (vlapic->ops.set_intr_ready)
+		return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
+
+	/* Registers are 32 bits wide and spaced 4 uint32_t's apart. */
+	idx = (vector / 32) * 4;
+	/* Unsigned constant: shifting a signed 1 into bit 31 is undefined. */
+	mask = 1U << (vector % 32);
+
+	irrptr = &lapic->irr0;
+	atomic_set_int(&irrptr[idx], mask);
+
+	/*
+	 * Verify that the trigger-mode of the interrupt matches with
+	 * the vlapic TMR registers.
+	 */
+	tmrptr = &lapic->tmr0;
+	if ((tmrptr[idx] & mask) != (level ? mask : 0)) {
+		VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but "
+		    "interrupt is %s-triggered", idx / 4, tmrptr[idx],
+		    level ? "level" : "edge");
+	}
+
+	VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+	return (1);
+}
+
+/*
+ * Return a pointer to the LVT register in the APIC page that corresponds
+ * to register 'offset'.  Panics on an offset that is not an LVT register.
+ */
+static __inline uint32_t *
+vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	int i;
+
+	switch (offset) {
+	case APIC_OFFSET_CMCI_LVT:
+		return (&lapic->lvt_cmci);
+	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+		i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+		return ((&lapic->lvt_timer) + i);
+	default:
+		panic("vlapic_get_lvt: invalid LVT\n");
+	}
+}
+
+/*
+ * Map an LVT register offset to its APIC_LVT_* index in 'lvt_last'.
+ */
+static __inline int
+lvt_off_to_idx(uint32_t offset)
+{
+	int index;
+
+	switch (offset) {
+	case APIC_OFFSET_CMCI_LVT:
+		index = APIC_LVT_CMCI;
+		break;
+	case APIC_OFFSET_TIMER_LVT:
+		index = APIC_LVT_TIMER;
+		break;
+	case APIC_OFFSET_THERM_LVT:
+		index = APIC_LVT_THERMAL;
+		break;
+	case APIC_OFFSET_PERF_LVT:
+		index = APIC_LVT_PMC;
+		break;
+	case APIC_OFFSET_LINT0_LVT:
+		index = APIC_LVT_LINT0;
+		break;
+	case APIC_OFFSET_LINT1_LVT:
+		index = APIC_LVT_LINT1;
+		break;
+	case APIC_OFFSET_ERROR_LVT:
+		index = APIC_LVT_ERROR;
+		break;
+	default:
+		index = -1;
+		break;
+	}
+	KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
+	    "invalid lvt index %d for offset %#x", index, offset));
+
+	return (index);
+}
+
+/*
+ * Read the last value stored for an LVT register by
+ * vlapic_lvt_write_handler().
+ */
+static __inline uint32_t
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+	int idx;
+	uint32_t val;
+
+	idx = lvt_off_to_idx(offset);
+	val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
+	return (val);
+}
+
+/*
+ * Sanitize an LVT register after a write: force the mask bit while the
+ * APIC is software disabled, drop the bits that are not writable for this
+ * particular entry, and publish the result in 'lvt_last'.
+ */
+void
+vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
+{
+	uint32_t *lvtptr, mask, val;
+	struct LAPIC *lapic;
+	int idx;
+
+	lapic = vlapic->apic_page;
+	lvtptr = vlapic_get_lvtptr(vlapic, offset);
+	val = *lvtptr;
+	idx = lvt_off_to_idx(offset);
+
+	if (!(lapic->svr & APIC_SVR_ENABLE))
+		val |= APIC_LVT_M;
+	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
+	switch (offset) {
+	case APIC_OFFSET_TIMER_LVT:
+		mask |= APIC_LVTT_TM;
+		break;
+	case APIC_OFFSET_ERROR_LVT:
+		break;
+	case APIC_OFFSET_LINT0_LVT:
+	case APIC_OFFSET_LINT1_LVT:
+		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
+		/* FALLTHROUGH */
+	default:
+		mask |= APIC_LVT_DM;
+		break;
+	}
+	val &= mask;
+	*lvtptr = val;
+	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
+}
+
+/*
+ * Set the mask bit in every LVT entry and run the write handler on each.
+ */
+static void
+vlapic_mask_lvts(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+
+	lapic->lvt_cmci |= APIC_LVT_M;
+	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);
+
+	lapic->lvt_timer |= APIC_LVT_M;
+	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);
+
+	lapic->lvt_thermal |= APIC_LVT_M;
+	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);
+
+	lapic->lvt_pcint |= APIC_LVT_M;
+	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);
+
+	lapic->lvt_lint0 |= APIC_LVT_M;
+	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);
+
+	lapic->lvt_lint1 |= APIC_LVT_M;
+	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);
+
+	lapic->lvt_error |= APIC_LVT_M;
+	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
+}
+
+/*
+ * Deliver the interrupt described by the LVT value 'lvt'.
+ *
+ * Returns 1 if a fixed, NMI or ExtINT delivery was attempted and 0 if the
+ * entry is masked, carries an illegal vector or uses another delivery mode.
+ */
+static int
+vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt)
+{
+	uint32_t vec, mode;
+
+	if (lvt & APIC_LVT_M)
+		return (0);
+
+	vec = lvt & APIC_LVT_VECTOR;
+	mode = lvt & APIC_LVT_DM;
+
+	switch (mode) {
+	case APIC_LVT_DM_FIXED:
+		if (vec < 16) {
+			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+			return (0);
+		}
+		if (vlapic_set_intr_ready(vlapic, vec, false))
+			vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true);
+		break;
+	case APIC_LVT_DM_NMI:
+		vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
+		break;
+	case APIC_LVT_DM_EXTINT:
+		vm_inject_extint(vlapic->vm, vlapic->vcpuid);
+		break;
+	default:
+		// Other modes ignored
+		return (0);
+	}
+	return (1);
+}
+
+#if 1
+/*
+ * Debug aid: print the ISR registers and the in-service vector stack.
+ */
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+	int i;
+	uint32_t *isrptr;
+
+	isrptr = &vlapic->apic_page->isr0;
+	for (i = 0; i < 8; i++)
+		printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+	for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+		printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+	int isrvec, tpr, ppr;
+
+	/*
+	 * Note that the value on the stack at index 0 is always 0.
+	 *
+	 * This is a placeholder for the value of ISRV when none of the
+	 * bits is set in the ISRx registers.
+	 */
+	isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+	tpr = vlapic->apic_page->tpr;
+
+#if 1
+	{
+		int i, lastprio, curprio, vector, idx;
+		uint32_t *isrptr;
+
+		if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+			panic("isrvec_stk is corrupted: %d", isrvec);
+
+		/*
+		 * Make sure that the priority of the nested interrupts is
+		 * always increasing.
+		 */
+		lastprio = -1;
+		for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+			curprio = PRIO(vlapic->isrvec_stk[i]);
+			if (curprio <= lastprio) {
+				dump_isrvec_stk(vlapic);
+				panic("isrvec_stk does not satisfy invariant");
+			}
+			lastprio = curprio;
+		}
+
+		/*
+		 * Make sure that each bit set in the ISRx registers has a
+		 * corresponding entry on the isrvec stack.
+		 */
+		i = 1;
+		isrptr = &vlapic->apic_page->isr0;
+		for (vector = 0; vector < 256; vector++) {
+			idx = (vector / 32) * 4;
+			if (isrptr[idx] & (1U << (vector % 32))) {
+				if (i > vlapic->isrvec_stk_top ||
+				    vlapic->isrvec_stk[i] != vector) {
+					dump_isrvec_stk(vlapic);
+					panic("ISR and isrvec_stk out of sync");
+				}
+				i++;
+			}
+		}
+	}
+#endif
+
+	if (PRIO(tpr) >= PRIO(isrvec))
+		ppr = tpr;
+	else
+		ppr = isrvec & 0xf0;
+
+	vlapic->apic_page->ppr = ppr;
+	VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");
+
+/*
+ * Handle a write to the EOI register: retire the highest in-service
+ * vector, recompute the PPR and, if the vector's TMR bit is set, forward
+ * the EOI to the vioapic.  An EOI with nothing in service is only counted.
+ */
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	uint32_t *isrptr, *tmrptr;
+	int i, idx, bitpos, vector;
+
+	isrptr = &lapic->isr0;
+	tmrptr = &lapic->tmr0;
+
+	for (i = 7; i >= 0; i--) {
+		idx = i * 4;
+		bitpos = fls(isrptr[idx]);
+		if (bitpos-- != 0) {
+			if (vlapic->isrvec_stk_top <= 0) {
+				panic("invalid vlapic isrvec_stk_top %d",
+				    vlapic->isrvec_stk_top);
+			}
+			/* 1U: 'bitpos' may be 31. */
+			isrptr[idx] &= ~(1U << bitpos);
+			vector = i * 32 + bitpos;
+			VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d",
+			    vector);
+			VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+			vlapic->isrvec_stk_top--;
+			vlapic_update_ppr(vlapic);
+			if ((tmrptr[idx] & (1U << bitpos)) != 0) {
+				vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
+				    vector);
+			}
+			return;
+		}
+	}
+	VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI");
+	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1);
+}
+
+/* Extract the bits selected by 'mask' from an LVT value. */
+static __inline int
+vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
+{
+
+	return (lvt & mask);
+}
+
+/* Non-zero when the timer LVT has the periodic mode bit set. */
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+	uint32_t lvt;
+
+	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+	return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");
+
+/*
+ * Record the error bits in 'mask' as pending and fire the error LVT.
+ * 'esr_firing' guards against re-entry while the error LVT is being fired.
+ */
+void
+vlapic_set_error(struct vlapic *vlapic, uint32_t mask)
+{
+	uint32_t lvt;
+
+	vlapic->esr_pending |= mask;
+	if (vlapic->esr_firing)
+		return;
+	vlapic->esr_firing = 1;
+
+	// The error LVT always uses the fixed delivery mode.
+	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
+	if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
+		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1);
+	}
+	vlapic->esr_firing = 0;
+}
+
+static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");
+
+/*
+ * Fire the timer LVT.  The caller must hold the timer lock.
+ */
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+	uint32_t lvt;
+
+	KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));
+
+	// The timer LVT always uses the fixed delivery mode.
+	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+	if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
+		VLAPIC_CTR0(vlapic, "vlapic timer fired");
+		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
+	}
+}
+
+static VMM_STAT(VLAPIC_INTR_CMC,
+    "corrected machine check interrupts generated by vlapic");
+
+/*
+ * Fire the CMCI LVT.
+ */
+void
+vlapic_fire_cmci(struct vlapic *vlapic)
+{
+	uint32_t lvt;
+
+	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
+	if (vlapic_fire_lvt(vlapic, lvt)) {
+		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
+	}
+}
+
+static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1,
+    "lvts triggered");
+
+/*
+ * Trigger the LVT entry selected by 'vector', which is an APIC_LVT_*
+ * index rather than an interrupt vector.
+ *
+ * Returns EINVAL for an unknown index while the APIC is enabled and 0 in
+ * all other cases.
+ */
+int
+vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
+{
+	uint32_t lvt;
+
+	if (vlapic_enabled(vlapic) == false) {
+		/*
+		 * When the local APIC is global/hardware disabled,
+		 * LINT[1:0] pins are configured as INTR and NMI pins,
+		 * respectively.
+		 */
+		switch (vector) {
+		case APIC_LVT_LINT0:
+			vm_inject_extint(vlapic->vm, vlapic->vcpuid);
+			break;
+		case APIC_LVT_LINT1:
+			vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
+			break;
+		default:
+			break;
+		}
+		return (0);
+	}
+
+	switch (vector) {
+	case APIC_LVT_LINT0:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT);
+		break;
+	case APIC_LVT_LINT1:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT);
+		break;
+	case APIC_LVT_TIMER:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+		lvt |= APIC_LVT_DM_FIXED;
+		break;
+	case APIC_LVT_ERROR:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
+		lvt |= APIC_LVT_DM_FIXED;
+		break;
+	case APIC_LVT_PMC:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT);
+		break;
+	case APIC_LVT_THERMAL:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT);
+		break;
+	case APIC_LVT_CMCI:
+		lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
+		break;
+	default:
+		return (EINVAL);
+	}
+	if (vlapic_fire_lvt(vlapic, lvt)) {
+		vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
+		    LVTS_TRIGGERRED, vector, 1);
+	}
+	return (0);
+}
+
+/*
+ * Callout handler for the APIC timer: fires the timer interrupt and, in
+ * periodic mode, schedules the next expiry.
+ */
+static void
+vlapic_callout_handler(void *arg)
+{
+	struct vlapic *vlapic;
+	struct bintime bt, btnow;
+	sbintime_t rem_sbt;
+
+	vlapic = arg;
+
+	VLAPIC_TIMER_LOCK(vlapic);
+	if (callout_pending(&vlapic->callout))	/* callout was reset */
+		goto done;
+
+	if (!callout_active(&vlapic->callout))	/* callout was stopped */
+		goto done;
+
+	callout_deactivate(&vlapic->callout);
+
+	vlapic_fire_timer(vlapic);
+
+	if (vlapic_periodic_timer(vlapic)) {
+		binuptime(&btnow);
+		KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
+		    ("vlapic callout at %#lx.%#lx, expected at %#lx.%#lx",
+		    btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
+		    vlapic->timer_fire_bt.frac));
+
+		/*
+		 * Compute the delta between when the timer was supposed to
+		 * fire and the present time.
+		 */
+		bt = btnow;
+		bintime_sub(&bt, &vlapic->timer_fire_bt);
+
+		rem_sbt = bttosbt(vlapic->timer_period_bt);
+		if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
+			/*
+			 * Adjust the time until the next countdown downward
+			 * to account for the lost time.
+			 */
+			rem_sbt -= bttosbt(bt);
+		} else {
+			/*
+			 * If the delta is greater than the timer period then
+			 * just reset our time base instead of trying to catch
+			 * up.
+			 */
+			vlapic->timer_fire_bt = btnow;
+			VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
+			    "usecs, period is %lu usecs - resetting time base",
+			    bttosbt(bt) / SBT_1US,
+			    bttosbt(vlapic->timer_period_bt) / SBT_1US);
+		}
+
+		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+		callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
+		    vlapic_callout_handler, vlapic, 0);
+	}
+done:
+	VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * Handle a write to the timer initial-count register: recompute the
+ * period and (re)arm the callout, or stop it when the count is zero.
+ */
+void
+vlapic_icrtmr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+	sbintime_t sbt;
+	uint32_t icr_timer;
+
+	VLAPIC_TIMER_LOCK(vlapic);
+
+	lapic = vlapic->apic_page;
+	icr_timer = lapic->icr_timer;
+
+	vlapic->timer_period_bt = vlapic->timer_freq_bt;
+	bintime_mul(&vlapic->timer_period_bt, icr_timer);
+
+	if (icr_timer != 0) {
+		binuptime(&vlapic->timer_fire_bt);
+		bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+
+		sbt = bttosbt(vlapic->timer_period_bt);
+		callout_reset_sbt(&vlapic->callout, sbt, 0,
+		    vlapic_callout_handler, vlapic, 0);
+	} else
+		callout_stop(&vlapic->callout);
+
+	VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * This function populates 'dmask' with the set of vcpus that match the
+ * addressing specified by the (dest, phys, lowprio) tuple.
+ *
+ * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
+ * or xAPIC (8-bit) destination field.
+ */
+static void
+vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
+    bool lowprio, bool x2apic_dest)
+{
+	struct vlapic *vlapic;
+	uint32_t dfr, ldr, ldest, cluster;
+	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
+	cpuset_t amask;
+	int vcpuid;
+
+	if ((x2apic_dest && dest == 0xffffffff) ||
+	    (!x2apic_dest && dest == 0xff)) {
+		/*
+		 * Broadcast in both logical and physical modes.
+		 */
+		*dmask = vm_active_cpus(vm);
+		return;
+	}
+
+	if (phys) {
+		/*
+		 * Physical mode: destination is APIC ID.
+		 */
+		CPU_ZERO(dmask);
+		vcpuid = vm_apicid2vcpuid(vm, dest);
+		if (vcpuid < VM_MAXCPU)
+			CPU_SET(vcpuid, dmask);
+	} else {
+		/*
+		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
+		 * bitmask. This model is only available in the xAPIC mode.
+		 */
+		mda_flat_ldest = dest & 0xff;
+
+		/*
+		 * In the "Cluster Model" the MDA is used to identify a
+		 * specific cluster and a set of APICs in that cluster.
+		 */
+		if (x2apic_dest) {
+			mda_cluster_id = dest >> 16;
+			mda_cluster_ldest = dest & 0xffff;
+		} else {
+			mda_cluster_id = (dest >> 4) & 0xf;
+			mda_cluster_ldest = dest & 0xf;
+		}
+
+		/*
+		 * Logical mode: match each APIC that has a bit set
+		 * in its LDR that matches a bit in the ldest.
+		 */
+		CPU_ZERO(dmask);
+		amask = vm_active_cpus(vm);
+		while ((vcpuid = CPU_FFS(&amask)) != 0) {
+			vcpuid--;
+			CPU_CLR(vcpuid, &amask);
+
+			vlapic = vm_lapic(vm, vcpuid);
+			dfr = vlapic->apic_page->dfr;
+			ldr = vlapic->apic_page->ldr;
+
+			if ((dfr & APIC_DFR_MODEL_MASK) ==
+			    APIC_DFR_MODEL_FLAT) {
+				ldest = ldr >> 24;
+				mda_ldest = mda_flat_ldest;
+			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
+			    APIC_DFR_MODEL_CLUSTER) {
+				if (x2apic(vlapic)) {
+					cluster = ldr >> 16;
+					ldest = ldr & 0xffff;
+				} else {
+					cluster = ldr >> 28;
+					ldest = (ldr >> 24) & 0xf;
+				}
+				if (cluster != mda_cluster_id)
+					continue;
+				mda_ldest = mda_cluster_ldest;
+			} else {
+				/*
+				 * Guest has configured a bad logical
+				 * model for this vcpu - skip it.
+				 */
+				VLAPIC_CTR1(vlapic, "vlapic has bad logical "
+				    "model %x - cannot deliver interrupt", dfr);
+				continue;
+			}
+
+			if ((mda_ldest & ldest) != 0) {
+				CPU_SET(vcpuid, dmask);
+				if (lowprio)
+					break;
+			}
+		}
+	}
+}
+
+static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
+
+/*
+ * Store a new TPR value and recompute the PPR if the value changed.
+ */
+static void
+vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+
+	if (lapic->tpr != val) {
+		VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed "
+		    "from %#x to %#x", lapic->tpr, val);
+		lapic->tpr = val;
+		vlapic_update_ppr(vlapic);
+	}
+}
+
+static uint8_t
+vlapic_get_tpr(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+
+	return (lapic->tpr);
+}
+
+/*
+ * CR8 write: the 4-bit value becomes TPR bits 7:4.  A value with any bit
+ * above bit 3 set injects #GP into the guest instead.
+ */
+void
+vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
+{
+	uint8_t tpr;
+
+	if (val & ~0xf) {
+		vm_inject_gp(vlapic->vm, vlapic->vcpuid);
+		return;
+	}
+
+	tpr = val << 4;
+	vlapic_set_tpr(vlapic, tpr);
+}
+
+/*
+ * CR8 read: TPR bits 7:4.
+ */
+uint64_t
+vlapic_get_cr8(struct vlapic *vlapic)
+{
+	uint8_t tpr;
+
+	tpr = vlapic_get_tpr(vlapic);
+	return (tpr >> 4);
+}
+
+/*
+ * Handle a write to the low half of the ICR, i.e. a request to send an IPI.
+ *
+ * Fixed and NMI IPIs are delivered here.  INIT and STARTUP IPIs sent by
+ * vcpu 0 drive the boot state of the target vcpu; an accepted STARTUP IPI
+ * sets '*retu' and fills in a VM_EXITCODE_SPINUP_AP exit.
+ *
+ * Returns 0 when the write was handled here and 1 when it was not.
+ */
+int
+vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
+{
+	int i;
+	bool phys;
+	cpuset_t dmask;
+	uint64_t icrval;
+	uint32_t dest, vec, mode;
+	struct vlapic *vlapic2;
+	struct vm_exit *vmexit;
+	struct LAPIC *lapic;
+
+	lapic = vlapic->apic_page;
+	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
+	icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;
+
+	if (x2apic(vlapic))
+		dest = icrval >> 32;
+	else
+		dest = icrval >> (32 + 24);
+	vec = icrval & APIC_VECTOR_MASK;
+	mode = icrval & APIC_DELMODE_MASK;
+
+	if (mode == APIC_DELMODE_FIXED && vec < 16) {
+		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+		VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
+		return (0);
+	}
+
+	VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);
+
+	if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+		switch (icrval & APIC_DEST_MASK) {
+		case APIC_DEST_DESTFLD:
+			phys = ((icrval & APIC_DESTMODE_LOG) == 0);
+			vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
+			    x2apic(vlapic));
+			break;
+		case APIC_DEST_SELF:
+			CPU_SETOF(vlapic->vcpuid, &dmask);
+			break;
+		case APIC_DEST_ALLISELF:
+			dmask = vm_active_cpus(vlapic->vm);
+			break;
+		case APIC_DEST_ALLESELF:
+			dmask = vm_active_cpus(vlapic->vm);
+			CPU_CLR(vlapic->vcpuid, &dmask);
+			break;
+		default:
+			CPU_ZERO(&dmask);	/* satisfy gcc */
+			break;
+		}
+
+		while ((i = CPU_FFS(&dmask)) != 0) {
+			i--;
+			CPU_CLR(i, &dmask);
+			if (mode == APIC_DELMODE_FIXED) {
+				lapic_intr_edge(vlapic->vm, i, vec);
+				vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
+				    IPIS_SENT, i, 1);
+				VLAPIC_CTR2(vlapic, "vlapic sending ipi %d "
+				    "to vcpuid %d", vec, i);
+			} else {
+				vm_inject_nmi(vlapic->vm, i);
+				VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi "
+				    "to vcpuid %d", i);
+			}
+		}
+
+		return (0);	/* handled completely in the kernel */
+	}
+
+	if (mode == APIC_DELMODE_INIT) {
+		if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
+			return (0);
+
+		if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+			vlapic2 = vm_lapic(vlapic->vm, dest);
+
+			/* move from INIT to waiting-for-SIPI state */
+			if (vlapic2->boot_state == BS_INIT) {
+				vlapic2->boot_state = BS_SIPI;
+			}
+
+			return (0);
+		}
+	}
+
+	if (mode == APIC_DELMODE_STARTUP) {
+		if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+			vlapic2 = vm_lapic(vlapic->vm, dest);
+
+			/*
+			 * Ignore SIPIs in any state other than wait-for-SIPI
+			 */
+			if (vlapic2->boot_state != BS_SIPI)
+				return (0);
+
+			vlapic2->boot_state = BS_RUNNING;
+
+			*retu = true;
+			vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+			vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+			vmexit->u.spinup_ap.vcpu = dest;
+			vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+			return (0);
+		}
+	}
+
+	/*
+	 * This will cause a return to userland.
+	 */
+	return (1);
+}
+
+/*
+ * Handle a write to the x2APIC SELF_IPI register: send an edge-triggered
+ * interrupt with the vector in the low 8 bits of 'val' to this vcpu.
+ */
+void
+vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val)
+{
+	int vec;
+
+	KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode"));
+
+	vec = val & 0xff;
+	lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
+	vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT,
+	    vlapic->vcpuid, 1);
+	VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec);
+}
+
+/*
+ * Report whether an interrupt is pending whose priority class exceeds
+ * that of the PPR.  Returns 1 and stores the vector in '*vecptr' (when
+ * non-NULL) if so, 0 otherwise.  A backend 'pending_intr' op, when
+ * present, replaces this logic.
+ */
+int
+vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	int idx, i, bitpos, vector;
+	uint32_t *irrptr, val;
+
+	if (vlapic->ops.pending_intr)
+		return ((*vlapic->ops.pending_intr)(vlapic, vecptr));
+
+	irrptr = &lapic->irr0;
+
+	for (i = 7; i >= 0; i--) {
+		idx = i * 4;
+		val = atomic_load_acq_int(&irrptr[idx]);
+		bitpos = fls(val);
+		if (bitpos != 0) {
+			vector = i * 32 + (bitpos - 1);
+			if (PRIO(vector) > PRIO(lapic->ppr)) {
+				VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+				if (vecptr != NULL)
+					*vecptr = vector;
+				return (1);
+			} else
+				break;
+		}
+	}
+	return (0);
+}
+
+/*
+ * The vcpu has accepted 'vector': move it from the IRR to the ISR, push
+ * it on the in-service stack and recompute the PPR.  A backend
+ * 'intr_accepted' op, when present, replaces this logic.
+ */
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	uint32_t *irrptr, *isrptr;
+	int idx, stk_top;
+
+	if (vlapic->ops.intr_accepted) {
+		/* A void function must not return an expression. */
+		(*vlapic->ops.intr_accepted)(vlapic, vector);
+		return;
+	}
+
+	/*
+	 * clear the ready bit for vector being accepted in irr
+	 * and set the vector as in service in isr.
+	 */
+	idx = (vector / 32) * 4;
+
+	irrptr = &lapic->irr0;
+	atomic_clear_int(&irrptr[idx], 1U << (vector % 32));
+	VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+	isrptr = &lapic->isr0;
+	isrptr[idx] |= 1U << (vector % 32);
+	VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+	/*
+	 * Update the PPR
+	 */
+	vlapic->isrvec_stk_top++;
+
+	stk_top = vlapic->isrvec_stk_top;
+	if (stk_top >= ISRVEC_STK_SIZE)
+		panic("isrvec_stk_top overflow %d", stk_top);
+
+	vlapic->isrvec_stk[stk_top] = vector;
+	vlapic_update_ppr(vlapic);
+}
+
+/*
+ * Handle a write to the spurious interrupt vector register, acting only
+ * on a change of the software-enable bit.
+ */
+void
+vlapic_svr_write_handler(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+	uint32_t old, new, changed;
+
+	lapic = vlapic->apic_page;
+
+	new = lapic->svr;
+	old = vlapic->svr_last;
+	vlapic->svr_last = new;
+
+	changed = old ^ new;
+	if ((changed & APIC_SVR_ENABLE) != 0) {
+		if ((new & APIC_SVR_ENABLE) == 0) {
+			/*
+			 * The apic is now disabled so stop the apic timer
+			 * and mask all the LVT entries.
+			 */
+			VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
+			VLAPIC_TIMER_LOCK(vlapic);
+			callout_stop(&vlapic->callout);
+			VLAPIC_TIMER_UNLOCK(vlapic);
+			vlapic_mask_lvts(vlapic);
+		} else {
+			/*
+			 * The apic is now enabled so restart the apic timer
+			 * if it is configured in periodic mode.
+			 */
+			VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
+			if (vlapic_periodic_timer(vlapic))
+				vlapic_icrtmr_write_handler(vlapic);
+		}
+	}
+}
+
+/*
+ * Read the APIC register at 'offset' into '*data'.
+ *
+ * 'mmio_access' is non-zero for an MMIO access and zero for an MSR access.
+ * An access that does not match the current APIC mode, or an offset beyond
+ * the APIC page, reads as 0.  Always returns 0; 'retu' is not used.
+ */
+int
+vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
+    uint64_t *data, bool *retu)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	uint32_t *reg;
+	int i;
+
+	/* Ignore MMIO accesses in x2APIC mode */
+	if (x2apic(vlapic) && mmio_access) {
+		VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode",
+		    offset);
+		*data = 0;
+		goto done;
+	}
+
+	if (!x2apic(vlapic) && !mmio_access) {
+		/*
+		 * XXX Generate GP fault for MSR accesses in xAPIC mode
+		 */
+		VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in "
+		    "xAPIC mode", offset);
+		*data = 0;
+		goto done;
+	}
+
+	if (offset > sizeof(*lapic)) {
+		*data = 0;
+		goto done;
+	}
+
+	offset &= ~3;
+	switch (offset) {
+	case APIC_OFFSET_ID:
+		*data = lapic->id;
+		break;
+	case APIC_OFFSET_VER:
+		*data = lapic->version;
+		break;
+	case APIC_OFFSET_TPR:
+		*data = vlapic_get_tpr(vlapic);
+		break;
+	case APIC_OFFSET_APR:
+		*data = lapic->apr;
+		break;
+	case APIC_OFFSET_PPR:
+		*data = lapic->ppr;
+		break;
+	case APIC_OFFSET_EOI:
+		*data = lapic->eoi;
+		break;
+	case APIC_OFFSET_LDR:
+		*data = lapic->ldr;
+		break;
+	case APIC_OFFSET_DFR:
+		*data = lapic->dfr;
+		break;
+	case APIC_OFFSET_SVR:
+		*data = lapic->svr;
+		break;
+	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+		i = (offset - APIC_OFFSET_ISR0) >> 2;
+		reg = &lapic->isr0;
+		*data = *(reg + i);
+		break;
+	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+		i = (offset - APIC_OFFSET_TMR0) >> 2;
+		reg = &lapic->tmr0;
+		*data = *(reg + i);
+		break;
+	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+		i = (offset - APIC_OFFSET_IRR0) >> 2;
+		reg = &lapic->irr0;
+		*data = atomic_load_acq_int(reg + i);
+		break;
+	case APIC_OFFSET_ESR:
+		*data = lapic->esr;
+		break;
+	case APIC_OFFSET_ICR_LOW:
+		*data = lapic->icr_lo;
+		if (x2apic(vlapic))
+			*data |= (uint64_t)lapic->icr_hi << 32;
+		break;
+	case APIC_OFFSET_ICR_HI:
+		*data = lapic->icr_hi;
+		break;
+	case APIC_OFFSET_CMCI_LVT:
+	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+		*data = vlapic_get_lvt(vlapic, offset);
+#ifdef INVARIANTS
+		reg = vlapic_get_lvtptr(vlapic, offset);
+		KASSERT(*data == *reg, ("inconsistent lvt value at "
+		    "offset %#lx: %#lx/%#x", offset, *data, *reg));
+#endif
+		break;
+	case APIC_OFFSET_TIMER_ICR:
+		*data = lapic->icr_timer;
+		break;
+	case APIC_OFFSET_TIMER_CCR:
+		*data = vlapic_get_ccr(vlapic);
+		break;
+	case APIC_OFFSET_TIMER_DCR:
+		*data = lapic->dcr_timer;
+		break;
+	case APIC_OFFSET_SELF_IPI:
+		/*
+		 * XXX generate a GP fault if vlapic is in x2apic mode
+		 */
+		*data = 0;
+		break;
+	case APIC_OFFSET_RRR:
+	default:
+		*data = 0;
+		break;
+	}
+done:
+	/* 'offset' is 64 bits wide, so it needs the 'l' length modifier. */
+	VLAPIC_CTR2(vlapic, "vlapic read offset %#lx, data %#lx", offset,
+	    *data);
+	return 0;
+}
+
+/*
+ * Write 'data' to the APIC register at 'offset' and run the matching
+ * write handler.
+ *
+ * 'mmio_access' is non-zero for an MMIO access and zero for an MSR access.
+ * An access that does not match the current APIC mode, a write to a
+ * read-only register, or an offset beyond the APIC page is ignored.
+ * Returns 0 except for ICR_LOW writes, which return the result of
+ * vlapic_icrlo_write_handler().
+ */
+int
+vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
+    uint64_t data, bool *retu)
+{
+	struct LAPIC *lapic = vlapic->apic_page;
+	uint32_t *regptr;
+	int retval;
+
+	KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE,
+	    ("vlapic_write: invalid offset %#lx", offset));
+
+	VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx",
+	    offset, data);
+
+	if (offset > sizeof(*lapic))
+		return (0);
+
+	/* Ignore MMIO accesses in x2APIC mode */
+	if (x2apic(vlapic) && mmio_access) {
+		VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx "
+		    "in x2APIC mode", data, offset);
+		return (0);
+	}
+
+	/*
+	 * XXX Generate GP fault for MSR accesses in xAPIC mode
+	 */
+	if (!x2apic(vlapic) && !mmio_access) {
+		VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx "
+		    "in xAPIC mode", data, offset);
+		return (0);
+	}
+
+	retval = 0;
+	switch (offset) {
+	case APIC_OFFSET_ID:
+		lapic->id = data;
+		vlapic_id_write_handler(vlapic);
+		break;
+	case APIC_OFFSET_TPR:
+		vlapic_set_tpr(vlapic, data & 0xff);
+		break;
+	case APIC_OFFSET_EOI:
+		vlapic_process_eoi(vlapic);
+		break;
+	case APIC_OFFSET_LDR:
+		lapic->ldr = data;
+		vlapic_ldr_write_handler(vlapic);
+		break;
+	case APIC_OFFSET_DFR:
+		lapic->dfr = data;
+		vlapic_dfr_write_handler(vlapic);
+		break;
+	case APIC_OFFSET_SVR:
+		lapic->svr = data;
+		vlapic_svr_write_handler(vlapic);
+		break;
+	case APIC_OFFSET_ICR_LOW:
+		lapic->icr_lo = data;
+		if (x2apic(vlapic))
+			lapic->icr_hi = data >> 32;
+		retval = vlapic_icrlo_write_handler(vlapic, retu);
+		break;
+	case APIC_OFFSET_ICR_HI:
+		lapic->icr_hi = data;
+		break;
+	case APIC_OFFSET_CMCI_LVT:
+	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+		regptr = vlapic_get_lvtptr(vlapic, offset);
+		*regptr = data;
+		vlapic_lvt_write_handler(vlapic, offset);
+		break;
+	case APIC_OFFSET_TIMER_ICR:
+		lapic->icr_timer = data;
+		vlapic_icrtmr_write_handler(vlapic);
+		break;
+
+	case APIC_OFFSET_TIMER_DCR:
+		lapic->dcr_timer = data;
+		vlapic_dcr_write_handler(vlapic);
+		break;
+
+	case APIC_OFFSET_ESR:
+		vlapic_esr_write_handler(vlapic);
+		break;
+
+	case APIC_OFFSET_SELF_IPI:
+		if (x2apic(vlapic))
+			vlapic_self_ipi_handler(vlapic, data);
+		break;
+
+	case APIC_OFFSET_VER:
+	case APIC_OFFSET_APR:
+	case APIC_OFFSET_PPR:
+	case APIC_OFFSET_RRR:
+	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+	case APIC_OFFSET_TIMER_CCR:
+	default:
+		// Read only.
+		break;
+	}
+
+	return (retval);
+}
+
+/*
+ * Bring the vlapic to its reset state: cleared APIC page, default ID,
+ * version, DFR and SVR, all LVTs masked, timer divider reprogrammed.
+ * vcpu 0 starts in BS_RUNNING, every other vcpu in BS_INIT.
+ */
+static void
+vlapic_reset(struct vlapic *vlapic)
+{
+	struct LAPIC *lapic;
+
+	lapic = vlapic->apic_page;
+	bzero(lapic, sizeof(struct LAPIC));
+
+	lapic->id = vlapic_get_id(vlapic);
+	lapic->version = VLAPIC_VERSION;
+	lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
+	lapic->dfr = 0xffffffff;
+	lapic->svr = APIC_SVR_VECTOR;
+	vlapic_mask_lvts(vlapic);
+	vlapic_reset_tmr(vlapic);
+
+	lapic->dcr_timer = 0;
+	vlapic_dcr_write_handler(vlapic);
+
+	if (vlapic->vcpuid == 0)
+		vlapic->boot_state = BS_RUNNING;	/* BSP */
+	else
+		vlapic->boot_state = BS_INIT;		/* AP */
+
+	vlapic->svr_last = lapic->svr;
+}
+
+void
+vlapic_init(struct vlapic *vlapic)
+{
+	KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
+	KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU,
+	    ("vlapic_init: vcpuid is not initialized"));
+	KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
+	    "initialized"));
+
+	/*
+	 * If the vlapic is configured in x2apic mode then it will be
+	 * accessed in the critical section via the MSR emulation code.
+	 *
+	 * Therefore the timer mutex must be a spinlock because blockable
+	 * mutexes cannot be acquired in a critical section.
+ */ + mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); + callout_init(&vlapic->callout, 1); + + vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; + + if (vlapic->vcpuid == 0) + vlapic->msr_apicbase |= APICBASE_BSP; + + vlapic_reset(vlapic); +} + +void +vlapic_cleanup(struct vlapic *vlapic) +{ + + callout_drain(&vlapic->callout); +} + +uint64_t +vlapic_get_apicbase(struct vlapic *vlapic) +{ + + return (vlapic->msr_apicbase); +} + +int +vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) +{ + + if (vlapic->msr_apicbase != new) { + VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " + "not supported", vlapic->msr_apicbase, new); + return (-1); + } + + return (0); +} + +void +vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + struct vlapic *vlapic; + struct LAPIC *lapic; + + vlapic = vm_lapic(vm, vcpuid); + + if (state == X2APIC_DISABLED) + vlapic->msr_apicbase &= ~APICBASE_X2APIC; + else + vlapic->msr_apicbase |= APICBASE_X2APIC; + + /* + * Reset the local APIC registers whose values are mode-dependent. + * + * XXX this works because the APIC mode can be changed only at vcpu + * initialization time. 
+ */ + lapic = vlapic->apic_page; + lapic->id = vlapic_get_id(vlapic); + if (x2apic(vlapic)) { + lapic->ldr = x2apic_ldr(vlapic); + lapic->dfr = 0; + } else { + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + } + + if (state == X2APIC_ENABLED) { + if (vlapic->ops.enable_x2apic_mode) + (*vlapic->ops.enable_x2apic_mode)(vlapic); + } +} + +void +vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec) +{ + bool lowprio; + int vcpuid; + cpuset_t dmask; + + if (delmode != IOART_DELFIXED && + delmode != IOART_DELLOPRI && + delmode != IOART_DELEXINT) { + VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); + return; + } + lowprio = (delmode == IOART_DELLOPRI); + + /* + * We don't provide any virtual interrupt redirection hardware so + * all interrupts originating from the ioapic or MSI specify the + * 'dest' in the legacy xAPIC format. + */ + vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); + + while ((vcpuid = CPU_FFS(&dmask)) != 0) { + vcpuid--; + CPU_CLR(vcpuid, &dmask); + if (delmode == IOART_DELEXINT) { + vm_inject_extint(vm, vcpuid); + } else { + lapic_set_intr(vm, vcpuid, vec, level); + } + } +} + +void +vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) +{ + /* + * Post an interrupt to the vcpu currently running on 'hostcpu'. + * + * This is done by leveraging features like Posted Interrupts (Intel) + * Doorbell MSR (AMD AVIC) that avoid a VM exit. + * + * If neither of these features are available then fallback to + * sending an IPI to 'hostcpu'. 
+ */ + if (vlapic->ops.post_intr) + (*vlapic->ops.post_intr)(vlapic, hostcpu); + else + ipi_cpu(hostcpu, ipinum); +} + +bool +vlapic_enabled(struct vlapic *vlapic) +{ + struct LAPIC *lapic = vlapic->apic_page; + + if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && + (lapic->svr & APIC_SVR_ENABLE) != 0) + return (true); + else + return (false); +} + +static void +vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +{ /* Set (level) or clear (edge) the trigger-mode bit of 'vector' in the APIC page, then call the optional ops.set_tmr hook. */ + struct LAPIC *lapic; + uint32_t *tmrptr, mask; + int idx; + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + idx = (vector / 32) * 4; /* successive TMR words are 4 uint32_t apart */ + mask = 1U << (vector % 32); /* unsigned: the shift count reaches 31, which overflows a signed int */ + if (level) + tmrptr[idx] |= mask; + else + tmrptr[idx] &= ~mask; + + if (vlapic->ops.set_tmr != NULL) + (*vlapic->ops.set_tmr)(vlapic, vector, level); +} + +void +vlapic_reset_tmr(struct vlapic *vlapic) +{ + int vector; + + VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + + for (vector = 0; vector <= 255; vector++) + vlapic_set_tmr(vlapic, vector, false); +} + +void +vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector) +{ + cpuset_t dmask; + bool lowprio; + + KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + + /* + * A level trigger is valid only for fixed and lowprio delivery modes. + */ + if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { + VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " + "delivery-mode %d", delmode); + return; + } + + lowprio = (delmode == APIC_DELMODE_LOWPRIO); + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); + + if (!CPU_ISSET(vlapic->vcpuid, &dmask)) + return; + + VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); + vlapic_set_tmr(vlapic, vector, true); +} diff --git a/vmm/io/vlapic.h b/vmm/io/vlapic.h new file mode 100644 index 0000000..0e68b2f --- /dev/null +++ b/vmm/io/vlapic.h @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_H_ +#define _VLAPIC_H_ + +struct vm; +enum x2apic_state; + +int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t data, bool *retu); +int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, + uint64_t *data, bool *retu); + +/* + * Returns 0 if there is no eligible vector that can be delivered to the + * guest at this time and non-zero otherwise. + * + * If an eligible vector number is found and 'vecptr' is not NULL then it will + * be stored in the location pointed to by 'vecptr'. + * + * Note that the vector does not automatically transition to the ISR as a + * result of calling this function. 
+ */ +int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); + +/* + * Transition 'vector' from IRR to ISR. This function is called with the + * vector returned by 'vlapic_pending_intr()' when the guest is able to + * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that + * block interrupt delivery). + */ +void vlapic_intr_accepted(struct vlapic *vlapic, int vector); + +/* + * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. + */ +int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); + +/* + * Post an interrupt to the vcpu running on 'hostcpu'. This will use a + * hardware assist if available (e.g. Posted Interrupt) or fall back to + * sending an 'ipinum' to interrupt the 'hostcpu'. + */ +void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); + +void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); +void vlapic_fire_cmci(struct vlapic *vlapic); +int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); + +uint64_t vlapic_get_apicbase(struct vlapic *vlapic); +int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val); +void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s); +bool vlapic_enabled(struct vlapic *vlapic); + +void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, + int delmode, int vec); + +/* Reset the trigger-mode bits for all vectors to be edge-triggered */ +void vlapic_reset_tmr(struct vlapic *vlapic); + +/* + * Set the trigger-mode bit associated with 'vector' to level-triggered if + * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to + * this 'vlapic'. 
+ */ +void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, + int delmode, int vector); + +void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); +uint64_t vlapic_get_cr8(struct vlapic *vlapic); + +/* APIC write handlers */ +void vlapic_id_write_handler(struct vlapic *vlapic); +void vlapic_ldr_write_handler(struct vlapic *vlapic); +void vlapic_dfr_write_handler(struct vlapic *vlapic); +void vlapic_svr_write_handler(struct vlapic *vlapic); +void vlapic_esr_write_handler(struct vlapic *vlapic); +int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu); +void vlapic_icrtmr_write_handler(struct vlapic *vlapic); +void vlapic_dcr_write_handler(struct vlapic *vlapic); +void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); +void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); +#endif /* _VLAPIC_H_ */ diff --git a/vmm/io/vlapic_priv.h b/vmm/io/vlapic_priv.h new file mode 100644 index 0000000..08592c8 --- /dev/null +++ b/vmm/io/vlapic_priv.h @@ -0,0 +1,190 @@ +/*- + * Copyright (c) 2013 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VLAPIC_PRIV_H_ +#define _VLAPIC_PRIV_H_ + +#include <x86/apicreg.h> + +/* + * APIC Register: Offset Description + */ +#define APIC_OFFSET_ID 0x20 /* Local APIC ID */ +#define APIC_OFFSET_VER 0x30 /* Local APIC Version */ +#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */ +#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */ +#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */ +#define APIC_OFFSET_EOI 0xB0 /* EOI Register */ +#define APIC_OFFSET_RRR 0xC0 /* Remote read */ +#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */ +#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */ +#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */ +#define APIC_OFFSET_ISR0 0x100 /* In Service Register */ +#define APIC_OFFSET_ISR1 0x110 +#define APIC_OFFSET_ISR2 0x120 +#define APIC_OFFSET_ISR3 0x130 +#define APIC_OFFSET_ISR4 0x140 +#define APIC_OFFSET_ISR5 0x150 +#define APIC_OFFSET_ISR6 0x160 +#define APIC_OFFSET_ISR7 0x170 +#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */ +#define APIC_OFFSET_TMR1 0x190 +#define APIC_OFFSET_TMR2 0x1A0 +#define APIC_OFFSET_TMR3 0x1B0 +#define APIC_OFFSET_TMR4 0x1C0 +#define APIC_OFFSET_TMR5 0x1D0 +#define APIC_OFFSET_TMR6 0x1E0 +#define APIC_OFFSET_TMR7 0x1F0 +#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */ +#define APIC_OFFSET_IRR1 0x210 +#define APIC_OFFSET_IRR2 0x220 +#define APIC_OFFSET_IRR3 0x230 +#define APIC_OFFSET_IRR4 0x240 +#define 
APIC_OFFSET_IRR5 0x250 +#define APIC_OFFSET_IRR6 0x260 +#define APIC_OFFSET_IRR7 0x270 +#define APIC_OFFSET_ESR 0x280 /* Error Status Register */ +#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */ +#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */ +#define APIC_OFFSET_ICR_HI 0x310 +#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */ +#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */ +#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */ +#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */ +#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */ +#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */ +#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */ +#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */ +#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */ +#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */ + +#define VLAPIC_CTR0(vlapic, format) \ + VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format) + +#define VLAPIC_CTR1(vlapic, format, p1) \ + VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1) + +#define VLAPIC_CTR2(vlapic, format, p1, p2) \ + VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2) + +#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \ + VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3) + +#define VLAPIC_CTR_IRR(vlapic, msg) \ +do { \ + uint32_t *irrptr = &(vlapic)->apic_page->irr0; \ + irrptr[0] = irrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 
<< 2]); \ + VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \ +} while (0) + +#define VLAPIC_CTR_ISR(vlapic, msg) \ +do { \ + uint32_t *isrptr = &(vlapic)->apic_page->isr0; \ + isrptr[0] = isrptr[0]; /* silence compiler */ \ + VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \ + VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ +} while (0) + +enum boot_state { + BS_INIT, + BS_SIPI, + BS_RUNNING +}; + +/* + * 16 priority levels with at most one vector injected per level. + */ +#define ISRVEC_STK_SIZE (16 + 1) + +#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI + +struct vlapic; + +struct vlapic_ops { + int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); + int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + void (*intr_accepted)(struct vlapic *vlapic, int vector); + void (*post_intr)(struct vlapic *vlapic, int hostcpu); + void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*enable_x2apic_mode)(struct vlapic *vlapic); +}; + +struct vlapic { + struct vm *vm; + int vcpuid; + struct LAPIC *apic_page; + struct vlapic_ops ops; + + uint32_t esr_pending; + int esr_firing; + + struct callout callout; /* vlapic timer */ + struct bintime timer_fire_bt; /* callout expiry time */ + struct bintime timer_freq_bt; /* timer frequency */ + struct bintime timer_period_bt; /* timer period */ + struct mtx timer_mtx; + + /* + * The 'isrvec_stk' is a stack of vectors injected by the local apic. + * A vector is popped from the stack when the processor does an EOI. 
+ * The vector on the top of the stack is used to compute the + * Processor Priority in conjunction with the TPR. + */ + uint8_t isrvec_stk[ISRVEC_STK_SIZE]; + int isrvec_stk_top; + + uint64_t msr_apicbase; + enum boot_state boot_state; + + /* + * Copies of some registers in the virtual APIC page. We do this for + * a couple of different reasons: + * - to be able to detect what changed (e.g. svr_last) + * - to maintain a coherent snapshot of the register (e.g. lvt_last) + */ + uint32_t svr_last; + uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; +}; + +void vlapic_init(struct vlapic *vlapic); +void vlapic_cleanup(struct vlapic *vlapic); + +#endif /* _VLAPIC_PRIV_H_ */ diff --git a/vmm/io/vpmtmr.c b/vmm/io/vpmtmr.c new file mode 100644 index 0000000..1e7bb93 --- /dev/null +++ b/vmm/io/vpmtmr.c @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vpmtmr.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +struct vpmtmr { + sbintime_t freq_sbt; + sbintime_t baseuptime; + uint32_t baseval; +}; + +static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); + +struct vpmtmr * +vpmtmr_init(struct vm *vm) +{ + struct vpmtmr *vpmtmr; + struct bintime bt; + + vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); + vpmtmr->baseuptime = sbinuptime(); + vpmtmr->baseval = 0; + + FREQ2BT(PMTMR_FREQ, &bt); + vpmtmr->freq_sbt = bttosbt(bt); + + return (vpmtmr); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + + free(vpmtmr, M_VPMTMR); +} + +int +vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vpmtmr *vpmtmr; + sbintime_t now, delta; + + if (!in || bytes != 4) + return (-1); + + vpmtmr = vm_pmtmr(vm); + + /* + * No locking needed because 'baseuptime' and 'baseval' are + * written only during initialization. 
+ */ + now = sbinuptime(); + delta = now - vpmtmr->baseuptime; + KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " + "%#lx to %#lx", vpmtmr->baseuptime, now)); + *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; + + return (0); +} diff --git a/vmm/io/vpmtmr.h b/vmm/io/vpmtmr.h new file mode 100644 index 0000000..039a281 --- /dev/null +++ b/vmm/io/vpmtmr.h @@ -0,0 +1,42 @@ +/*- + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VPMTMR_H_ +#define _VPMTMR_H_ + +#define IO_PMTMR 0x408 + +struct vpmtmr; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#endif diff --git a/vmm/io/vrtc.c b/vmm/io/vrtc.c new file mode 100644 index 0000000..18ebc4b --- /dev/null +++ b/vmm/io/vrtc.c @@ -0,0 +1,1019 @@ +/*- + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/clock.h> +#include <sys/sysctl.h> + +#include <machine/vmm.h> + +#include <isa/rtc.h> + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +struct vrtc { + struct vm *vm; + struct mtx mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) +#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); + +static int rtc_flag_broken_time = 1; 
+SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, + &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. + */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) +{ + sbintime_t now, delta; + time_t t, secs; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#lx to %#lx", vrtc->base_uptime, now)); + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. 
Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. + */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + 
struct rtcdev *rtc; + struct vm *vm; + int century, error, hour, pm, year; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + vm = vrtc->vm; + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. 
+ */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); + goto fail; + } + + error = rtcget(rtc, rtc->century, &century); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. 
+ */ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) +{ + struct rtcdev *rtc; + sbintime_t oldbase; + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", + oldtime, newtime); + + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + oldbase, newbase); + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied. 
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. 
+ */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt, basetime; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active; + + active = callout_active(&vrtc->callout) ? 
1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#lx", + active ? "active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq, basetime; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc 
basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. 
+ */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, sbinuptime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, &basetime); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. 
+ */ + if (offset < 0 || offset >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + sbintime_t basetime; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. + */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. 
+ */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + 
rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. + */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, sbinuptime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + + callout_drain(&vrtc->callout); + free(vrtc, M_VRTC); +} diff --git a/vmm/io/vrtc.h b/vmm/io/vrtc.h new file mode 100644 index 0000000..6fbbc9c --- /dev/null +++ b/vmm/io/vrtc.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include + +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#endif diff --git a/vmm/vmm.c b/vmm/vmm.c new file mode 100644 index 0000000..51c63f5 --- /dev/null +++ b/vmm/vmm.c @@ -0,0 +1,2427 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "vmm_ioport.h" +#include "vmm_ktr.h" +#include "vmm_host.h" +#include "vmm_mem.h" +#include "vmm_util.h" +#include "vatpic.h" +#include "vatpit.h" +#include "vhpet.h" +#include "vioapic.h" +#include "vlapic.h" +#include "vpmtmr.h" +#include "vrtc.h" +#include "vmm_stat.h" +#include "vmm_lapic.h" + +#include "io/ppt.h" +#include "io/iommu.h" + +struct vlapic; + +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ +struct vcpu { + struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + enum vcpu_state state; /* (o) vcpu state */ + int hostcpu; /* (o) vcpu's host cpu */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* 
(i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + struct savefpu *guestfpu; /* (a,i) guest fpu state */ + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ + uint64_t nextrip; /* (x) next instruction to execute */ +}; + +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) +#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) +#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) +#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) +#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) + +struct mem_seg { + vm_paddr_t gpa; + size_t len; + boolean_t wired; + vm_object_t object; +}; +#define VM_MAX_MEMORY_SEGMENTS 2 + +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + void *iommu; /* (x) iommu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */ + cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */ + void *rendezvous_arg; /* (x) rendezvous 
func/arg */ + vm_rendezvous_func_t rendezvous_func; + struct mtx rendezvous_mtx; /* (o) rendezvous lock */ + int num_mem_segs; /* (o) guest memory segments */ + struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS]; + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ +}; + +static int vmm_initialized; + +static struct vmm_ops *ops; +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) +#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) +#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) + +#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) +#define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO) +#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) +#define VMSPACE_ALLOC(min, max) \ + (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) +#define VMSPACE_FREE(vmspace) \ + (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) +#define VMGETREG(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETREG(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO) +#define VMGETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMSETDESC(vmi, vcpu, num, desc) \ + (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO) +#define VMGETCAP(vmi, vcpu, num, retval) \ + (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO) +#define VMSETCAP(vmi, vcpu, num, val) \ + (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO) +#define VLAPIC_INIT(vmi, vcpu) \ + (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) +#define VLAPIC_CLEANUP(vmi, vlapic) \ + (ops != NULL ? 
(*ops->vlapic_cleanup)(vmi, vlapic) : NULL) + +#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) +#define fpu_stop_emulating() clts() + +static MALLOC_DEFINE(M_VM, "vm", "vm"); + +/* statistics */ +static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); + +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + +static int vmm_ipinum; +SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, + "IPI vector used for vcpu notifications"); + +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static int vmm_force_iommu = 0; +TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu); +SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0, + "Force use of I/O MMU even if no passthrough devices were found."); + +static void +vcpu_cleanup(struct vm *vm, int i, bool destroy) +{ + struct vcpu *vcpu = &vm->vcpu[i]; + + VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + } +} + +static void +vcpu_init(struct vm *vm, int vcpu_id, bool create) +{ + struct vcpu *vcpu; + + KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + + vcpu = &vm->vcpu[vcpu_id]; + + if (create) { + KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " + "initialized", vcpu_id)); + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + } + + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); + 
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->exitintinfo = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + fpu_save_area_reset(vcpu->guestfpu); + vmm_stat_init(vcpu->stats); +} + +int +vcpu_trace_exceptions(struct vm *vm, int vcpuid) +{ + + return (trace_guest_exceptions); +} + +struct vm_exit * +vm_exitinfo(struct vm *vm, int cpuid) +{ + struct vcpu *vcpu; + + if (cpuid < 0 || cpuid >= VM_MAXCPU) + panic("vm_exitinfo: invalid cpuid %d", cpuid); + + vcpu = &vm->vcpu[cpuid]; + + return (&vcpu->exitinfo); +} + +static void +vmm_resume(void) +{ + VMM_RESUME(); +} + +static int +vmm_init(void) +{ + int error; + + vmm_host_state_init(); + + vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn)); + if (vmm_ipinum < 0) + vmm_ipinum = IPI_AST; + + error = vmm_mem_init(); + if (error) + return (error); + + if (vmm_is_intel()) + ops = &vmm_ops_intel; + else if (vmm_is_amd()) + ops = &vmm_ops_amd; + else + return (ENXIO); + + vmm_resume_p = vmm_resume; + + return (VMM_INIT(vmm_ipinum)); +} + +static int +vmm_handler(module_t mod, int what, void *arg) +{ + int error; + + switch (what) { + case MOD_LOAD: + vmmdev_init(); + if (vmm_force_iommu || ppt_avail_devices() > 0) + iommu_init(); + error = vmm_init(); + if (error == 0) + vmm_initialized = 1; + break; + case MOD_UNLOAD: + error = vmmdev_cleanup(); + if (error == 0) { + vmm_resume_p = NULL; + iommu_cleanup(); + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); + error = VMM_CLEANUP(); + /* + * Something bad happened - prevent new + * VMs from being created + */ + if (error) + vmm_initialized = 0; + } + break; + default: + error = 0; + break; + } + return (error); +} + +static moduledata_t vmm_kmod = { + "vmm", + vmm_handler, + NULL +}; + +/* + * vmm initialization has the following dependencies: + * + * - iommu initialization must happen after the pci passthru driver has had + * a chance to attach to any passthru devices 
(after SI_SUB_CONFIGURE). + * + * - VT-x initialization requires smp_rendezvous() and therefore must happen + * after SMP is fully functional (after SI_SUB_SMP). + */ +DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); +MODULE_VERSION(vmm, 1); + +static void +vm_init(struct vm *vm, bool create) +{ + int i; + + vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); + vm->iommu = NULL; + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); + + CPU_ZERO(&vm->active_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_init(vm, i, create); +} + +int +vm_create(const char *name, struct vm **retvm) +{ + struct vm *vm; + struct vmspace *vmspace; + + /* + * If vmm.ko could not be successfully initialized then don't attempt + * to create the virtual machine. + */ + if (!vmm_initialized) + return (ENXIO); + + if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) + return (EINVAL); + + vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); + if (vmspace == NULL) + return (ENOMEM); + + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); + strcpy(vm->name, name); + vm->num_mem_segs = 0; + vm->vmspace = vmspace; + mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF); + + vm_init(vm, true); + + *retvm = vm; + return (0); +} + +static void +vm_free_mem_seg(struct vm *vm, struct mem_seg *seg) +{ + + if (seg->object != NULL) + vmm_mem_free(vm->vmspace, seg->gpa, seg->len); + + bzero(seg, sizeof(*seg)); +} + +static void +vm_cleanup(struct vm *vm, bool destroy) +{ + int i; + + ppt_unassign_all(vm); + + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); + vpmtmr_cleanup(vm->vpmtmr); + vatpit_cleanup(vm->vatpit); + vhpet_cleanup(vm->vhpet); + vatpic_cleanup(vm->vatpic); + 
vioapic_cleanup(vm->vioapic); + + for (i = 0; i < VM_MAXCPU; i++) + vcpu_cleanup(vm, i, destroy); + + VMCLEANUP(vm->cookie); + + if (destroy) { + for (i = 0; i < vm->num_mem_segs; i++) + vm_free_mem_seg(vm, &vm->mem_segs[i]); + + vm->num_mem_segs = 0; + + VMSPACE_FREE(vm->vmspace); + vm->vmspace = NULL; + } +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); + free(vm, M_VM); +} + +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. + */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + +const char * +vm_name(struct vm *vm) +{ + return (vm->name); +} + +int +vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) +{ + vm_object_t obj; + + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); +} + +int +vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + + vmm_mmio_free(vm->vmspace, gpa, len); + return (0); +} + +boolean_t +vm_mem_allocated(struct vm *vm, vm_paddr_t gpa) +{ + int i; + vm_paddr_t gpabase, gpalimit; + + for (i = 0; i < vm->num_mem_segs; i++) { + gpabase = vm->mem_segs[i].gpa; + gpalimit = gpabase + vm->mem_segs[i].len; + if (gpa >= gpabase && gpa < gpalimit) + return (TRUE); /* 'gpa' is regular memory */ + } + + if (ppt_is_mmio(vm, gpa)) + return (TRUE); /* 'gpa' is pci passthru mmio */ + + return (FALSE); +} + +int +vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) +{ + int available, allocated; + struct mem_seg *seg; + vm_object_t object; + vm_paddr_t g; + + if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + return (EINVAL); + + available = allocated = 0; + g = gpa; + while (g < gpa + len) { + if (vm_mem_allocated(vm, g)) + allocated++; + else + available++; + + g += PAGE_SIZE; + } + + /* + * If there are some allocated and some available pages in the address 
+ * range then it is an error. + */ + if (allocated && available) + return (EINVAL); + + /* + * If the entire address range being requested has already been + * allocated then there isn't anything more to do. + */ + if (allocated && available == 0) + return (0); + + if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) + return (E2BIG); + + seg = &vm->mem_segs[vm->num_mem_segs]; + + if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL) + return (ENOMEM); + + seg->gpa = gpa; + seg->len = len; + seg->object = object; + seg->wired = FALSE; + + vm->num_mem_segs++; + + return (0); +} + +static vm_paddr_t +vm_maxmem(struct vm *vm) +{ + int i; + vm_paddr_t gpa, maxmem; + + maxmem = 0; + for (i = 0; i < vm->num_mem_segs; i++) { + gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len; + if (gpa > maxmem) + maxmem = gpa; + } + return (maxmem); +} + +static void +vm_gpa_unwire(struct vm *vm) +{ + int i, rv; + struct mem_seg *seg; + + for (i = 0; i < vm->num_mem_segs; i++) { + seg = &vm->mem_segs[i]; + if (!seg->wired) + continue; + + rv = vm_map_unwire(&vm->vmspace->vm_map, + seg->gpa, seg->gpa + seg->len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment " + "%#lx/%ld could not be unwired: %d", + vm_name(vm), seg->gpa, seg->len, rv)); + + seg->wired = FALSE; + } +} + +static int +vm_gpa_wire(struct vm *vm) +{ + int i, rv; + struct mem_seg *seg; + + for (i = 0; i < vm->num_mem_segs; i++) { + seg = &vm->mem_segs[i]; + if (seg->wired) + continue; + + /* XXX rlimits? */ + rv = vm_map_wire(&vm->vmspace->vm_map, + seg->gpa, seg->gpa + seg->len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (rv != KERN_SUCCESS) + break; + + seg->wired = TRUE; + } + + if (i < vm->num_mem_segs) { + /* + * Undo the wiring before returning an error. 
+ */ + vm_gpa_unwire(vm); + return (EAGAIN); + } + + return (0); +} + +static void +vm_iommu_modify(struct vm *vm, boolean_t map) +{ + int i, sz; + vm_paddr_t gpa, hpa; + struct mem_seg *seg; + void *vp, *cookie, *host_domain; + + sz = PAGE_SIZE; + host_domain = iommu_host_domain(); + + for (i = 0; i < vm->num_mem_segs; i++) { + seg = &vm->mem_segs[i]; + KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired", + vm_name(vm), seg->gpa, seg->len)); + + gpa = seg->gpa; + while (gpa < seg->gpa + seg->len) { + vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE, + &cookie); + KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", + vm_name(vm), gpa)); + + vm_gpa_release(cookie); + + hpa = DMAP_TO_PHYS((uintptr_t)vp); + if (map) { + iommu_create_mapping(vm->iommu, gpa, hpa, sz); + iommu_remove_mapping(host_domain, hpa, sz); + } else { + iommu_remove_mapping(vm->iommu, gpa, sz); + iommu_create_mapping(host_domain, hpa, hpa, sz); + } + + gpa += PAGE_SIZE; + } + } + + /* + * Invalidate the cached translations associated with the domain + * from which pages were removed. + */ + if (map) + iommu_invalidate_tlb(host_domain); + else + iommu_invalidate_tlb(vm->iommu); +} + +#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) +#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) + +int +vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + + error = ppt_unassign_device(vm, bus, slot, func); + if (error) + return (error); + + if (ppt_assigned_devices(vm) == 0) { + vm_iommu_unmap(vm); + vm_gpa_unwire(vm); + } + return (0); +} + +int +vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + vm_paddr_t maxaddr; + + /* + * Virtual machines with pci passthru devices get special treatment: + * - the guest physical memory is wired + * - the iommu is programmed to do the 'gpa' to 'hpa' translation + * + * We need to do this before the first pci passthru device is attached. 
+ */ + if (ppt_assigned_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vm_maxmem(vm); + vm->iommu = iommu_create_domain(maxaddr); + + error = vm_gpa_wire(vm); + if (error) + return (error); + + vm_iommu_map(vm); + } + + error = ppt_assign_device(vm, bus, slot, func); + return (error); +} + +void * +vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + int count, pageoff; + vm_page_t m; + + pageoff = gpa & PAGE_MASK; + if (len > PAGE_SIZE - pageoff) + panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); + + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + + if (count == 1) { + *cookie = m; + return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff)); + } else { + *cookie = NULL; + return (NULL); + } +} + +void +vm_gpa_release(void *cookie) +{ + vm_page_t m = cookie; + + vm_page_lock(m); + vm_page_unhold(m); + vm_page_unlock(m); +} + +int +vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, + struct vm_memory_segment *seg) +{ + int i; + + for (i = 0; i < vm->num_mem_segs; i++) { + if (gpabase == vm->mem_segs[i].gpa) { + seg->gpa = vm->mem_segs[i].gpa; + seg->len = vm->mem_segs[i].len; + seg->wired = vm->mem_segs[i].wired; + return (0); + } + } + return (-1); +} + +int +vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, + vm_offset_t *offset, struct vm_object **object) +{ + int i; + size_t seg_len; + vm_paddr_t seg_gpa; + vm_object_t seg_obj; + + for (i = 0; i < vm->num_mem_segs; i++) { + if ((seg_obj = vm->mem_segs[i].object) == NULL) + continue; + + seg_gpa = vm->mem_segs[i].gpa; + seg_len = vm->mem_segs[i].len; + + if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) { + *offset = gpa - seg_gpa; + *object = seg_obj; + vm_object_reference(seg_obj); + return (0); + } + } + + return (EINVAL); +} + +int +vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) +{ + + if (vcpu < 0 || vcpu >= 
VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + return (VMGETREG(vm->cookie, vcpu, reg, retval)); +} + +int +vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (reg >= VM_REG_LAST) + return (EINVAL); + + error = VMSETREG(vm->cookie, vcpuid, reg, val); + if (error || reg != VM_REG_GUEST_RIP) + return (error); + + /* Set 'nextrip' to match the value of %rip */ + VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val); + vcpu = &vm->vcpu[vcpuid]; + vcpu->nextrip = val; + return (0); +} + +static boolean_t +is_descriptor_table(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_IDTR: + case VM_REG_GUEST_GDTR: + return (TRUE); + default: + return (FALSE); + } +} + +static boolean_t +is_segment_register(int reg) +{ + + switch (reg) { + case VM_REG_GUEST_ES: + case VM_REG_GUEST_CS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_TR: + case VM_REG_GUEST_LDTR: + return (TRUE); + default: + return (FALSE); + } +} + +int +vm_get_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMGETDESC(vm->cookie, vcpu, reg, desc)); +} + +int +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, + struct seg_desc *desc) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (!is_segment_register(reg) && !is_descriptor_table(reg)) + return (EINVAL); + + return (VMSETDESC(vm->cookie, vcpu, reg, desc)); +} + +static void +restore_guest_fpustate(struct vcpu *vcpu) +{ + + /* flush host state to the pcb */ + fpuexit(curthread); + + /* restore guest FPU state */ + fpu_stop_emulating(); + fpurestore(vcpu->guestfpu); + + /* restore guest XCR0 if XSAVE is enabled in the host */ + if (rcr4() & CR4_XSAVE) + 
load_xcr(0, vcpu->guest_xcr0); + + /* + * The FPU is now "dirty" with the guest's state so turn on emulation + * to trap any access to the FPU by the host. + */ + fpu_start_emulating(); +} + +static void +save_guest_fpustate(struct vcpu *vcpu) +{ + + if ((rcr0() & CR0_TS) == 0) + panic("fpu emulation not enabled in host!"); + + /* save guest XCR0 and restore host XCR0 */ + if (rcr4() & CR4_XSAVE) { + vcpu->guest_xcr0 = rxcr(0); + load_xcr(0, vmm_get_host_xcr0()); + } + + /* save guest FPU state */ + fpu_stop_emulating(); + fpusave(vcpu->guestfpu); + fpu_start_emulating(); +} + +static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); + +static int +vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, + bool from_idle) +{ + int error; + + vcpu_assert_locked(vcpu); + + /* + * State transitions from the vmmdev_ioctl() must always begin from + * the VCPU_IDLE state. This guarantees that there is only a single + * ioctl() operating on a vcpu at any point. + */ + if (from_idle) { + while (vcpu->state != VCPU_IDLE) + msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); + } else { + KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " + "vcpu idle state")); + } + + if (vcpu->state == VCPU_RUNNING) { + KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d " + "mismatch for running vcpu", curcpu, vcpu->hostcpu)); + } else { + KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a " + "vcpu that is not running", vcpu->hostcpu)); + } + + /* + * The following state transitions are allowed: + * IDLE -> FROZEN -> IDLE + * FROZEN -> RUNNING -> FROZEN + * FROZEN -> SLEEPING -> FROZEN + */ + switch (vcpu->state) { + case VCPU_IDLE: + case VCPU_RUNNING: + case VCPU_SLEEPING: + error = (newstate != VCPU_FROZEN); + break; + case VCPU_FROZEN: + error = (newstate == VCPU_FROZEN); + break; + default: + error = 1; + break; + } + + if (error) + return (EBUSY); + + vcpu->state = newstate; + if (newstate == VCPU_RUNNING) + vcpu->hostcpu = curcpu; + 
else + vcpu->hostcpu = NOCPU; + + if (newstate == VCPU_IDLE) + wakeup(&vcpu->state); + + return (0); +} + +static void +vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0) + panic("Error %d setting state to %d\n", error, newstate); +} + +static void +vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +{ + int error; + + if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + panic("Error %d setting state to %d", error, newstate); +} + +static void +vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func) +{ + + KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked")); + + /* + * Update 'rendezvous_func' and execute a write memory barrier to + * ensure that it is visible across all host cpus. This is not needed + * for correctness but it does ensure that all the vcpus will notice + * that the rendezvous is requested immediately. + */ + vm->rendezvous_func = func; + wmb(); +} + +#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \ + do { \ + if (vcpuid >= 0) \ + VCPU_CTR0(vm, vcpuid, fmt); \ + else \ + VM_CTR0(vm, fmt); \ + } while (0) + +static void +vm_handle_rendezvous(struct vm *vm, int vcpuid) +{ + + KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), + ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid)); + + mtx_lock(&vm->rendezvous_mtx); + while (vm->rendezvous_func != NULL) { + /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */ + CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus); + + if (vcpuid != -1 && + CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) && + !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) { + VCPU_CTR0(vm, vcpuid, "Calling rendezvous func"); + (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg); + CPU_SET(vcpuid, &vm->rendezvous_done_cpus); + } + if (CPU_CMP(&vm->rendezvous_req_cpus, + &vm->rendezvous_done_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "Rendezvous completed"); + 
vm_set_rendezvous_func(vm, NULL); + wakeup(&vm->rendezvous_func); + break; + } + RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion"); + mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0, + "vmrndv", 0); + } + mtx_unlock(&vm->rendezvous_mtx); +} + +/* + * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run. + */ +static int +vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) +{ + struct vcpu *vcpu; + const char *wmesg; + int t, vcpu_halted, vm_halted; + + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_halted = 0; + vm_halted = 0; + + vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. Also check for + * software events that would cause this vcpu to wakeup. + * + * These interrupts/events could have happened after the + * vcpu returned from VMRUN() and before it acquired the + * vcpu lock above. + */ + if (vm->rendezvous_func != NULL || vm->suspend) + break; + if (vm_nmi_pending(vm, vcpuid)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vm, vcpuid) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } + + /* Don't go to sleep if the vcpu thread needs to yield */ + if (vcpu_should_yield(vm, vcpuid)) + break; + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. 
+ */ + if (intr_disabled) { + wmesg = "vmhalt"; + VCPU_CTR0(vm, vcpuid, "Halted"); + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } + } else { + wmesg = "vmidle"; + } + + t = ticks; + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); + } + + if (vcpu_halted) + CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); + + vcpu_unlock(vcpu); + + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) +{ + int rv, ftype; + struct vm_map *map; + struct vcpu *vcpu; + struct vm_exit *vme; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == VM_PROT_READ || + ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + vme->u.paging.gpa, ftype); + if (rv == 0) { + VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", + ftype == VM_PROT_READ ? 
"accessed" : "dirty", + vme->u.paging.gpa); + goto done; + } + } + + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); + + VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " + "ftype = %d", rv, vme->u.paging.gpa, ftype); + + if (rv != KERN_SUCCESS) + return (EFAULT); +done: + return (0); +} + +static int +vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +{ + struct vie *vie; + struct vcpu *vcpu; + struct vm_exit *vme; + uint64_t gla, gpa, cs_base; + struct vm_guest_paging *paging; + mem_region_read_t mread; + mem_region_write_t mwrite; + enum vm_cpu_mode cpu_mode; + int cs_d, error, fault, length; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + gla = vme->u.inst_emul.gla; + gpa = vme->u.inst_emul.gpa; + cs_base = vme->u.inst_emul.cs_base; + cs_d = vme->u.inst_emul.cs_d; + vie = &vme->u.inst_emul.vie; + paging = &vme->u.inst_emul.paging; + cpu_mode = paging->cpu_mode; + + VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); + + /* Fetch, decode and emulate the faulting instruction */ + if (vie->num_valid == 0) { + /* + * If the instruction length is not known then assume a + * maximum size instruction. + */ + length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; + error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + + cs_base, length, vie, &fault); + } else { + /* + * The instruction bytes have already been copied into 'vie' + */ + error = fault = 0; + } + if (error || fault) + return (error); + + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } + + /* + * If the instruction length was not specified then update it now + * along with 'nextrip'. 
+ */ + if (vme->inst_length == 0) { + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + } + + /* return to userland unless this is an in-kernel emulated device */ + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + mread = lapic_mmio_read; + mwrite = lapic_mmio_write; + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + mread = vioapic_mmio_read; + mwrite = vioapic_mmio_write; + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + mread = vhpet_mmio_read; + mwrite = vhpet_mmio_write; + } else { + *retu = true; + return (0); + } + + error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, + mread, mwrite, retu); + + return (error); +} + +static int +vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) +{ + int i, done; + struct vcpu *vcpu; + + done = 0; + vcpu = &vm->vcpu[vcpuid]; + + CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + * + * Since a VM may be suspended at any time including when one or + * more vcpus are doing a rendezvous we need to call the rendezvous + * handler while we are waiting to prevent a deadlock. + */ + vcpu_lock(vcpu); + while (1) { + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + break; + } + + if (vm->rendezvous_func == NULL) { + VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); + vcpu_require_state_locked(vcpu, VCPU_SLEEPING); + msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); + vcpu_require_state_locked(vcpu, VCPU_FROZEN); + } else { + VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend"); + vcpu_unlock(vcpu); + vm_handle_rendezvous(vm, vcpuid); + vcpu_lock(vcpu); + } + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. 
+ */ + for (i = 0; i < VM_MAXCPU; i++) { + if (CPU_ISSET(i, &vm->suspended_cpus)) { + vcpu_notify_event(vm, i, false); + } + } + + *retu = true; + return (0); +} + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. + */ + for (i = 0; i < VM_MAXCPU; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + + return (0); +} + +void +vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress")); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RENDEZVOUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1); +} + +void +vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); +} + +int +vm_run(struct vm *vm, struct vm_run *vmrun) +{ + int error, vcpuid; + struct vcpu *vcpu; + struct pcb *pcb; + uint64_t tscval; + struct vm_exit *vme; + bool retu, intr_disabled; + pmap_t pmap; + void *rptr, 
*sptr; + + vcpuid = vmrun->cpuid; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) + return (EINVAL); + + rptr = &vm->rendezvous_func; + sptr = &vm->suspend; + pmap = vmspace_pmap(vm->vmspace); + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; +restart: + critical_enter(); + + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("vm_run: absurd pm_active")); + + tscval = rdtsc(); + + pcb = PCPU_GET(curpcb); + set_pcb_flags(pcb, PCB_FULL_IRET); + + restore_guest_fpustate(vcpu); + + vcpu_require_state(vm, vcpuid, VCPU_RUNNING); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, rptr, sptr); + vcpu_require_state(vm, vcpuid, VCPU_FROZEN); + + save_guest_fpustate(vcpu); + + vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); + + critical_exit(); + + if (error == 0) { + retu = false; + vcpu->nextrip = vme->rip + vme->inst_length; + switch (vme->exitcode) { + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vm, vcpuid, &retu); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RENDEZVOUS: + vm_handle_rendezvous(vm, vcpuid); + error = 0; + break; + case VM_EXITCODE_HLT: + intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); + error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); + break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INST_EMUL: + error = vm_handle_inst_emul(vm, vcpuid, &retu); + break; + case VM_EXITCODE_INOUT: + case VM_EXITCODE_INOUT_STR: + error = vm_handle_inout(vm, vcpuid, vme, &retu); + break; + case VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + vm_inject_ud(vm, vcpuid); + break; + default: + retu = true; /* handled in userland */ + break; + } + } + + if (error == 0 && retu == false) + goto restart; + + /* copy the exit information */ + bcopy(vme, &vmrun->vm_exit, 
sizeof(struct vm_exit)); + return (error); +} + +int +vm_restart_instruction(void *arg, int vcpuid) +{ + struct vm *vm; + struct vcpu *vcpu; + enum vcpu_state state; + uint64_t rip; + int error; + + vm = arg; + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + state = vcpu_get_state(vm, vcpuid, NULL); + if (state == VCPU_RUNNING) { + /* + * When a vcpu is "running" the next instruction is determined + * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. + * Thus setting 'inst_length' to zero will cause the current + * instruction to be restarted. + */ + vcpu->exitinfo.inst_length = 0; + VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by " + "setting inst_length to zero", vcpu->exitinfo.rip); + } else if (state == VCPU_FROZEN) { + /* + * When a vcpu is "frozen" it is outside the critical section + * around VMRUN() and 'nextrip' points to the next instruction. + * Thus instruction restart is achieved by setting 'nextrip' + * to the vcpu's %rip. 
+ */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); + KASSERT(!error, ("%s: error %d getting rip", __func__, error)); + VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " + "nextrip from %#lx to %#lx", vcpu->nextrip, rip); + vcpu->nextrip = rip; + } else { + panic("%s: invalid state %d", __func__, state); + } + return (0); +} + +int +vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) +{ + struct vcpu *vcpu; + int type, vector; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + if (type == VM_INTINFO_NMI && vector != IDT_NMI) + return (EINVAL); + if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) + return (EINVAL); + if (info & VM_INTINFO_RSVD) + return (EINVAL); + } else { + info = 0; + } + VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info); + vcpu->exitintinfo = info; + return (0); +} + +enum exc_class { + EXC_BENIGN, + EXC_CONTRIBUTORY, + EXC_PAGEFAULT +}; + +#define IDT_VE 20 /* Virtualization Exception (Intel specific) */ + +static enum exc_class +exception_class(uint64_t info) +{ + int type, vector; + + KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info)); + type = info & VM_INTINFO_TYPE; + vector = info & 0xff; + + /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ + switch (type) { + case VM_INTINFO_HWINTR: + case VM_INTINFO_SWINTR: + case VM_INTINFO_NMI: + return (EXC_BENIGN); + default: + /* + * Hardware exception. + * + * SVM and VT-x use identical type values to represent NMI, + * hardware interrupt and software interrupt. + * + * SVM uses type '3' for all exceptions. VT-x uses type '3' + * for exceptions except #BP and #OF. #BP and #OF use a type + * value of '5' or '6'. Therefore we don't check for explicit + * values of 'type' to classify 'intinfo' into a hardware + * exception. 
+ */ + break; + } + + switch (vector) { + case IDT_PF: + case IDT_VE: + return (EXC_PAGEFAULT); + case IDT_DE: + case IDT_TS: + case IDT_NP: + case IDT_SS: + case IDT_GP: + return (EXC_CONTRIBUTORY); + default: + return (EXC_BENIGN); + } +} + +static int +nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, + uint64_t *retinfo) +{ + enum exc_class exc1, exc2; + int type1, vector1; + + KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1)); + KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2)); + + /* + * If an exception occurs while attempting to call the double-fault + * handler the processor enters shutdown mode (aka triple fault). + */ + type1 = info1 & VM_INTINFO_TYPE; + vector1 = info1 & 0xff; + if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { + VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", + info1, info2); + vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); + *retinfo = 0; + return (0); + } + + /* + * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 + */ + exc1 = exception_class(info1); + exc2 = exception_class(info2); + if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || + (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { + /* Convert nested fault into a double fault. 
*/ + *retinfo = IDT_DF; + *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + *retinfo |= VM_INTINFO_DEL_ERRCODE; + } else { + /* Handle exceptions serially */ + *retinfo = info2; + } + return (1); +} + +static uint64_t +vcpu_exception_intinfo(struct vcpu *vcpu) +{ + uint64_t info = 0; + + if (vcpu->exception_pending) { + info = vcpu->exc_vector & 0xff; + info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; + if (vcpu->exc_errcode_valid) { + info |= VM_INTINFO_DEL_ERRCODE; + info |= (uint64_t)vcpu->exc_errcode << 32; + } + } + return (info); +} + +int +vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) +{ + struct vcpu *vcpu; + uint64_t info1, info2; + int valid; + + KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + + vcpu = &vm->vcpu[vcpuid]; + + info1 = vcpu->exitintinfo; + vcpu->exitintinfo = 0; + + info2 = 0; + if (vcpu->exception_pending) { + info2 = vcpu_exception_intinfo(vcpu); + vcpu->exception_pending = 0; + VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", + vcpu->exc_vector, info2); + } + + if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { + valid = nested_fault(vm, vcpuid, info1, info2, retinfo); + } else if (info1 & VM_INTINFO_VALID) { + *retinfo = info1; + valid = 1; + } else if (info2 & VM_INTINFO_VALID) { + *retinfo = info2; + valid = 1; + } else { + valid = 0; + } + + if (valid) { + VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), " + "retinfo(%#lx)", __func__, info1, info2, *retinfo); + } + + return (valid); +} + +int +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vcpu *vcpu; + int error; + + if (vcpuid < 0 || vcpuid 
>= VM_MAXCPU) + return (EINVAL); + + if (vector < 0 || vector >= 32) + return (EINVAL); + + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->exception_pending) { + VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " + "pending exception %d", vector, vcpu->exc_vector); + return (EBUSY); + } + + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. + */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); + + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); + + vcpu->exception_pending = 1; + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); + return (0); +} + +void +vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, + int errcode) +{ + struct vm *vm; + int error, restart_instruction; + + vm = vmarg; + restart_instruction = 1; + + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + errcode, restart_instruction); + KASSERT(error == 0, ("vm_inject_exception error %d", error)); +} + +void +vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2) +{ + struct vm *vm; + int error; + + vm = vmarg; + VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx", + error_code, cr2); + + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); + KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); + + vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); +} + +static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); + +int 
+vm_inject_nmi(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->nmi_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + return (0); +} + +int +vm_nmi_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->nmi_pending); +} +/* Clear the vcpu's pending NMI and bump VCPU_NMI_COUNT; panics on a bad vcpuid or if no NMI is pending. */ +void +vm_nmi_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_nmi_clear: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->nmi_pending == 0) + panic("vm_nmi_clear: inconsistent nmi_pending state"); + + vcpu->nmi_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); +} + +static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); + +int +vm_inject_extint(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu->extint_pending = 1; + vcpu_notify_event(vm, vcpuid, false); + return (0); +} + +int +vm_extint_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_extint_pending: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + return (vcpu->extint_pending); +} +/* Clear the vcpu's pending ExtINT and bump VCPU_EXTINT_COUNT; panics on a bad vcpuid or if no ExtINT is pending. */ +void +vm_extint_clear(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_extint_clear: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + if (vcpu->extint_pending == 0) + panic("vm_extint_clear: inconsistent extint_pending state"); + + vcpu->extint_pending = 0; + vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); +} + +int +vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return 
(EINVAL); + + return (VMGETCAP(vm->cookie, vcpu, type, retval)); +} + +int +vm_set_capability(struct vm *vm, int vcpu, int type, int val) +{ + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + if (type < 0 || type >= VM_CAP_MAX) + return (EINVAL); + + return (VMSETCAP(vm->cookie, vcpu, type, val)); +} + +struct vlapic * +vm_lapic(struct vm *vm, int cpu) +{ + return (vm->vcpu[cpu].vlapic); +} + +struct vioapic * +vm_ioapic(struct vm *vm) +{ + + return (vm->vioapic); +} + +struct vhpet * +vm_hpet(struct vm *vm) +{ + + return (vm->vhpet); +} + +boolean_t +vmm_is_pptdev(int bus, int slot, int func) +{ + int found, i, n; + int b, s, f; + char *val, *cp, *cp2; + + /* + * XXX + * The length of an environment variable is limited to 128 bytes which + * puts an upper limit on the number of passthru devices that may be + * specified using a single environment variable. + * + * Work around this by scanning multiple environment variable + * names instead of a single one - yuck! + */ + const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL }; + + /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ + found = 0; + for (i = 0; names[i] != NULL && !found; i++) { + cp = val = kern_getenv(names[i]); + while (cp != NULL && *cp != '\0') { + if ((cp2 = strchr(cp, ' ')) != NULL) + *cp2 = '\0'; + + n = sscanf(cp, "%d/%d/%d", &b, &s, &f); + if (n == 3 && bus == b && slot == s && func == f) { + found = 1; + break; + } + + if (cp2 != NULL) + *cp2++ = ' '; + + cp = cp2; + } + freeenv(val); + } + return (found); +} + +void * +vm_iommu_domain(struct vm *vm) +{ + + return (vm->iommu); +} + +int +vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, + bool from_idle) +{ + int error; + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + error = vcpu_set_state_locked(vcpu, newstate, from_idle); + vcpu_unlock(vcpu); + + return (error); +} + +enum vcpu_state 
+vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) +{ + struct vcpu *vcpu; + enum vcpu_state state; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + state = vcpu->state; + if (hostcpu != NULL) + *hostcpu = vcpu->hostcpu; + vcpu_unlock(vcpu); + + return (state); +} + +int +vm_activate_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EBUSY); + + VCPU_CTR0(vm, vcpuid, "activated"); + CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); + return (0); +} + +cpuset_t +vm_active_cpus(struct vm *vm) +{ + + return (vm->active_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + +void * +vcpu_stats(struct vm *vm, int vcpuid) +{ + + return (vm->vcpu[vcpuid].stats); +} + +int +vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + *state = vm->vcpu[vcpuid].x2apic_state; + + return (0); +} + +int +vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) +{ + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + if (state >= X2APIC_STATE_LAST) + return (EINVAL); + + vm->vcpu[vcpuid].x2apic_state = state; + + vlapic_set_x2apic_state(vm, vcpuid, state); + + return (0); +} + +/* + * This function is called to ensure that a vcpu "sees" a pending event + * as soon as possible: + * - If the vcpu thread is sleeping then it is woken up. + * - If the vcpu is running on a different host_cpu then an IPI will be directed + * to the host_cpu to cause the vcpu to trap into the hypervisor. 
+ */ +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + int hostcpu; + struct vcpu *vcpu; + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + hostcpu = vcpu->hostcpu; + if (vcpu->state == VCPU_RUNNING) { + KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); + if (hostcpu != curcpu) { + if (lapic_intr) { + vlapic_post_intr(vcpu->vlapic, hostcpu, + vmm_ipinum); + } else { + ipi_cpu(hostcpu, vmm_ipinum); + } + } else { + /* + * If the 'vcpu' is running on 'curcpu' then it must + * be sending a notification to itself (e.g. SELF_IPI). + * The pending event will be picked up when the vcpu + * transitions back to guest context. + */ + } + } else { + KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " + "with hostcpu %d", vcpu->state, hostcpu)); + if (vcpu->state == VCPU_SLEEPING) + wakeup_one(vcpu); + } + vcpu_unlock(vcpu); +} + +struct vmspace * +vm_get_vmspace(struct vm *vm) +{ + + return (vm->vmspace); +} + +int +vm_apicid2vcpuid(struct vm *vm, int apicid) +{ + /* + * XXX apic id is assumed to be numerically identical to vcpu id + */ + return (apicid); +} + +void +vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest, + vm_rendezvous_func_t func, void *arg) +{ + int i; + + /* + * Enforce that this function is called without any locks + */ + WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous"); + KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU), + ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid)); + +restart: + mtx_lock(&vm->rendezvous_mtx); + if (vm->rendezvous_func != NULL) { + /* + * If a rendezvous is already in progress then we need to + * call the rendezvous handler in case this 'vcpuid' is one + * of the targets of the rendezvous. 
+ */ + RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress"); + mtx_unlock(&vm->rendezvous_mtx); + vm_handle_rendezvous(vm, vcpuid); + goto restart; + } + KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous " + "rendezvous is still in progress")); + + RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous"); + vm->rendezvous_req_cpus = dest; + CPU_ZERO(&vm->rendezvous_done_cpus); + vm->rendezvous_arg = arg; + vm_set_rendezvous_func(vm, func); + mtx_unlock(&vm->rendezvous_mtx); + + /* + * Wake up any sleeping vcpus and trigger a VM-exit in any running + * vcpus so they handle the rendezvous as soon as possible. + */ + for (i = 0; i < VM_MAXCPU; i++) { + if (CPU_ISSET(i, &dest)) + vcpu_notify_event(vm, i, false); + } + + vm_handle_rendezvous(vm, vcpuid); +} + +struct vatpic * +vm_atpic(struct vm *vm) +{ + return (vm->vatpic); +} + +struct vatpit * +vm_atpit(struct vm *vm) +{ + return (vm->vatpit); +} + +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + +enum vm_reg_name +vm_segment_name(int seg) +{ + static enum vm_reg_name seg_names[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS + }; + + KASSERT(seg >= 0 && seg < nitems(seg_names), + ("%s: invalid segment encoding %d", __func__, seg)); + return (seg_names[seg]); +} + +void +vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, + int num_copyinfo) +{ + int idx; + + for (idx = 0; idx < num_copyinfo; idx++) { + if (copyinfo[idx].cookie != NULL) + vm_gpa_release(copyinfo[idx].cookie); + } + bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); +} + +int +vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, + int num_copyinfo, int *fault) +{ + int error, idx, nused; + size_t n, off, remaining; + void *hva, *cookie; + uint64_t 
gpa; + + bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo); + + nused = 0; + remaining = len; + while (remaining > 0) { + KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); + off = gpa & PAGE_MASK; + n = min(remaining, PAGE_SIZE - off); + copyinfo[nused].gpa = gpa; + copyinfo[nused].len = n; + remaining -= n; + gla += n; + nused++; + } + + for (idx = 0; idx < nused; idx++) { + hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, + prot, &cookie); + if (hva == NULL) + break; + copyinfo[idx].hva = hva; + copyinfo[idx].cookie = cookie; + } + + if (idx != nused) { + vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); + return (EFAULT); + } else { + *fault = 0; + return (0); + } +} + +void +vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, + size_t len) +{ + char *dst; + int idx; + + dst = kaddr; + idx = 0; + while (len > 0) { + bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); + len -= copyinfo[idx].len; + dst += copyinfo[idx].len; + idx++; + } +} + +void +vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, + struct vm_copyinfo *copyinfo, size_t len) +{ + const char *src; + int idx; + + src = kaddr; + idx = 0; + while (len > 0) { + bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); + len -= copyinfo[idx].len; + src += copyinfo[idx].len; + idx++; + } +} + +/* + * Return the amount of in-use and wired memory for the VM. 
Since + * these are global stats, only return the values for vCPU 0 + */ +VMM_STAT_DECLARE(VMM_MEM_RESIDENT); +VMM_STAT_DECLARE(VMM_MEM_WIRED); + +static void +vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, + PAGE_SIZE * vmspace_resident_count(vm->vmspace)); + } +} + +static void +vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, + PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); + } +} + +VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); +VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); diff --git a/vmm/vmm_dev.c b/vmm/vmm_dev.c new file mode 100644 index 0000000..e3e140a --- /dev/null +++ b/vmm/vmm_dev.c @@ -0,0 +1,689 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_mem.h" +#include "io/ppt.h" +#include "io/vatpic.h" +#include "io/vioapic.h" +#include "io/vhpet.h" +#include "io/vrtc.h" + +struct vmmdev_softc { + struct vm *vm; /* vm instance cookie */ + struct cdev *cdev; + SLIST_ENTRY(vmmdev_softc) link; + int flags; +}; +#define VSC_LINKED 0x01 + +static SLIST_HEAD(, vmmdev_softc) head; + +static struct mtx vmmdev_mtx; + +static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); + +SYSCTL_DECL(_hw_vmm); + +static struct vmmdev_softc * +vmmdev_lookup(const char *name) +{ + struct vmmdev_softc *sc; + +#ifdef notyet /* XXX kernel is not compiled with invariants */ + mtx_assert(&vmmdev_mtx, MA_OWNED); +#endif + + SLIST_FOREACH(sc, &head, link) { + if (strcmp(name, vm_name(sc->vm)) == 0) + break; + } + + return (sc); +} + +static struct vmmdev_softc * +vmmdev_lookup2(struct cdev *cdev) +{ + + return (cdev->si_drv1); +} + +static int +vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) +{ + int error, off, c, prot; + vm_paddr_t gpa; + void *hpa, *cookie; + struct vmmdev_softc *sc; + + static char zerobuf[PAGE_SIZE]; + + error = 0; + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + error = ENXIO; + 
+ prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); + while (uio->uio_resid > 0 && error == 0) { + gpa = uio->uio_offset; + off = gpa & PAGE_MASK; + c = min(uio->uio_resid, PAGE_SIZE - off); + + /* + * The VM has a hole in its physical memory map. If we want to + * use 'dd' to inspect memory beyond the hole we need to + * provide bogus data for memory that lies in the hole. + * + * Since this device does not support lseek(2), dd(1) will + * read(2) blocks of data to simulate the lseek(2). + */ + hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); + if (hpa == NULL) { + if (uio->uio_rw == UIO_READ) + error = uiomove(zerobuf, c, uio); + else + error = EFAULT; + } else { + error = uiomove(hpa, c, uio); + vm_gpa_release(cookie); + } + } + return (error); +} + +static int +vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, + struct thread *td) +{ + int error, vcpu, state_changed, size; + cpuset_t *cpuset; + struct vmmdev_softc *sc; + struct vm_memory_segment *seg; + struct vm_register *vmreg; + struct vm_seg_desc *vmsegdesc; + struct vm_run *vmrun; + struct vm_exception *vmexc; + struct vm_lapic_irq *vmirq; + struct vm_lapic_msi *vmmsi; + struct vm_ioapic_irq *ioapic_irq; + struct vm_isa_irq *isa_irq; + struct vm_isa_irq_trigger *isa_irq_trigger; + struct vm_capability *vmcap; + struct vm_pptdev *pptdev; + struct vm_pptdev_mmio *pptmmio; + struct vm_pptdev_msi *pptmsi; + struct vm_pptdev_msix *pptmsix; + struct vm_nmi *vmnmi; + struct vm_stats *vmstats; + struct vm_stat_desc *statdesc; + struct vm_x2apic *x2apic; + struct vm_gpa_pte *gpapte; + struct vm_suspend *vmsuspend; + struct vm_gla2gpa *gg; + struct vm_activate_cpu *vac; + struct vm_cpuset *vm_cpuset; + struct vm_intinfo *vmii; + struct vm_rtc_time *rtctime; + struct vm_rtc_data *rtcdata; + + sc = vmmdev_lookup2(cdev); + if (sc == NULL) + return (ENXIO); + + error = 0; + vcpu = -1; + state_changed = 0; + + /* + * Some VMM ioctls can operate only on vcpus that are not running. 
+ */ + switch (cmd) { + case VM_RUN: + case VM_GET_REGISTER: + case VM_SET_REGISTER: + case VM_GET_SEGMENT_DESCRIPTOR: + case VM_SET_SEGMENT_DESCRIPTOR: + case VM_INJECT_EXCEPTION: + case VM_GET_CAPABILITY: + case VM_SET_CAPABILITY: + case VM_PPTDEV_MSI: + case VM_PPTDEV_MSIX: + case VM_SET_X2APIC_STATE: + case VM_GLA2GPA: + case VM_ACTIVATE_CPU: + case VM_SET_INTINFO: + case VM_GET_INTINFO: + case VM_RESTART_INSTRUCTION: + /* + * XXX fragile, handle with care + * Assumes that the first field of the ioctl data is the vcpu. + */ + vcpu = *(int *)data; + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto done; + } + + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + if (error) + goto done; + + state_changed = 1; + break; + + case VM_MAP_PPTDEV_MMIO: + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: + case VM_MAP_MEMORY: + case VM_REINIT: + /* + * ioctls that operate on the entire virtual machine must + * prevent all vcpus from running. + */ + error = 0; + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { + error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); + if (error) + break; + } + + if (error) { + while (--vcpu >= 0) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + goto done; + } + + state_changed = 2; + break; + + default: + break; + } + + switch(cmd) { + case VM_RUN: + vmrun = (struct vm_run *)data; + error = vm_run(sc->vm, vmrun); + break; + case VM_SUSPEND: + vmsuspend = (struct vm_suspend *)data; + error = vm_suspend(sc->vm, vmsuspend->how); + break; + case VM_REINIT: + error = vm_reinit(sc->vm); + break; + case VM_STAT_DESC: { + statdesc = (struct vm_stat_desc *)data; + error = vmm_stat_desc_copy(statdesc->index, + statdesc->desc, sizeof(statdesc->desc)); + break; + } + case VM_STATS: { + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + vmstats = (struct vm_stats *)data; + getmicrotime(&vmstats->tv); + error = vmm_stat_copy(sc->vm, vmstats->cpuid, + &vmstats->num_entries, vmstats->statbuf); + break; + } + case VM_PPTDEV_MSI: + pptmsi = 
(struct vm_pptdev_msi *)data; + error = ppt_setup_msi(sc->vm, pptmsi->vcpu, + pptmsi->bus, pptmsi->slot, pptmsi->func, + pptmsi->addr, pptmsi->msg, + pptmsi->numvec); + break; + case VM_PPTDEV_MSIX: + pptmsix = (struct vm_pptdev_msix *)data; + error = ppt_setup_msix(sc->vm, pptmsix->vcpu, + pptmsix->bus, pptmsix->slot, + pptmsix->func, pptmsix->idx, + pptmsix->addr, pptmsix->msg, + pptmsix->vector_control); + break; + case VM_MAP_PPTDEV_MMIO: + pptmmio = (struct vm_pptdev_mmio *)data; + error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, + pptmmio->func, pptmmio->gpa, pptmmio->len, + pptmmio->hpa); + break; + case VM_BIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_UNBIND_PPTDEV: + pptdev = (struct vm_pptdev *)data; + error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, + pptdev->func); + break; + case VM_INJECT_EXCEPTION: + vmexc = (struct vm_exception *)data; + error = vm_inject_exception(sc->vm, vmexc->cpuid, + vmexc->vector, vmexc->error_code_valid, vmexc->error_code, + vmexc->restart_instruction); + break; + case VM_INJECT_NMI: + vmnmi = (struct vm_nmi *)data; + error = vm_inject_nmi(sc->vm, vmnmi->cpuid); + break; + case VM_LAPIC_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); + break; + case VM_LAPIC_LOCAL_IRQ: + vmirq = (struct vm_lapic_irq *)data; + error = lapic_set_local_intr(sc->vm, vmirq->cpuid, + vmirq->vector); + break; + case VM_LAPIC_MSI: + vmmsi = (struct vm_lapic_msi *)data; + error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); + break; + case VM_IOAPIC_ASSERT_IRQ: + ioapic_irq = (struct vm_ioapic_irq *)data; + error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); + break; + case VM_IOAPIC_DEASSERT_IRQ: + ioapic_irq = (struct vm_ioapic_irq *)data; + error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); + break; + case VM_IOAPIC_PULSE_IRQ: + ioapic_irq = (struct 
vm_ioapic_irq *)data; + error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); + break; + case VM_IOAPIC_PINCOUNT: + *(int *)data = vioapic_pincount(sc->vm); + break; + case VM_ISA_ASSERT_IRQ: + isa_irq = (struct vm_isa_irq *)data; + error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq); + if (error == 0 && isa_irq->ioapic_irq != -1) + error = vioapic_assert_irq(sc->vm, + isa_irq->ioapic_irq); + break; + case VM_ISA_DEASSERT_IRQ: + isa_irq = (struct vm_isa_irq *)data; + error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq); + if (error == 0 && isa_irq->ioapic_irq != -1) + error = vioapic_deassert_irq(sc->vm, + isa_irq->ioapic_irq); + break; + case VM_ISA_PULSE_IRQ: + isa_irq = (struct vm_isa_irq *)data; + error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq); + if (error == 0 && isa_irq->ioapic_irq != -1) + error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq); + break; + case VM_ISA_SET_IRQ_TRIGGER: + isa_irq_trigger = (struct vm_isa_irq_trigger *)data; + error = vatpic_set_irq_trigger(sc->vm, + isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger); + break; + case VM_MAP_MEMORY: + seg = (struct vm_memory_segment *)data; + error = vm_malloc(sc->vm, seg->gpa, seg->len); + break; + case VM_GET_MEMORY_SEG: + seg = (struct vm_memory_segment *)data; + seg->len = 0; + (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); + error = 0; + break; + case VM_GET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, + &vmreg->regval); + break; + case VM_SET_REGISTER: + vmreg = (struct vm_register *)data; + error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, + vmreg->regval); + break; + case VM_SET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + &vmsegdesc->desc); + break; + case VM_GET_SEGMENT_DESCRIPTOR: + vmsegdesc = (struct vm_seg_desc *)data; + error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, + vmsegdesc->regnum, + 
&vmsegdesc->desc); + break; + case VM_GET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_get_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + &vmcap->capval); + break; + case VM_SET_CAPABILITY: + vmcap = (struct vm_capability *)data; + error = vm_set_capability(sc->vm, vmcap->cpuid, + vmcap->captype, + vmcap->capval); + break; + case VM_SET_X2APIC_STATE: + x2apic = (struct vm_x2apic *)data; + error = vm_set_x2apic_state(sc->vm, + x2apic->cpuid, x2apic->state); + break; + case VM_GET_X2APIC_STATE: + x2apic = (struct vm_x2apic *)data; + error = vm_get_x2apic_state(sc->vm, + x2apic->cpuid, &x2apic->state); + break; + case VM_GET_GPA_PMAP: + gpapte = (struct vm_gpa_pte *)data; + pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), + gpapte->gpa, gpapte->pte, &gpapte->ptenum); + error = 0; + break; + case VM_GET_HPET_CAPABILITIES: + error = vhpet_getcap((struct vm_hpet_cap *)data); + break; + case VM_GLA2GPA: { + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + gg = (struct vm_gla2gpa *)data; + error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla, + gg->prot, &gg->gpa, &gg->fault); + KASSERT(error == 0 || error == EFAULT, + ("%s: vm_gla2gpa unknown error %d", __func__, error)); + break; + } + case VM_ACTIVATE_CPU: + vac = (struct vm_activate_cpu *)data; + error = vm_activate_cpu(sc->vm, vac->vcpuid); + break; + case VM_GET_CPUS: + error = 0; + vm_cpuset = (struct vm_cpuset *)data; + size = vm_cpuset->cpusetsize; + if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) { + error = ERANGE; + break; + } + cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO); + if (vm_cpuset->which == VM_ACTIVE_CPUS) + *cpuset = vm_active_cpus(sc->vm); + else if (vm_cpuset->which == VM_SUSPENDED_CPUS) + *cpuset = vm_suspended_cpus(sc->vm); + else + error = EINVAL; + if (error == 0) + error = copyout(cpuset, vm_cpuset->cpus, size); + free(cpuset, M_TEMP); + break; + case VM_SET_INTINFO: + 
vmii = (struct vm_intinfo *)data; + error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1); + break; + case VM_GET_INTINFO: + vmii = (struct vm_intinfo *)data; + error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1, + &vmii->info2); + break; + case VM_RTC_WRITE: + rtcdata = (struct vm_rtc_data *)data; + error = vrtc_nvram_write(sc->vm, rtcdata->offset, + rtcdata->value); + break; + case VM_RTC_READ: + rtcdata = (struct vm_rtc_data *)data; + error = vrtc_nvram_read(sc->vm, rtcdata->offset, + &rtcdata->value); + break; + case VM_RTC_SETTIME: + rtctime = (struct vm_rtc_time *)data; + error = vrtc_set_time(sc->vm, rtctime->secs); + break; + case VM_RTC_GETTIME: + error = 0; + rtctime = (struct vm_rtc_time *)data; + rtctime->secs = vrtc_get_time(sc->vm); + break; + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(sc->vm, vcpu); + break; + default: + error = ENOTTY; + break; + } + + if (state_changed == 1) { + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + } else if (state_changed == 2) { + for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) + vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + } + +done: + /* Make sure that no handler returns a bogus value like ERESTART */ + KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); + return (error); +} + +static int +vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, + vm_size_t size, struct vm_object **object, int nprot) +{ + int error; + struct vmmdev_softc *sc; + + sc = vmmdev_lookup2(cdev); + if (sc != NULL && (nprot & PROT_EXEC) == 0) + error = vm_get_memobj(sc->vm, *offset, size, offset, object); + else + error = EINVAL; + + return (error); +} + +static void +vmmdev_destroy(void *arg) +{ + + struct vmmdev_softc *sc = arg; + + if (sc->cdev != NULL) + destroy_dev(sc->cdev); + + if (sc->vm != NULL) + vm_destroy(sc->vm); + + if ((sc->flags & VSC_LINKED) != 0) { + mtx_lock(&vmmdev_mtx); + SLIST_REMOVE(&head, sc, vmmdev_softc, link); + mtx_unlock(&vmmdev_mtx); + } + + free(sc, 
M_VMMDEV); +} + +static int +sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[VM_MAX_NAMELEN]; + struct vmmdev_softc *sc; + struct cdev *cdev; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + if (sc == NULL || sc->cdev == NULL) { + mtx_unlock(&vmmdev_mtx); + return (EINVAL); + } + + /* + * The 'cdev' will be destroyed asynchronously when 'si_threadcount' + * goes down to 0 so we should not do it again in the callback. + */ + cdev = sc->cdev; + sc->cdev = NULL; + mtx_unlock(&vmmdev_mtx); + + /* + * Schedule the 'cdev' to be destroyed: + * + * - any new operations on this 'cdev' will return an error (ENXIO). + * + * - when the 'si_threadcount' dwindles down to zero the 'cdev' will + * be destroyed and the callback will be invoked in a taskqueue + * context. + */ + destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); + + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_destroy, "A", NULL); + +static struct cdevsw vmmdevsw = { + .d_name = "vmmdev", + .d_version = D_VERSION, + .d_ioctl = vmmdev_ioctl, + .d_mmap_single = vmmdev_mmap_single, + .d_read = vmmdev_rw, + .d_write = vmmdev_rw, +}; + +static int +sysctl_vmm_create(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vm *vm; + struct cdev *cdev; + struct vmmdev_softc *sc, *sc2; + char buf[VM_MAX_NAMELEN]; + + strlcpy(buf, "beavis", sizeof(buf)); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + mtx_lock(&vmmdev_mtx); + sc = vmmdev_lookup(buf); + mtx_unlock(&vmmdev_mtx); + if (sc != NULL) + return (EEXIST); + + error = vm_create(buf, &vm); + if (error != 0) + return (error); + + sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); + sc->vm = vm; + + /* + * Lookup the name again just in case 
somebody sneaked in when we + * dropped the lock. + */ + mtx_lock(&vmmdev_mtx); + sc2 = vmmdev_lookup(buf); + if (sc2 == NULL) { + SLIST_INSERT_HEAD(&head, sc, link); + sc->flags |= VSC_LINKED; + } + mtx_unlock(&vmmdev_mtx); + + if (sc2 != NULL) { + vmmdev_destroy(sc); + return (EEXIST); + } + + error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, + UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); + if (error != 0) { + vmmdev_destroy(sc); + return (error); + } + + mtx_lock(&vmmdev_mtx); + sc->cdev = cdev; + sc->cdev->si_drv1 = sc; + mtx_unlock(&vmmdev_mtx); + + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_vmm_create, "A", NULL); + +void +vmmdev_init(void) +{ + mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); +} + +int +vmmdev_cleanup(void) +{ + int error; + + if (SLIST_EMPTY(&head)) + error = 0; + else + error = EBUSY; + + return (error); +} diff --git a/vmm/vmm_host.c b/vmm/vmm_host.c new file mode 100644 index 0000000..9e5b966 --- /dev/null +++ b/vmm/vmm_host.c @@ -0,0 +1,161 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include "vmm_host.h" + +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4, + vmm_host_xcr0; +static struct xsave_limits vmm_xsave_limits; + +void +vmm_host_state_init(void) +{ + int regs[4]; + + vmm_host_efer = rdmsr(MSR_EFER); + vmm_host_pat = rdmsr(MSR_PAT); + + /* + * We always want CR0.TS to be set when the processor does a VM exit. + * + * With emulation turned on unconditionally after a VM exit, we are + * able to trap inadvertent use of the FPU until the guest FPU state + * has been safely squirreled away. + */ + vmm_host_cr0 = rcr0() | CR0_TS; + + vmm_host_cr4 = rcr4(); + + /* + * Only permit a guest to use XSAVE if the host is using + * XSAVE. Only permit a guest to use XSAVE features supported + * by the host. This ensures that the FPU state used by the + * guest is always a subset of the saved guest FPU state. + * + * In addition, only permit known XSAVE features where the + * rules for which features depend on other features is known + * to properly emulate xsetbv. 
+ */ + if (vmm_host_cr4 & CR4_XSAVE) { + vmm_xsave_limits.xsave_enabled = 1; + vmm_host_xcr0 = rxcr(0); + vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 & + (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512); + + cpuid_count(0xd, 0x0, regs); + vmm_xsave_limits.xsave_max_size = regs[1]; + } +} + +uint64_t +vmm_get_host_pat(void) +{ + + return (vmm_host_pat); +} + +uint64_t +vmm_get_host_efer(void) +{ + + return (vmm_host_efer); +} + +uint64_t +vmm_get_host_cr0(void) +{ + + return (vmm_host_cr0); +} + +uint64_t +vmm_get_host_cr4(void) +{ + + return (vmm_host_cr4); +} + +uint64_t +vmm_get_host_xcr0(void) +{ + + return (vmm_host_xcr0); +} + +uint64_t +vmm_get_host_datasel(void) +{ + + return (GSEL(GDATA_SEL, SEL_KPL)); + +} + +uint64_t +vmm_get_host_codesel(void) +{ + + return (GSEL(GCODE_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_tsssel(void) +{ + + return (GSEL(GPROC0_SEL, SEL_KPL)); +} + +uint64_t +vmm_get_host_fsbase(void) +{ + + return (0); +} + +uint64_t +vmm_get_host_idtrbase(void) +{ + + return (r_idt.rd_base); +} + +const struct xsave_limits * +vmm_get_xsave_limits(void) +{ + + return (&vmm_xsave_limits); +} diff --git a/vmm/vmm_host.h b/vmm/vmm_host.h new file mode 100644 index 0000000..95618ff --- /dev/null +++ b/vmm/vmm_host.h @@ -0,0 +1,83 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_HOST_H_ +#define _VMM_HOST_H_ + +#ifndef _KERNEL +#error "no user-servicable parts inside" +#endif + +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; + +void vmm_host_state_init(void); + +uint64_t vmm_get_host_pat(void); +uint64_t vmm_get_host_efer(void); +uint64_t vmm_get_host_cr0(void); +uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_xcr0(void); +uint64_t vmm_get_host_datasel(void); +uint64_t vmm_get_host_codesel(void); +uint64_t vmm_get_host_tsssel(void); +uint64_t vmm_get_host_fsbase(void); +uint64_t vmm_get_host_idtrbase(void); +const struct xsave_limits *vmm_get_xsave_limits(void); + +/* + * Inline access to host state that is used on every VM entry + */ +static __inline uint64_t +vmm_get_host_trbase(void) +{ + + return ((uint64_t)PCPU_GET(tssp)); +} + +static __inline uint64_t +vmm_get_host_gdtrbase(void) +{ + + return ((uint64_t)&gdt[NGDT * curcpu]); +} + +struct pcpu; +extern struct pcpu __pcpu[]; + +static __inline uint64_t +vmm_get_host_gsbase(void) +{ + + return ((uint64_t)&__pcpu[curcpu]); +} + +#endif diff --git a/vmm/vmm_instruction_emul.c b/vmm/vmm_instruction_emul.c new file mode 100644 index 
0000000..9c6158a --- /dev/null +++ b/vmm/vmm_instruction_emul.c @@ -0,0 +1,2407 @@ +/*- + * Copyright (c) 2012 Sandvine, Inc. + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#ifdef _KERNEL +#include +#include +#include +#include + +#include +#include + +#include +#include +#else /* !_KERNEL */ +#include +#include +#include + +#include + +#include +#include +#define KASSERT(exp,msg) assert((exp)) +#endif /* _KERNEL */ + +#include +#include +#include + +/* struct vie_op.op_type */ +enum { + VIE_OP_TYPE_NONE = 0, + VIE_OP_TYPE_MOV, + VIE_OP_TYPE_MOVSX, + VIE_OP_TYPE_MOVZX, + VIE_OP_TYPE_AND, + VIE_OP_TYPE_OR, + VIE_OP_TYPE_SUB, + VIE_OP_TYPE_TWO_BYTE, + VIE_OP_TYPE_PUSH, + VIE_OP_TYPE_CMP, + VIE_OP_TYPE_POP, + VIE_OP_TYPE_MOVS, + VIE_OP_TYPE_GROUP1, + VIE_OP_TYPE_STOS, + VIE_OP_TYPE_BITTEST, + VIE_OP_TYPE_LAST +}; + +/* struct vie_op.op_flags */ +#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */ +#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ +#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */ +#define VIE_OP_F_NO_MODRM (1 << 3) +#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) + +static const struct vie_op two_byte_opcodes[256] = { + [0xB6] = { + .op_byte = 0xB6, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xB7] = { + .op_byte = 0xB7, + .op_type = VIE_OP_TYPE_MOVZX, + }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, + [0xBE] = { + .op_byte = 0xBE, + .op_type = VIE_OP_TYPE_MOVSX, + }, +}; + +static const struct vie_op one_byte_opcodes[256] = { + [0x0F] = { + .op_byte = 0x0F, + .op_type = VIE_OP_TYPE_TWO_BYTE + }, + [0x2B] = { + .op_byte = 0x2B, + .op_type = VIE_OP_TYPE_SUB, + }, + [0x3B] = { + .op_byte = 0x3B, + .op_type = VIE_OP_TYPE_CMP, + }, + [0x88] = { + .op_byte = 0x88, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x89] = { + .op_byte = 0x89, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8A] = { + .op_byte = 0x8A, + .op_type = VIE_OP_TYPE_MOV, + }, + [0x8B] = { + .op_byte = 0x8B, + .op_type = VIE_OP_TYPE_MOV, + }, + [0xA1] = { + .op_byte = 0xA1, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = 
VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA3] = { + .op_byte = 0xA3, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM, + }, + [0xA4] = { + .op_byte = 0xA4, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xA5] = { + .op_byte = 0xA5, + .op_type = VIE_OP_TYPE_MOVS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAA] = { + .op_byte = 0xAA, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xAB] = { + .op_byte = 0xAB, + .op_type = VIE_OP_TYPE_STOS, + .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION + }, + [0xC6] = { + /* XXX Group 11 extended opcode - not just MOV */ + .op_byte = 0xC6, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM8, + }, + [0xC7] = { + .op_byte = 0xC7, + .op_type = VIE_OP_TYPE_MOV, + .op_flags = VIE_OP_F_IMM, + }, + [0x23] = { + .op_byte = 0x23, + .op_type = VIE_OP_TYPE_AND, + }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x81] = { + /* Group 1 extended opcode */ + .op_byte = 0x81, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM, + }, + [0x83] = { + /* Group 1 extended opcode */ + .op_byte = 0x83, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, + [0x8F] = { + /* XXX Group 1A extended opcode - not just POP */ + .op_byte = 0x8F, + .op_type = VIE_OP_TYPE_POP, + }, + [0xFF] = { + /* XXX Group 5 extended opcode - not just PUSH */ + .op_byte = 0xFF, + .op_type = VIE_OP_TYPE_PUSH, + } +}; + +/* struct vie.mod */ +#define VIE_MOD_INDIRECT 0 +#define VIE_MOD_INDIRECT_DISP8 1 +#define VIE_MOD_INDIRECT_DISP32 2 +#define VIE_MOD_DIRECT 3 + +/* struct vie.rm */ +#define VIE_RM_SIB 4 +#define VIE_RM_DISP32 5 + +#define GB (1024 * 1024 * 1024) + +static enum vm_reg_name gpr_map[16] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + 
VM_REG_GUEST_RBX, + VM_REG_GUEST_RSP, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15 +}; + +static uint64_t size2mask[] = { + [1] = 0xff, + [2] = 0xffff, + [4] = 0xffffffff, + [8] = 0xffffffffffffffff, +}; + +static int +vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) +{ + int error; + + error = vm_get_register(vm, vcpuid, reg, rval); + + return (error); +} + +static void +vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) +{ + *lhbr = 0; + *reg = gpr_map[vie->reg]; + + /* + * 64-bit mode imposes limitations on accessing legacy high byte + * registers (lhbr). + * + * The legacy high-byte registers cannot be addressed if the REX + * prefix is present. In this case the values 4, 5, 6 and 7 of the + * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. + * + * If the REX prefix is not present then the values 4, 5, 6 and 7 + * of the 'ModRM:reg' field address the legacy high-byte registers, + * %ah, %ch, %dh and %bh respectively. + */ + if (!vie->rex_present) { + if (vie->reg & 0x4) { + *lhbr = 1; + *reg = gpr_map[vie->reg & 0x3]; + } + } +} + +static int +vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +{ + uint64_t val; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &val); + + /* + * To obtain the value of a legacy high byte register shift the + * base register right by 8 bits (%ah = %rax >> 8). 
+ */ + if (lhbr) + *rval = val >> 8; + else + *rval = val; + return (error); +} + +static int +vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +{ + uint64_t origval, val, mask; + int error, lhbr; + enum vm_reg_name reg; + + vie_calc_bytereg(vie, ®, &lhbr); + error = vm_get_register(vm, vcpuid, reg, &origval); + if (error == 0) { + val = byte; + mask = 0xff; + if (lhbr) { + /* + * Shift left by 8 to store 'byte' in a legacy high + * byte register. + */ + val <<= 8; + mask <<= 8; + } + val |= origval & ~mask; + error = vm_set_register(vm, vcpuid, reg, val); + } + return (error); +} + +int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size) +{ + int error; + uint64_t origval; + + switch (size) { + case 1: + case 2: + error = vie_read_register(vm, vcpuid, reg, &origval); + if (error) + return (error); + val &= size2mask[size]; + val |= origval & ~size2mask[size]; + break; + case 4: + val &= 0xffffffffUL; + break; + case 8: + break; + default: + return (EINVAL); + } + + error = vm_set_register(vm, vcpuid, reg, val); + return (error); +} + +#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) + +/* + * Return the status flags that would result from doing (x - y). 
+ */ +#define GETCC(sz) \ +static u_long \ +getcc##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("sub %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETCC(8); +GETCC(16); +GETCC(32); +GETCC(64); + +static u_long +getcc(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getcc: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getcc8(x, y)); + else if (opsize == 2) + return (getcc16(x, y)); + else if (opsize == 4) + return (getcc32(x, y)); + else + return (getcc64(x, y)); +} + +static int +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint8_t byte; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x88: + /* + * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) + * 88/r: mov r/m8, r8 + * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) + */ + size = 1; /* override for byte operation */ + error = vie_read_bytereg(vm, vcpuid, vie, &byte); + if (error == 0) + error = memwrite(vm, vcpuid, gpa, byte, size, arg); + break; + case 0x89: + /* + * MOV from reg (ModRM:reg) to mem (ModRM:r/m) + * 89/r: mov r/m16, r16 + * 89/r: mov r/m32, r32 + * REX.W + 89/r mov r/m64, r64 + */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0x8A: + /* + * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg) + * 8A/r: mov r8, r/m8 + * REX + 8A/r: mov r8, r/m8 + */ + size = 1; /* override for byte operation */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) + error = vie_write_bytereg(vm, vcpuid, vie, val); + break; + case 0x8B: + /* + * MOV from mem (ModRM:r/m) to reg 
(ModRM:reg) + * 8B/r: mov r16, r/m16 + * 8B/r: mov r32, r/m32 + * REX.W 8B/r: mov r64, r/m64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = gpr_map[vie->reg]; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA1: + /* + * MOV from seg:moffset to AX/EAX/RAX + * A1: mov AX, moffs16 + * A1: mov EAX, moffs32 + * REX.W + A1: mov RAX, moffs64 + */ + error = memread(vm, vcpuid, gpa, &val, size, arg); + if (error == 0) { + reg = VM_REG_GUEST_RAX; + error = vie_update_register(vm, vcpuid, reg, val, size); + } + break; + case 0xA3: + /* + * MOV from AX/EAX/RAX to seg:moffset + * A3: mov moffs16, AX + * A3: mov moffs32, EAX + * REX.W + A3: mov moffs64, RAX + */ + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + if (error == 0) { + val &= size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + } + break; + case 0xC6: + /* + * MOV from imm8 to mem (ModRM:r/m) + * C6/0 mov r/m8, imm8 + * REX + C6/0 mov r/m8, imm8 + */ + size = 1; /* override for byte operation */ + error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); + break; + case 0xC7: + /* + * MOV from imm16/imm32 to mem (ModRM:r/m) + * C7/0 mov r/m16, imm16 + * C7/0 mov r/m32, imm32 + * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) + */ + val = vie->immediate & size2mask[size]; + error = memwrite(vm, vcpuid, gpa, val, size, arg); + break; + default: + break; + } + + return (error); +} + +static int +emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, + void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t val; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0xB6: + /* + * MOV and zero extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). 
+ * + * 0F B6/r movzx r16, r/m8 + * 0F B6/r movzx r32, r/m8 + * REX.W + 0F B6/r movzx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* zero-extend byte */ + val = (uint8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xB7: + /* + * MOV and zero extend word from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F B7/r movzx r32, r/m16 + * REX.W + 0F B7/r movzx r64, r/m16 + */ + error = memread(vm, vcpuid, gpa, &val, 2, arg); + if (error) + return (error); + + reg = gpr_map[vie->reg]; + + /* zero-extend word */ + val = (uint16_t)val; + + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + case 0xBE: + /* + * MOV and sign extend byte from mem (ModRM:r/m) to + * reg (ModRM:reg). + * + * 0F BE/r movsx r16, r/m8 + * 0F BE/r movsx r32, r/m8 + * REX.W + 0F BE/r movsx r64, r/m8 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val, 1, arg); + if (error) + break; + + /* get the second operand */ + reg = gpr_map[vie->reg]; + + /* sign extend byte */ + val = (int8_t)val; + + /* write the result */ + error = vie_update_register(vm, vcpuid, reg, val, size); + break; + default: + break; + } + return (error); +} + +/* + * Helper function to calculate and validate a linear address. 
+ */ +static int +get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, + int opsize, int addrsize, int prot, enum vm_reg_name seg, + enum vm_reg_name gpr, uint64_t *gla, int *fault) +{ + struct seg_desc desc; + uint64_t cr0, val, rflags; + int error; + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vm_get_seg_desc(vm, vcpuid, seg, &desc); + KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", + __func__, error, seg)); + + error = vie_read_register(vm, vcpuid, gpr, &val); + KASSERT(error == 0, ("%s: error %d getting register %d", __func__, + error, gpr)); + + if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize, + addrsize, prot, gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + goto guest_fault; + } + + if (vie_canonical_check(paging->cpu_mode, *gla)) { + if (seg == VM_REG_GUEST_SS) + vm_inject_ss(vm, vcpuid, 0); + else + vm_inject_gp(vm, vcpuid); + goto guest_fault; + } + + if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { + vm_inject_ac(vm, vcpuid, 0); + goto guest_fault; + } + + *fault = 0; + return (0); + +guest_fault: + *fault = 1; + return (0); +} + +static int +emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; +#else + struct iovec copyinfo[2]; +#endif + uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; + uint64_t rcx, rdi, rsi, rflags; + int error, fault, opsize, seg, repeat; + + opsize = (vie->op.op_byte == 0xA4) ? 
1 : vie->opsize; + val = 0; + error = 0; + + /* + * XXX although the MOVS instruction is only supposed to be used with + * the "rep" prefix some guests like FreeBSD will use "repnz" instead. + * + * Empirically the "repnz" prefix has identical behavior to "rep" + * and the zero flag does not make a difference. + */ + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } + } + + /* + * Source Destination Comments + * -------------------------------------------- + * (1) memory memory n/a + * (2) memory mmio emulated + * (3) mmio memory emulated + * (4) mmio mmio emulated + * + * At this point we don't have sufficient information to distinguish + * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this + * out because it will succeed only when operating on regular memory. + * + * XXX the emulation doesn't properly handle the case where 'gpa' + * is straddling the boundary between the normal memory and MMIO. + */ + + seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, + copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (2): read from system memory and write to mmio. 
+ */ + vm_copyin(vm, vcpuid, copyinfo, &val, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + goto done; + } else { + /* + * 'vm_copy_setup()' is expected to fail for cases (3) and (4) + * if 'srcaddr' is in the mmio space. + */ + + error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, + &fault); + if (error || fault) + goto done; + + error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, + PROT_WRITE, copyinfo, nitems(copyinfo), &fault); + if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + + /* + * case (3): read from MMIO and write to system memory. + * + * A MMIO read can have side-effects so we + * commit to it only after vm_copy_setup() is + * successful. If a page-fault needs to be + * injected into the guest then it will happen + * before the MMIO read is attempted. + */ + error = memread(vm, vcpuid, gpa, &val, opsize, arg); + if (error) + goto done; + + vm_copyout(vm, vcpuid, &val, copyinfo, opsize); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + } else { + /* + * Case (4): read from and write to mmio. + * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. 
+ */ + error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, + PROT_READ, &srcgpa, &fault); + if (error || fault) + goto done; + + error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, + PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + if (error) + goto done; + + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + if (error) + goto done; + } + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) { + rsi -= opsize; + rdi -= opsize; + } else { + rsi += opsize; + rdi += opsize; + } + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error)); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } +done: + KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", + __func__, error)); + return (error); +} + +static int +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error, opsize, repeat; + uint64_t val; + uint64_t rcx, rdi, rflags; + + opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize; + repeat = vie->repz_present | vie->repnz_present; + + if (repeat) { + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); + + /* + * The count register is %rcx, %ecx or %cx depending on the + * address size of the instruction. + */ + if ((rcx & vie_size2mask(vie->addrsize)) == 0) + return (0); + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + KASSERT(!error, ("%s: error %d getting rax", __func__, error)); + + error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + if (rflags & PSL_D) + rdi -= opsize; + else + rdi += opsize; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi, + vie->addrsize); + KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error)); + + if (repeat) { + rcx = rcx - 1; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX, + rcx, vie->addrsize); + KASSERT(!error, ("%s: error %d updating rcx", __func__, error)); + + /* + * Repeat the instruction if the count register is not zero. 
+ */ + if ((rcx & vie_size2mask(vie->addrsize)) != 0) + vm_restart_instruction(vm, vcpuid); + } + + return (0); +} + +static int +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x23: + /* + * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 23/r and r16, r/m16 + * 23/r and r32, r/m32 + * REX.W + 23/r and r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 & val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; + case 0x81: + case 0x83: + /* + * AND mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /4 and r/m16, imm16 + * 81 /4 and r/m32, imm32 + * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64 + * + * 83 /4 and r/m16, imm8 sign-extended to 16 + * 83 /4 and r/m32, imm8 sign-extended to 32 + * REX.W + 83/4 and r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 & vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. 
+ * + * The updated status flags are obtained by subtracting 0 from 'result'. + */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t val1, result, rflags, rflags2; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x81: + case 0x83: + /* + * OR mem (ModRM:r/m) with immediate and store the + * result in mem. + * + * 81 /1 or r/m16, imm16 + * 81 /1 or r/m32, imm32 + * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64 + * + * 83 /1 or r/m16, imm8 sign-extended to 16 + * 83 /1 or r/m32, imm8 sign-extended to 32 + * REX.W + 83/1 or r/m64, imm8 sign-extended to 64 + */ + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &val1, size, arg); + if (error) + break; + + /* + * perform the operation with the pre-fetched immediate + * operand and write the result + */ + result = val1 | vie->immediate; + error = memwrite(vm, vcpuid, gpa, result, size, arg); + break; + default: + break; + } + if (error) + return (error); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + + /* + * OF and CF are cleared; the SF, ZF and PF flags are set according + * to the result; AF is undefined. + * + * The updated status flags are obtained by subtracting 0 from 'result'. 
+ */ + rflags2 = getcc(size, result, 0); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N); + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t op1, op2, rflags, rflags2; + enum vm_reg_name reg; + + size = vie->opsize; + switch (vie->op.op_byte) { + case 0x3B: + /* + * 3B/r CMP r16, r/m16 + * 3B/r CMP r32, r/m32 + * REX.W + 3B/r CMP r64, r/m64 + * + * Compare first operand (reg) with second operand (r/m) and + * set status flags in EFLAGS register. The comparison is + * performed by subtracting the second operand from the first + * operand and then setting the status flags. + */ + + /* Get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &op1); + if (error) + return (error); + + /* Get the second operand */ + error = memread(vm, vcpuid, gpa, &op2, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, op2); + break; + case 0x80: + case 0x81: + case 0x83: + /* + * 80 /7 cmp r/m8, imm8 + * REX + 80 /7 cmp r/m8, imm8 + * + * 81 /7 cmp r/m16, imm16 + * 81 /7 cmp r/m32, imm32 + * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 + * + * 83 /7 cmp r/m16, imm8 sign-extended to 16 + * 83 /7 cmp r/m32, imm8 sign-extended to 32 + * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64 + * + * Compare mem (ModRM:r/m) with immediate and set + * status flags according to the results. The + * comparison is performed by subtracting the + * immediate from the first operand and then setting + * the status flags. 
+ * + */ + if (vie->op.op_byte == 0x80) + size = 1; + + /* get the first operand */ + error = memread(vm, vcpuid, gpa, &op1, size, arg); + if (error) + return (error); + + rflags2 = getcc(size, op1, vie->immediate); + break; + default: + return (EINVAL); + } + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + if (error) + return (error); + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + return (error); +} + +static int +emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x2B: + /* + * SUB r/m from r and store the result in r + * + * 2B/r SUB r16, r/m16 + * 2B/r SUB r32, r/m32 + * REX.W + 2B/r SUB r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 - val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getcc(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int +emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ +#ifdef _KERNEL + struct vm_copyinfo copyinfo[2]; 
+#else + struct iovec copyinfo[2]; +#endif + struct seg_desc ss_desc; + uint64_t cr0, rflags, rsp, stack_gla, val; + int error, fault, size, stackaddrsize, pushop; + + val = 0; + size = vie->opsize; + pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0; + + /* + * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1 + */ + if (paging->cpu_mode == CPU_MODE_REAL) { + stackaddrsize = 2; + } else if (paging->cpu_mode == CPU_MODE_64BIT) { + /* + * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3 + * - Stack pointer size is always 64-bits. + * - PUSH/POP of 32-bit values is not possible in 64-bit mode. + * - 16-bit PUSH/POP is supported by using the operand size + * override prefix (66H). + */ + stackaddrsize = 8; + size = vie->opsize_override ? 2 : 8; + } else { + /* + * In protected or compatibility mode the 'B' flag in the + * stack-segment descriptor determines the size of the + * stack pointer. + */ + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + if (SEG_DESC_DEF32(ss_desc.access)) + stackaddrsize = 4; + else + stackaddrsize = 2; + } + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); + if (pushop) { + rsp -= size; + } + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, size, stackaddrsize, pushop ? 
PROT_WRITE : PROT_READ, + &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); + + if (pushop) { + error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); + if (error == 0) + vm_copyout(vm, vcpuid, &val, copyinfo, size); + } else { + vm_copyin(vm, vcpuid, copyinfo, &val, size); + error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); + rsp += size; + } + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + if (error == 0) { + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp, + stackaddrsize); + KASSERT(error == 0, ("error %d updating rsp", error)); + } + return (error); +} + +static int +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * PUSH is part of the group 5 extended opcodes and is identified + * by ModRM:reg = b110. + */ + if ((vie->reg & 7) != 6) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *arg) +{ + int error; + + /* + * Table A-6, "Opcode Extensions", Intel SDM, Vol 2. + * + * POP is part of the group 1A extended opcodes and is identified + * by ModRM:reg = b000. 
+ */ + if ((vie->reg & 7) != 0) + return (EINVAL); + + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, + memwrite, arg); + return (error); +} + +static int +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + switch (vie->reg & 7) { + case 0x1: /* OR */ + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x4: /* AND */ + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case 0x7: /* CMP */ + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +static int +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. 
+ */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +int +vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t memread, + mem_region_write_t memwrite, void *memarg) +{ + int error; + + if (!vie->decoded) + return (EINVAL); + + switch (vie->op.op_type) { + case VIE_OP_TYPE_GROUP1: + error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_POP: + error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_PUSH: + error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_CMP: + error = emulate_cmp(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOV: + error = emulate_mov(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVSX: + case VIE_OP_TYPE_MOVZX: + error = emulate_movx(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_MOVS: + error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case VIE_OP_TYPE_STOS: + error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, + memwrite, memarg); + break; + case 
VIE_OP_TYPE_AND: + error = emulate_and(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_OR: + error = emulate_or(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_SUB: + error = emulate_sub(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("%s: invalid size %d", __func__, size)); + KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +int +vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) +{ + uint64_t mask; + + if (cpu_mode != CPU_MODE_64BIT) + return (0); + + /* + * The value of the bit 47 in the 'gla' should be replicated in the + * most significant 16 bits. 
+ */ + mask = ~((1UL << 48) - 1); + if (gla & (1UL << 47)) + return ((gla & mask) != mask); + else + return ((gla & mask) != 0); +} + +uint64_t +vie_size2mask(int size) +{ + KASSERT(size == 1 || size == 2 || size == 4 || size == 8, + ("vie_size2mask: invalid size %d", size)); + return (size2mask[size]); +} + +int +vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, + ("%s: invalid segment %d", __func__, seg)); + KASSERT(length == 1 || length == 2 || length == 4 || length == 8, + ("%s: invalid operand size %d", __func__, length)); + KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, + ("%s: invalid prot %#x", __func__, prot)); + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " + "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); + glasize = 8; + } else { + KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " + "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ + KASSERT(SEG_DESC_PRESENT(desc->access), + ("segment %d not present: %#x", seg, desc->access)); + + /* + * The descriptor type must indicate a code/data segment. 
+ */ + type = SEG_DESC_TYPE(desc->access); + KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " + "descriptor type %#x", seg, type)); + + if (prot & PROT_READ) { + /* #GP on a read access to an execute-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. + */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= vie_size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. + */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. 
+ */ + firstoff &= vie_size2mask(addrsize); + *gla = (segbase + firstoff) & vie_size2mask(glasize); + return (0); +} + +#ifdef _KERNEL +void +vie_init(struct vie *vie, const char *inst_bytes, int inst_length) +{ + KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE, + ("%s: invalid instruction length (%d)", __func__, inst_length)); + + bzero(vie, sizeof(struct vie)); + + vie->base_register = VM_REG_LAST; + vie->index_register = VM_REG_LAST; + vie->segment_register = VM_REG_LAST; + + if (inst_length) { + bcopy(inst_bytes, vie->inst, inst_length); + vie->num_valid = inst_length; + } +} + +static int +pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) +{ + int error_code = 0; + + if (pte & PG_V) + error_code |= PGEX_P; + if (prot & VM_PROT_WRITE) + error_code |= PGEX_W; + if (usermode) + error_code |= PGEX_U; + if (rsvd) + error_code |= PGEX_RSV; + if (prot & VM_PROT_EXECUTE) + error_code |= PGEX_I; + + return (error_code); +} + +static void +ptp_release(void **cookie) +{ + if (*cookie != NULL) { + vm_gpa_release(*cookie); + *cookie = NULL; + } +} + +static void * +ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) +{ + void *ptr; + + ptp_release(cookie); + ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + return (ptr); +} + +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; + u_int retries; + uint64_t *ptpbase, ptpphys, pte, pgsize; + uint32_t *ptpbase32, pte32; + void *cookie; + + *guest_fault = 0; + + usermode = (paging->cpl == 3 ? 
1 : 0); + writable = prot & VM_PROT_WRITE; + cookie = NULL; + retval = 0; + retries = 0; +restart: + ptpphys = paging->cr3; /* root of the page tables */ + ptp_release(&cookie); + if (retries++ > 0) + maybe_yield(); + + if (vie_canonical_check(paging->cpu_mode, gla)) { + /* + * XXX assuming a non-stack reference otherwise a stack fault + * should be generated. + */ + vm_inject_gp(vm, vcpuid); + goto fault; + } + + if (paging->paging_mode == PAGING_MODE_FLAT) { + *gpa = gla; + goto done; + } + + if (paging->paging_mode == PAGING_MODE_32) { + nlevels = 2; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits. */ + ptpphys &= ~0xfff; + + ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + + if (ptpbase32 == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 10; + ptpindex = (gla >> ptpshift) & 0x3FF; + pgsize = 1UL << ptpshift; + + pte32 = ptpbase32[ptpindex]; + + if ((pte32 & PG_V) == 0 || + (usermode && (pte32 & PG_U) == 0) || + (writable && (pte32 & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* + * Emulate the x86 MMU's management of the accessed + * and dirty flags. While the accessed flag is set + * at every level of the page table, the dirty flag + * is only set at the last level providing the guest + * physical address. 
+ */ + if ((pte32 & PG_A) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_A) == 0) { + goto restart; + } + } + + /* XXX must be ignored if CR4.PSE=0 */ + if (nlevels > 0 && (pte32 & PG_PS) != 0) + break; + + ptpphys = pte32; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte32 & PG_M) == 0) { + if (atomic_cmpset_32(&ptpbase32[ptpindex], + pte32, pte32 | PG_M) == 0) { + goto restart; + } + } + + /* Zero out the lower 'ptpshift' bits */ + pte32 >>= ptpshift; pte32 <<= ptpshift; + *gpa = pte32 | (gla & (pgsize - 1)); + goto done; + } + + if (paging->paging_mode == PAGING_MODE_PAE) { + /* Zero out the lower 5 bits and the upper 32 bits */ + ptpphys &= 0xffffffe0UL; + + ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); + if (ptpbase == NULL) + goto error; + + ptpindex = (gla >> 30) & 0x3; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + ptpphys = pte; + + nlevels = 2; + } else + nlevels = 4; + while (--nlevels >= 0) { + /* Zero out the lower 12 bits and the upper 12 bits */ + ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; + + ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + if (ptpbase == NULL) + goto error; + + ptpshift = PAGE_SHIFT + nlevels * 9; + ptpindex = (gla >> ptpshift) & 0x1FF; + pgsize = 1UL << ptpshift; + + pte = ptpbase[ptpindex]; + + if ((pte & PG_V) == 0 || + (usermode && (pte & PG_U) == 0) || + (writable && (pte & PG_RW) == 0)) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + goto fault; + } + + /* Set the accessed bit in the page table entry */ + if ((pte & PG_A) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], + pte, pte | PG_A) == 0) { + goto restart; + } + } + + if (nlevels > 0 && (pte & PG_PS) != 0) { + if (pgsize > 1 * GB) { + pfcode = pf_error_code(usermode, prot, 1, pte); + vm_inject_pf(vm, 
vcpuid, pfcode, gla); + goto fault; + } + break; + } + + ptpphys = pte; + } + + /* Set the dirty bit in the page table entry if necessary */ + if (writable && (pte & PG_M) == 0) { + if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) + goto restart; + } + + /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ + pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; + *gpa = pte | (gla & (pgsize - 1)); +done: + ptp_release(&cookie); + KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", + __func__, retval)); + return (retval); +error: + retval = EFAULT; + goto done; +fault: + *guest_fault = 1; + goto done; +} + +int +vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t rip, int inst_length, struct vie *vie, int *faultptr) +{ + struct vm_copyinfo copyinfo[2]; + int error, prot; + + if (inst_length > VIE_INST_SIZE) + panic("vmm_fetch_instruction: invalid length %d", inst_length); + + prot = PROT_READ | PROT_EXEC; + error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, + copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + return (0); +} + +static int +vie_peek(struct vie *vie, uint8_t *x) +{ + + if (vie->num_processed < vie->num_valid) { + *x = vie->inst[vie->num_processed]; + return (0); + } else + return (-1); +} + +static void +vie_advance(struct vie *vie) +{ + + vie->num_processed++; +} + +static bool +segment_override(uint8_t x, int *seg) +{ + + switch (x) { + case 0x2E: + *seg = VM_REG_GUEST_CS; + break; + case 0x36: + *seg = VM_REG_GUEST_SS; + break; + case 0x3E: + *seg = VM_REG_GUEST_DS; + break; + case 0x26: + *seg = VM_REG_GUEST_ES; + break; + case 0x64: + *seg = VM_REG_GUEST_FS; + break; + case 0x65: + *seg = VM_REG_GUEST_GS; + break; + default: + return (false); + } + return (true); +} 
+ +static int +decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d) +{ + uint8_t x; + + while (1) { + if (vie_peek(vie, &x)) + return (-1); + + if (x == 0x66) + vie->opsize_override = 1; + else if (x == 0x67) + vie->addrsize_override = 1; + else if (x == 0xF3) + vie->repz_present = 1; + else if (x == 0xF2) + vie->repnz_present = 1; + else if (segment_override(x, &vie->segment_register)) + vie->segment_override = 1; + else + break; + + vie_advance(vie); + } + + /* + * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2: + * - Only one REX prefix is allowed per instruction. + * - The REX prefix must immediately precede the opcode byte or the + * escape opcode byte. + * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3) + * the mandatory prefix must come before the REX prefix. + */ + if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) { + vie->rex_present = 1; + vie->rex_w = x & 0x8 ? 1 : 0; + vie->rex_r = x & 0x4 ? 1 : 0; + vie->rex_x = x & 0x2 ? 1 : 0; + vie->rex_b = x & 0x1 ? 1 : 0; + vie_advance(vie); + } + + /* + * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1 + */ + if (cpu_mode == CPU_MODE_64BIT) { + /* + * Default address size is 64-bits and default operand size + * is 32-bits. + */ + vie->addrsize = vie->addrsize_override ? 4 : 8; + if (vie->rex_w) + vie->opsize = 8; + else if (vie->opsize_override) + vie->opsize = 2; + else + vie->opsize = 4; + } else if (cs_d) { + /* Default address and operand sizes are 32-bits */ + vie->addrsize = vie->addrsize_override ? 2 : 4; + vie->opsize = vie->opsize_override ? 2 : 4; + } else { + /* Default address and operand sizes are 16-bits */ + vie->addrsize = vie->addrsize_override ? 4 : 2; + vie->opsize = vie->opsize_override ? 
4 : 2; + } + return (0); +} + +static int +decode_two_byte_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = two_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + return (0); +} + +static int +decode_opcode(struct vie *vie) +{ + uint8_t x; + + if (vie_peek(vie, &x)) + return (-1); + + vie->op = one_byte_opcodes[x]; + + if (vie->op.op_type == VIE_OP_TYPE_NONE) + return (-1); + + vie_advance(vie); + + if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) + return (decode_two_byte_opcode(vie)); + + return (0); +} + +static int +decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) +{ + uint8_t x; + + if (vie->op.op_flags & VIE_OP_F_NO_MODRM) + return (0); + + if (cpu_mode == CPU_MODE_REAL) + return (-1); + + if (vie_peek(vie, &x)) + return (-1); + + vie->mod = (x >> 6) & 0x3; + vie->rm = (x >> 0) & 0x7; + vie->reg = (x >> 3) & 0x7; + + /* + * A direct addressing mode makes no sense in the context of an EPT + * fault. There has to be a memory access involved to cause the + * EPT fault. + */ + if (vie->mod == VIE_MOD_DIRECT) + return (-1); + + if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || + (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { + /* + * Table 2-5: Special Cases of REX Encodings + * + * mod=0, r/m=5 is used in the compatibility mode to + * indicate a disp32 without a base register. + * + * mod!=3, r/m=4 is used in the compatibility mode to + * indicate that the SIB byte is present. + * + * The 'b' bit in the REX prefix is don't care in + * this case. 
+ */ + } else { + vie->rm |= (vie->rex_b << 3); + } + + vie->reg |= (vie->rex_r << 3); + + /* SIB */ + if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) + goto done; + + vie->base_register = gpr_map[vie->rm]; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + case VIE_MOD_INDIRECT: + if (vie->rm == VIE_RM_DISP32) { + vie->disp_bytes = 4; + /* + * Table 2-7. RIP-Relative Addressing + * + * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 + * whereas in compatibility mode it just implies disp32. + */ + + if (cpu_mode == CPU_MODE_64BIT) + vie->base_register = VM_REG_GUEST_RIP; + else + vie->base_register = VM_REG_LAST; + } + break; + } + +done: + vie_advance(vie); + + return (0); +} + +static int +decode_sib(struct vie *vie) +{ + uint8_t x; + + /* Proceed only if SIB byte is present */ + if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) + return (0); + + if (vie_peek(vie, &x)) + return (-1); + + /* De-construct the SIB byte */ + vie->ss = (x >> 6) & 0x3; + vie->index = (x >> 3) & 0x7; + vie->base = (x >> 0) & 0x7; + + /* Apply the REX prefix modifiers */ + vie->index |= vie->rex_x << 3; + vie->base |= vie->rex_b << 3; + + switch (vie->mod) { + case VIE_MOD_INDIRECT_DISP8: + vie->disp_bytes = 1; + break; + case VIE_MOD_INDIRECT_DISP32: + vie->disp_bytes = 4; + break; + } + + if (vie->mod == VIE_MOD_INDIRECT && + (vie->base == 5 || vie->base == 13)) { + /* + * Special case when base register is unused if mod = 0 + * and base = %rbp or %r13. + * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + vie->disp_bytes = 4; + } else { + vie->base_register = gpr_map[vie->base]; + } + + /* + * All encodings of 'index' are valid except for %rsp (4). 
+ * + * Documented in: + * Table 2-3: 32-bit Addressing Forms with the SIB Byte + * Table 2-5: Special Cases of REX Encodings + */ + if (vie->index != 4) + vie->index_register = gpr_map[vie->index]; + + /* 'scale' makes sense only in the context of an index register */ + if (vie->index_register < VM_REG_LAST) + vie->scale = 1 << vie->ss; + + vie_advance(vie); + + return (0); +} + +static int +decode_displacement(struct vie *vie) +{ + int n, i; + uint8_t x; + + union { + char buf[4]; + int8_t signed8; + int32_t signed32; + } u; + + if ((n = vie->disp_bytes) == 0) + return (0); + + if (n != 1 && n != 4) + panic("decode_displacement: invalid disp_bytes %d", n); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + if (n == 1) + vie->displacement = u.signed8; /* sign-extended */ + else + vie->displacement = u.signed32; /* sign-extended */ + + return (0); +} + +static int +decode_immediate(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[4]; + int8_t signed8; + int16_t signed16; + int32_t signed32; + } u; + + /* Figure out immediate operand size (if any) */ + if (vie->op.op_flags & VIE_OP_F_IMM) { + /* + * Section 2.2.1.5 "Immediates", Intel SDM: + * In 64-bit mode the typical size of immediate operands + * remains 32-bits. When the operand size if 64-bits, the + * processor sign-extends all immediates to 64-bits prior + * to their use. 
+ */ + if (vie->opsize == 4 || vie->opsize == 8) + vie->imm_bytes = 4; + else + vie->imm_bytes = 2; + } else if (vie->op.op_flags & VIE_OP_F_IMM8) { + vie->imm_bytes = 1; + } + + if ((n = vie->imm_bytes) == 0) + return (0); + + KASSERT(n == 1 || n == 2 || n == 4, + ("%s: invalid number of immediate bytes: %d", __func__, n)); + + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + + /* sign-extend the immediate value before use */ + if (n == 1) + vie->immediate = u.signed8; + else if (n == 2) + vie->immediate = u.signed16; + else + vie->immediate = u.signed32; + + return (0); +} + +static int +decode_moffset(struct vie *vie) +{ + int i, n; + uint8_t x; + union { + char buf[8]; + uint64_t u64; + } u; + + if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0) + return (0); + + /* + * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM: + * The memory offset size follows the address-size of the instruction. + */ + n = vie->addrsize; + KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n)); + + u.u64 = 0; + for (i = 0; i < n; i++) { + if (vie_peek(vie, &x)) + return (-1); + + u.buf[i] = x; + vie_advance(vie); + } + vie->displacement = u.u64; + return (0); +} + +/* + * Verify that at least one instruction byte was consumed (this does not check that every fetched byte was decoded). + */ +static int +verify_inst_length(struct vie *vie) +{ + + if (vie->num_processed) + return (0); + else + return (-1); +} + +/* + * Verify that the 'guest linear address' provided as collateral of the nested + * page table fault matches with our instruction decoding. 
+ */ +static int +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +{ + int error; + uint64_t base, idx, gla2; + + /* Skip 'gla' verification */ + if (gla == VIE_INVALID_GLA) + return (0); + + base = 0; + if (vie->base_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->base_register, &base); + if (error) { + printf("verify_gla: error %d getting base reg %d\n", + error, vie->base_register); + return (-1); + } + + /* + * RIP-relative addressing starts from the following + * instruction + */ + if (vie->base_register == VM_REG_GUEST_RIP) + base += vie->num_valid; + } + + idx = 0; + if (vie->index_register != VM_REG_LAST) { + error = vm_get_register(vm, cpuid, vie->index_register, &idx); + if (error) { + printf("verify_gla: error %d getting index reg %d\n", + error, vie->index_register); + return (-1); + } + } + + /* XXX assuming that the base address of the segment is 0 */ + gla2 = base + vie->scale * idx + vie->displacement; + gla2 &= size2mask[vie->addrsize]; + if (gla != gla2) { + printf("verify_gla mismatch: " + "base(0x%0lx), scale(%d), index(0x%0lx), " + "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", + base, vie->scale, idx, vie->displacement, gla, gla2); + return (-1); + } + + return (0); +} + +int +vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie) +{ + + if (decode_prefixes(vie, cpu_mode, cs_d)) + return (-1); + + if (decode_opcode(vie)) + return (-1); + + if (decode_modrm(vie, cpu_mode)) + return (-1); + + if (decode_sib(vie)) + return (-1); + + if (decode_displacement(vie)) + return (-1); + + if (decode_immediate(vie)) + return (-1); + + if (decode_moffset(vie)) + return (-1); + + if (verify_inst_length(vie)) + return (-1); + + if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { + if (verify_gla(vm, cpuid, gla, vie)) + return (-1); + } + + vie->decoded = 1; /* success */ + + return (0); +} +#endif /* _KERNEL */ diff --git a/vmm/vmm_ioport.c 
b/vmm/vmm_ioport.c new file mode 100644 index 0000000..63044e8 --- /dev/null +++ b/vmm/vmm_ioport.c @@ -0,0 +1,176 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include + +#include "vatpic.h" +#include "vatpit.h" +#include "vpmtmr.h" +#include "vrtc.h" +#include "vmm_ioport.h" +#include "vmm_ktr.h" + +#define MAX_IOPORTS 1280 + +ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { + [TIMER_MODE] = vatpit_handler, + [TIMER_CNTR0] = vatpit_handler, + [TIMER_CNTR1] = vatpit_handler, + [TIMER_CNTR2] = vatpit_handler, + [NMISC_PORT] = vatpit_nmisc_handler, + [IO_ICU1] = vatpic_master_handler, + [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler, + [IO_ICU2] = vatpic_slave_handler, + [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, + [IO_ELCR1] = vatpic_elc_handler, + [IO_ELCR2] = vatpic_elc_handler, + [IO_PMTMR] = vpmtmr_handler, + [IO_RTC] = vrtc_addr_handler, + [IO_RTC + 1] = vrtc_data_handler, +}; + +#ifdef KTR +static const char * +inout_instruction(struct vm_exit *vmexit) +{ + int index; + + static const char *iodesc[] = { + "outb", "outw", "outl", + "inb", "inw", "inl", + "outsb", "outsw", "outsd", + "insb", "insw", "insd", + }; + + switch (vmexit->u.inout.bytes) { + case 1: + index = 0; + break; + case 2: + index = 1; + break; + default: + index = 2; + break; + } + + if (vmexit->u.inout.in) + index += 3; + + if (vmexit->u.inout.string) + index += 6; + + KASSERT(index < nitems(iodesc), ("%s: invalid index %d", + __func__, index)); + + return (iodesc[index]); +} +#endif /* KTR */ + +static int +emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, + bool *retu) +{ + ioport_handler_func_t handler; + uint32_t mask, val; + int error; + + /* + * If there is no handler for the I/O port then punt to userspace. 
+ */ + if (vmexit->u.inout.port >= MAX_IOPORTS || + (handler = ioport_handler[vmexit->u.inout.port]) == NULL) { + *retu = true; + return (0); + } + + mask = vie_size2mask(vmexit->u.inout.bytes); + + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + + error = (*handler)(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error) { + /* + * The value returned by this function is also the return value + * of vm_run(). This needs to be a positive number otherwise it + * can be interpreted as a "pseudo-error" like ERESTART. + * + * Enforce this by mapping all errors to EIO. + */ + return (EIO); + } + + if (vmexit->u.inout.in) { + vmexit->u.inout.eax &= ~mask; + vmexit->u.inout.eax |= val & mask; + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, + vmexit->u.inout.eax); + KASSERT(error == 0, ("emulate_ioport: error %d setting guest " + "rax register", error)); + } + *retu = false; + return (0); +} + +static int +emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + *retu = true; + return (0); /* Return to userspace to finish emulation */ +} + +int +vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + int bytes, error; + + bytes = vmexit->u.inout.bytes; + KASSERT(bytes == 1 || bytes == 2 || bytes == 4, + ("vm_handle_inout: invalid operand size %d", bytes)); + + if (vmexit->u.inout.string) + error = emulate_inout_str(vm, vcpuid, vmexit, retu); + else + error = emulate_inout_port(vm, vcpuid, vmexit, retu); + + VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s", + vmexit->u.inout.rep ? "rep " : "", + inout_instruction(vmexit), + vmexit->u.inout.port, + error ? "error" : (*retu ? "userspace" : "handled")); + + return (error); +} diff --git a/vmm/vmm_ioport.h b/vmm/vmm_ioport.h new file mode 100644 index 0000000..ba51989 --- /dev/null +++ b/vmm/vmm_ioport.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2014 Tycho Nightingale + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_IOPORT_H_ +#define _VMM_IOPORT_H_ + +typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid, + bool in, int port, int bytes, uint32_t *val); + +int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); + +#endif /* _VMM_IOPORT_H_ */ diff --git a/vmm/vmm_ktr.h b/vmm/vmm_ktr.h new file mode 100644 index 0000000..61ff53f --- /dev/null +++ b/vmm/vmm_ktr.h @@ -0,0 +1,69 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_KTR_H_ +#define _VMM_KTR_H_ + +#include +#include + +#ifndef KTR_VMM +#define KTR_VMM KTR_GEN +#endif + +#define VCPU_CTR0(vm, vcpuid, format) \ +CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid)) + +#define VCPU_CTR1(vm, vcpuid, format, p1) \ +CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1)) + +#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \ +CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2)) + +#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \ +CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3)) + +#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \ +CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \ + (p1), (p2), (p3), (p4)) + +#define VM_CTR0(vm, format) \ +CTR1(KTR_VMM, "vm %s: " format, vm_name((vm))) + +#define VM_CTR1(vm, format, p1) \ +CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1)) + +#define VM_CTR2(vm, format, p1, p2) \ +CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2)) + +#define VM_CTR3(vm, format, p1, p2, p3) \ +CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3)) + +#define VM_CTR4(vm, format, p1, p2, p3, p4) \ +CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4)) +#endif diff --git a/vmm/vmm_lapic.c b/vmm/vmm_lapic.c new file mode 100644 index 0000000..6bccd32 --- /dev/null +++ b/vmm/vmm_lapic.c @@ -0,0 +1,247 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include + +#include +#include "vmm_ktr.h" +#include "vmm_lapic.h" +#include "vlapic.h" + +/* + * Some MSI message definitions + */ +#define MSI_X86_ADDR_MASK 0xfff00000 +#define MSI_X86_ADDR_BASE 0xfee00000 +#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */ +#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */ + +int +lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) +{ + struct vlapic *vlapic; + + if (cpu < 0 || cpu >= VM_MAXCPU) + return (EINVAL); + + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. 
+ */ + if (vector < 16 || vector > 255) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + if (vlapic_set_intr_ready(vlapic, vector, level)) + vcpu_notify_event(vm, cpu, true); + return (0); +} + +int +lapic_set_local_intr(struct vm *vm, int cpu, int vector) +{ + struct vlapic *vlapic; + cpuset_t dmask; + int error; + + if (cpu < -1 || cpu >= VM_MAXCPU) + return (EINVAL); + + if (cpu == -1) + dmask = vm_active_cpus(vm); + else + CPU_SETOF(cpu, &dmask); + error = 0; + while ((cpu = CPU_FFS(&dmask)) != 0) { + cpu--; + CPU_CLR(cpu, &dmask); + vlapic = vm_lapic(vm, cpu); + error = vlapic_trigger_lvt(vlapic, vector); + if (error) + break; + } + + return (error); +} + +int +lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg) +{ + int delmode, vec; + uint32_t dest; + bool phys; + + VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg); + + if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) { + VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr); + return (-1); + } + + /* + * Extract the x86-specific fields from the MSI addr/msg + * params according to the Intel Arch spec, Vol3 Ch 10. + * + * The PCI specification does not support level triggered + * MSI/MSI-X so ignore trigger level in 'msg'. + * + * The 'dest' is interpreted as a logical APIC ID if both + * the Redirection Hint and Destination Mode are '1' and + * physical otherwise. + */ + dest = (addr >> 12) & 0xff; + phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) != + (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)); + delmode = msg & APIC_DELMODE_MASK; + vec = msg & 0xff; + + VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d", + phys ? 
"physical" : "logical", dest, vec); + + vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec); + return (0); +} + +static boolean_t +x2apic_msr(u_int msr) +{ + if (msr >= 0x800 && msr <= 0xBFF) + return (TRUE); + else + return (FALSE); +} + +static u_int +x2apic_msr_to_regoff(u_int msr) +{ + + return ((msr - 0x800) << 4); +} + +boolean_t +lapic_msr(u_int msr) +{ + + if (x2apic_msr(msr) || (msr == MSR_APICBASE)) + return (TRUE); + else + return (FALSE); +} + +int +lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + *rval = vlapic_get_apicbase(vlapic); + error = 0; + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_read(vlapic, 0, offset, rval, retu); + } + + return (error); +} + +int +lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu) +{ + int error; + u_int offset; + struct vlapic *vlapic; + + vlapic = vm_lapic(vm, cpu); + + if (msr == MSR_APICBASE) { + error = vlapic_set_apicbase(vlapic, val); + } else { + offset = x2apic_msr_to_regoff(msr); + error = vlapic_write(vlapic, 0, offset, val, retu); + } + + return (error); +} + +int +lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses must be 4 bytes wide and + * aligned on a 16-byte boundary. + */ + if (size != 4 || off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_write(vlapic, 1, off, wval, arg); + return (error); +} + +int +lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size, + void *arg) +{ + int error; + uint64_t off; + struct vlapic *vlapic; + + off = gpa - DEFAULT_APIC_BASE; + + /* + * Memory mapped local apic accesses should be aligned on a + * 16-byte boundary. 
They are also suggested to be 4 bytes + * wide, alas not all OSes follow suggestions. + */ + off &= ~3; + if (off & 0xf) + return (EINVAL); + + vlapic = vm_lapic(vm, cpu); + error = vlapic_read(vlapic, 1, off, rval, arg); + return (error); +} diff --git a/vmm/vmm_lapic.h b/vmm/vmm_lapic.h new file mode 100644 index 0000000..88fa948 --- /dev/null +++ b/vmm/vmm_lapic.h @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_LAPIC_H_ +#define _VMM_LAPIC_H_ + +struct vm; + +boolean_t lapic_msr(u_int num); +int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, + bool *retu); +int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval, + bool *retu); + +int lapic_mmio_read(void *vm, int cpu, uint64_t gpa, + uint64_t *rval, int size, void *arg); +int lapic_mmio_write(void *vm, int cpu, uint64_t gpa, + uint64_t wval, int size, void *arg); + +/* + * Signals to the LAPIC that an interrupt at 'vector' needs to be generated + * to the 'cpu', the state is recorded in IRR. + */ +int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig); + +#define LAPIC_TRIG_LEVEL true +#define LAPIC_TRIG_EDGE false +static __inline int +lapic_intr_level(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL)); +} + +static __inline int +lapic_intr_edge(struct vm *vm, int cpu, int vector) +{ + + return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE)); +} + +/* + * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can + * be set to -1 to trigger the interrupt on all CPUs. + */ +int lapic_set_local_intr(struct vm *vm, int cpu, int vector); + +int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg); + +#endif diff --git a/vmm/vmm_mem.c b/vmm/vmm_mem.c new file mode 100644 index 0000000..1019f2b --- /dev/null +++ b/vmm/vmm_mem.c @@ -0,0 +1,154 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + + return (0); +} + +vm_object_t +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + int error; + vm_object_t obj; + struct sglist *sg; + + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); + + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj != NULL) { + /* + * VT-x ignores the MTRR settings when figuring out the + * memory type for translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by + * this object to be mapped as uncacheable. 
+ */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) { + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", + error); + } + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } + } + + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. + * + * If the object could not be allocated then we end up freeing the + * sglist. + */ + sglist_free(sg); + + return (obj); +} + +void +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_object_t +vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + int error; + vm_object_t obj; + + if (gpa & PAGE_MASK) + panic("vmm_mem_alloc: invalid gpa %#lx", gpa); + + if (len == 0 || (len & PAGE_MASK) != 0) + panic("vmm_mem_alloc: invalid allocation size %lu", len); + + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj != NULL) { + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } + } + + return (obj); +} + +void +vmm_mem_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} diff --git a/vmm/vmm_mem.h b/vmm/vmm_mem.h new file mode 100644 index 0000000..a375070 --- /dev/null +++ b/vmm/vmm_mem.h @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_MEM_H_ +#define _VMM_MEM_H_ + +struct vmspace; +struct vm_object; + +int vmm_mem_init(void); +struct vm_object *vmm_mem_alloc(struct vmspace *, vm_paddr_t gpa, size_t size); +struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mem_free(struct vmspace *, vm_paddr_t gpa, size_t size); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); +vm_paddr_t vmm_mem_maxaddr(void); + +#endif diff --git a/vmm/vmm_stat.c b/vmm/vmm_stat.c new file mode 100644 index 0000000..4ae5fb9 --- /dev/null +++ b/vmm/vmm_stat.c @@ -0,0 +1,169 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include "vmm_util.h" +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). 
+ */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel()) + return; + + if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_amd()) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accomodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) + return (EINVAL); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vst_num_elems; i++) + buf[i] = stats[i]; + *num_stats = vst_num_elems; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global 
statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git a/vmm/vmm_stat.h b/vmm/vmm_stat.h new file mode 100644 index 0000000..1640ba3 --- /dev/null +++ b/vmm/vmm_stat.h @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMM_STAT_H_ +#define _VMM_STAT_H_ + +struct vm; + +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ + +enum vmm_stat_scope { + VMM_STAT_SCOPE_ANY, + VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */ + VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ +}; + +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + +struct vmm_stat_type { + int index; /* position in the stats buffer */ + int nelems; /* standalone or array */ + const char *desc; /* description of statistic */ + vmm_stat_func_t func; + enum vmm_stat_scope scope; +}; + +void vmm_stat_register(void *arg); + +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ + struct vmm_stat_type type[1] = { \ + { -1, nelems, desc, func, scope } \ + }; \ + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) + +#define VMM_STAT_DECLARE(type) \ + extern struct vmm_stat_type type[1] + +#define VMM_STAT(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY) +#define VMM_STAT_INTEL(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL) +#define VMM_STAT_AMD(type, desc) \ + VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) + +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + +#define VMM_STAT_ARRAY(type, nelems, desc) \ + VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) + +void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); +void vmm_stat_free(void *vp); + +/* + * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries + */ +int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); +int vmm_stat_desc_copy(int index, char *buf, int buflen); + +static void __inline +vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t x) +{ +#ifdef 
VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] += x; +#endif +} + +static void __inline +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static void __inline +vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_incr(vm, vcpu, vst, 0, x); +#endif +} + +static void __inline +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + +VMM_STAT_DECLARE(VCPU_MIGRATIONS); +VMM_STAT_DECLARE(VMEXIT_COUNT); +VMM_STAT_DECLARE(VMEXIT_EXTINT); +VMM_STAT_DECLARE(VMEXIT_HLT); +VMM_STAT_DECLARE(VMEXIT_CR_ACCESS); +VMM_STAT_DECLARE(VMEXIT_RDMSR); +VMM_STAT_DECLARE(VMEXIT_WRMSR); +VMM_STAT_DECLARE(VMEXIT_MTRAP); +VMM_STAT_DECLARE(VMEXIT_PAUSE); +VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW); +VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW); +VMM_STAT_DECLARE(VMEXIT_INOUT); +VMM_STAT_DECLARE(VMEXIT_CPUID); +VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); +VMM_STAT_DECLARE(VMEXIT_INST_EMUL); +VMM_STAT_DECLARE(VMEXIT_UNKNOWN); +VMM_STAT_DECLARE(VMEXIT_ASTPENDING); +VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); +VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +#endif diff --git a/vmm/vmm_util.c b/vmm/vmm_util.c new file mode 100644 index 0000000..f245f92 --- /dev/null +++ b/vmm/vmm_util.c @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include "vmm_util.h" + +boolean_t +vmm_is_intel(void) +{ + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_is_amd(void) +{ + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + return (TRUE); + else + return (FALSE); +} + +boolean_t +vmm_supports_1G_pages(void) +{ + unsigned int regs[4]; + + /* + * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages + * + * Both Intel and AMD support this bit. 
+ */ + if (cpu_exthigh >= 0x80000001) { + do_cpuid(0x80000001, regs); + if (regs[3] & (1 << 26)) + return (TRUE); + } + return (FALSE); +} + +#include +#include +#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x)) +#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x)) +void +dump_trapframe(struct trapframe *tf) +{ + DUMP_REG(rdi); + DUMP_REG(rsi); + DUMP_REG(rdx); + DUMP_REG(rcx); + DUMP_REG(r8); + DUMP_REG(r9); + DUMP_REG(rax); + DUMP_REG(rbx); + DUMP_REG(rbp); + DUMP_REG(r10); + DUMP_REG(r11); + DUMP_REG(r12); + DUMP_REG(r13); + DUMP_REG(r14); + DUMP_REG(r15); + DUMP_REG(trapno); + DUMP_REG(addr); + DUMP_REG(flags); + DUMP_REG(err); + DUMP_REG(rip); + DUMP_REG(rflags); + DUMP_REG(rsp); + DUMP_SEG(cs); + DUMP_SEG(ss); + DUMP_SEG(fs); + DUMP_SEG(gs); + DUMP_SEG(es); + DUMP_SEG(ds); +} diff --git a/vmm/vmm_util.h b/vmm/vmm_util.h new file mode 100644 index 0000000..7f82332 --- /dev/null +++ b/vmm/vmm_util.h @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_UTIL_H_ +#define _VMM_UTIL_H_ + +struct trapframe; + +boolean_t vmm_is_intel(void); +boolean_t vmm_is_amd(void); +boolean_t vmm_supports_1G_pages(void); + +void dump_trapframe(struct trapframe *tf); + +#endif diff --git a/vmm/x86.c b/vmm/x86.c new file mode 100644 index 0000000..525e1d9 --- /dev/null +++ b/vmm/x86.c @@ -0,0 +1,521 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/pcpu.h> +#include <sys/systm.h> +#include <sys/sysctl.h> + +#include <machine/clock.h> +#include <machine/cpufunc.h> +#include <machine/md_var.h> +#include <machine/segments.h> +#include <machine/specialreg.h> + +#include <machine/vmm.h> + +#include "vmm_host.h" +#include "vmm_ktr.h" +#include "vmm_util.h" +#include "x86.h" + +SYSCTL_DECL(_hw_vmm); +static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL); + +#define CPUID_VM_HIGH 0x40000000 + +static const char bhyve_id[12] = "bhyve bhyve "; + +static uint64_t bhyve_xcpuids; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, + "Number of times an unknown cpuid leaf was accessed"); + +/* + * The default CPU topology is a single thread per package. + */ +static u_int threads_per_core = 1; +SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN, + &threads_per_core, 0, NULL); + +static u_int cores_per_package = 1; +SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN, + &cores_per_package, 0, NULL); + +static int cpuid_leaf_b = 1; +SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, + &cpuid_leaf_b, 0, NULL); + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero.
+ */ +static __inline int +log2(u_int x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} + +int +x86_emulate_cpuid(struct vm *vm, int vcpu_id, + uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) +{ + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, level, width, x2apic_id; + unsigned int func, regs[4], logical_cpus; + enum x2apic_state x2apic_state; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); + + /* + * Requests for invalid CPUID levels should map to the highest + * available level instead. + */ + if (cpu_exthigh != 0 && *eax >= 0x80000000) { + if (*eax > cpu_exthigh) + *eax = cpu_exthigh; + } else if (*eax >= 0x40000000) { + if (*eax > CPUID_VM_HIGH) + *eax = CPUID_VM_HIGH; + } else if (*eax > cpu_high) { + *eax = cpu_high; + } + + func = *eax; + + /* + * In general the approach used for CPU topology is to + * advertise a flat topology where all CPUs are packages with + * no multi-core or SMT. + */ + switch (func) { + /* + * Pass these through to the guest + */ + case CPUID_0000_0000: + case CPUID_0000_0002: + case CPUID_0000_0003: + case CPUID_8000_0000: + case CPUID_8000_0002: + case CPUID_8000_0003: + case CPUID_8000_0004: + case CPUID_8000_0006: + cpuid_count(*eax, *ecx, regs); + break; + case CPUID_8000_0008: + cpuid_count(*eax, *ecx, regs); + if (vmm_is_amd()) { + /* + * XXX this might appear silly because AMD + * cpus don't have threads. + * + * However this matches the logical cpus as + * advertised by leaf 0x1 and will work even + * if the 'threads_per_core' tunable is set + * incorrectly on an AMD host. + */ + logical_cpus = threads_per_core * + cores_per_package; + regs[2] = logical_cpus - 1; + } + break; + + case CPUID_8000_0001: + cpuid_count(*eax, *ecx, regs); + + /* + * Hide SVM and Topology Extension features from guest. + */ + regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY); + + /* + * Don't advertise extended performance counter MSRs + * to the guest. 
+ */ + regs[2] &= ~AMDID2_PCXC; + regs[2] &= ~AMDID2_PNXC; + regs[2] &= ~AMDID2_PTSCEL2I; + + /* + * Don't advertise Instruction Based Sampling feature. + */ + regs[2] &= ~AMDID2_IBS; + + /* NodeID MSR not available */ + regs[2] &= ~AMDID2_NODE_ID; + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~AMDID2_OSVW; + + /* + * Hide rdtscp/ia32_tsc_aux until we know how + * to deal with them. + */ + regs[3] &= ~AMDID_RDTSCP; + break; + + case CPUID_8000_0007: + /* + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. + * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* + * "Invariant TSC" can be advertised to the guest if: + * - host TSC frequency is invariant + * - host TSCs are synchronized across physical cpus + * + * XXX This still falls short because the vcpu + * can observe the TSC moving backwards as it + * migrates across physical cpus. But at least + * it should discourage the guest from using the + * TSC to keep track of time. + */ + if (tsc_is_invariant && smp_tsc) + regs[3] |= AMDPM_TSC_INVARIANT; + break; + + case CPUID_0000_0001: + do_cpuid(1, regs); + + error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); + if (error) { + panic("x86_emulate_cpuid: error %d " + "fetching x2apic state", error); + } + + /* + * Override the APIC ID only in ebx + */ + regs[1] &= ~(CPUID_LOCAL_APIC_ID); + regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); + + /* + * Don't expose VMX, SpeedStep, TM2 or SMX capability. + * Advertise x2APIC capability and Hypervisor guest.
+ */ + regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); + + regs[2] |= CPUID2_HV; + + if (x2apic_state != X2APIC_DISABLED) + regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; + + /* + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. + */ + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vm, vcpu_id, + VM_REG_GUEST_CR4, &cr4); + if (error) + panic("x86_emulate_cpuid: error %d " + "fetching %%cr4", error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } + + /* + * Hide monitor/mwait until we know how to deal with + * these instructions. + */ + regs[2] &= ~CPUID2_MON; + + /* + * Hide the performance and debug features. + */ + regs[2] &= ~CPUID2_PDCM; + + /* + * No TSC deadline support in the APIC yet + */ + regs[2] &= ~CPUID2_TSCDLT; + + /* + * Hide thermal monitoring + */ + regs[3] &= ~(CPUID_ACPI | CPUID_TM); + + /* + * Hide the debug store capability. + */ + regs[3] &= ~CPUID_DS; + + /* + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. + */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + logical_cpus = threads_per_core * cores_per_package; + regs[1] &= ~CPUID_HTT_CORES; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; + break; + + case CPUID_0000_0004: + cpuid_count(*eax, *ecx, regs); + + if (regs[0] || regs[1] || regs[2] || regs[3]) { + regs[0] &= 0x3ff; + regs[0] |= (cores_per_package - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. 
+ */ + logical_cpus = threads_per_core; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores_per_package; + regs[0] |= (logical_cpus - 1) << 14; + } + break; + + case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (*ecx == 0) { + cpuid_count(*eax, *ecx, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD); + regs[2] = 0; + regs[3] = 0; + + /* Advertise INVPCID if it is enabled. */ + error = vm_get_capability(vm, vcpu_id, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000A: + /* + * Handle the access, but report 0 for + * all options + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_0000_000B: + /* + * Processor topology enumeration + */ + if (*ecx == 0) { + logical_cpus = threads_per_core; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (*ecx == 1) { + logical_cpus = threads_per_core * + cores_per_package; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || *ecx >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (*ecx & 0xff); + regs[3] = x2apic_id; + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + 
cpuid_count(*eax, *ecx, regs); + switch (*ecx) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. + */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << *ecx))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } + break; + + case 0x40000000: + regs[0] = CPUID_VM_HIGH; + bcopy(bhyve_id, &regs[1], 4); + bcopy(bhyve_id + 4, &regs[2], 4); + bcopy(bhyve_id + 8, &regs[3], 4); + break; + + default: + /* + * The leaf value has already been clamped so + * simply pass this through, keeping count of + * how many unhandled leaf values have been seen. + */ + atomic_add_long(&bhyve_xcpuids, 1); + cpuid_count(*eax, *ecx, regs); + break; + } + + *eax = regs[0]; + *ebx = regs[1]; + *ecx = regs[2]; + *edx = regs[3]; + + return (1); +} + +bool +vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) +{ + bool rv; + + KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", + __func__, cap)); + + /* + * Simply passthrough the capabilities of the host cpu for now.
+ */ + rv = false; + switch (cap) { + case VCC_NO_EXECUTE: + if (amd_feature & AMDID_NX) + rv = true; + break; + case VCC_FFXSR: + if (amd_feature & AMDID_FFXSR) + rv = true; + break; + case VCC_TCE: + if (amd_feature2 & AMDID2_TCE) + rv = true; + break; + default: + panic("%s: unknown vm_cpu_capability %d", __func__, cap); + } + return (rv); +} diff --git a/vmm/x86.h b/vmm/x86.h new file mode 100644 index 0000000..6f99d52 --- /dev/null +++ b/vmm/x86.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _X86_H_ +#define _X86_H_ + +#define CPUID_0000_0000 (0x0) +#define CPUID_0000_0001 (0x1) +#define CPUID_0000_0002 (0x2) +#define CPUID_0000_0003 (0x3) +#define CPUID_0000_0004 (0x4) +#define CPUID_0000_0006 (0x6) +#define CPUID_0000_0007 (0x7) +#define CPUID_0000_000A (0xA) +#define CPUID_0000_000B (0xB) +#define CPUID_0000_000D (0xD) +#define CPUID_8000_0000 (0x80000000) +#define CPUID_8000_0001 (0x80000001) +#define CPUID_8000_0002 (0x80000002) +#define CPUID_8000_0003 (0x80000003) +#define CPUID_8000_0004 (0x80000004) +#define CPUID_8000_0006 (0x80000006) +#define CPUID_8000_0007 (0x80000007) +#define CPUID_8000_0008 (0x80000008) + +/* + * CPUID instruction Fn0000_0001: + */ +#define CPUID_0000_0001_APICID_MASK (0xff<<24) +#define CPUID_0000_0001_APICID_SHIFT 24 + +/* + * CPUID instruction Fn0000_0001 ECX + */ +#define CPUID_0000_0001_FEAT0_VMX (1<<5) + +int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); + +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. + */ +bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability); +#endif diff --git a/vmm_dev.h b/vmm_dev.h new file mode 100644 index 0000000..9d031a9 --- /dev/null +++ b/vmm_dev.h @@ -0,0 +1,365 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_DEV_H_ +#define _VMM_DEV_H_ + +#ifdef _KERNEL +void vmmdev_init(void); +int vmmdev_cleanup(void); +#endif + +struct vm_memory_segment { + vm_paddr_t gpa; /* in */ + size_t len; + int wired; +}; + +struct vm_register { + int cpuid; + int regnum; /* enum vm_reg_name */ + uint64_t regval; +}; + +struct vm_seg_desc { /* data or code segment */ + int cpuid; + int regnum; /* enum vm_reg_name */ + struct seg_desc desc; +}; + +struct vm_run { + int cpuid; + struct vm_exit vm_exit; +}; + +struct vm_exception { + int cpuid; + int vector; + uint32_t error_code; + int error_code_valid; + int restart_instruction; +}; + +struct vm_lapic_msi { + uint64_t msg; + uint64_t addr; +}; + +struct vm_lapic_irq { + int cpuid; + int vector; +}; + +struct vm_ioapic_irq { + int irq; +}; + +struct vm_isa_irq { + int atpic_irq; + int ioapic_irq; +}; + +struct vm_isa_irq_trigger { + int atpic_irq; + enum vm_intr_trigger trigger; +}; + +struct vm_capability { + int cpuid; + enum vm_cap_type 
captype; + int capval; + int allcpus; +}; + +struct vm_pptdev { + int bus; + int slot; + int func; +}; + +struct vm_pptdev_mmio { + int bus; + int slot; + int func; + vm_paddr_t gpa; + vm_paddr_t hpa; + size_t len; +}; + +struct vm_pptdev_msi { + int vcpu; + int bus; + int slot; + int func; + int numvec; /* 0 means disabled */ + uint64_t msg; + uint64_t addr; +}; + +struct vm_pptdev_msix { + int vcpu; + int bus; + int slot; + int func; + int idx; + uint64_t msg; + uint32_t vector_control; + uint64_t addr; +}; + +struct vm_nmi { + int cpuid; +}; + +#define MAX_VM_STATS 64 +struct vm_stats { + int cpuid; /* in */ + int num_entries; /* out */ + struct timeval tv; + uint64_t statbuf[MAX_VM_STATS]; +}; + +struct vm_stat_desc { + int index; /* in */ + char desc[128]; /* out */ +}; + +struct vm_x2apic { + int cpuid; + enum x2apic_state state; +}; + +struct vm_gpa_pte { + uint64_t gpa; /* in */ + uint64_t pte[4]; /* out */ + int ptenum; +}; + +struct vm_hpet_cap { + uint32_t capabilities; /* lower 32 bits of HPET capabilities */ +}; + +struct vm_suspend { + enum vm_suspend_how how; +}; + +struct vm_gla2gpa { + int vcpuid; /* inputs */ + int prot; /* PROT_READ or PROT_WRITE */ + uint64_t gla; + struct vm_guest_paging paging; + int fault; /* outputs */ + uint64_t gpa; +}; + +struct vm_activate_cpu { + int vcpuid; +}; + +struct vm_cpuset { + int which; + int cpusetsize; + cpuset_t *cpus; +}; +#define VM_ACTIVE_CPUS 0 +#define VM_SUSPENDED_CPUS 1 + +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + +enum { + /* general routines */ + IOCNUM_ABIVERS = 0, + IOCNUM_RUN = 1, + IOCNUM_SET_CAPABILITY = 2, + IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, + + /* memory apis */ + IOCNUM_MAP_MEMORY = 10, + IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_GET_GPA_PMAP = 12, + IOCNUM_GLA2GPA = 13, + + /* register/state accessors */ + IOCNUM_SET_REGISTER 
= 20, + IOCNUM_GET_REGISTER = 21, + IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, + IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + + /* interrupt injection */ + IOCNUM_GET_INTINFO = 28, + IOCNUM_SET_INTINFO = 29, + IOCNUM_INJECT_EXCEPTION = 30, + IOCNUM_LAPIC_IRQ = 31, + IOCNUM_INJECT_NMI = 32, + IOCNUM_IOAPIC_ASSERT_IRQ = 33, + IOCNUM_IOAPIC_DEASSERT_IRQ = 34, + IOCNUM_IOAPIC_PULSE_IRQ = 35, + IOCNUM_LAPIC_MSI = 36, + IOCNUM_LAPIC_LOCAL_IRQ = 37, + IOCNUM_IOAPIC_PINCOUNT = 38, + IOCNUM_RESTART_INSTRUCTION = 39, + + /* PCI pass-thru */ + IOCNUM_BIND_PPTDEV = 40, + IOCNUM_UNBIND_PPTDEV = 41, + IOCNUM_MAP_PPTDEV_MMIO = 42, + IOCNUM_PPTDEV_MSI = 43, + IOCNUM_PPTDEV_MSIX = 44, + + /* statistics */ + IOCNUM_VM_STATS = 50, + IOCNUM_VM_STAT_DESC = 51, + + /* kernel device state */ + IOCNUM_SET_X2APIC_STATE = 60, + IOCNUM_GET_X2APIC_STATE = 61, + IOCNUM_GET_HPET_CAPABILITIES = 62, + + /* legacy interrupt injection */ + IOCNUM_ISA_ASSERT_IRQ = 80, + IOCNUM_ISA_DEASSERT_IRQ = 81, + IOCNUM_ISA_PULSE_IRQ = 82, + IOCNUM_ISA_SET_IRQ_TRIGGER = 83, + + /* vm_cpuset */ + IOCNUM_ACTIVATE_CPU = 90, + IOCNUM_GET_CPUSET = 91, + + /* RTC */ + IOCNUM_RTC_READ = 100, + IOCNUM_RTC_WRITE = 101, + IOCNUM_RTC_SETTIME = 102, + IOCNUM_RTC_GETTIME = 103, +}; + +#define VM_RUN \ + _IOWR('v', IOCNUM_RUN, struct vm_run) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_MAP_MEMORY \ + _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) +#define VM_GET_MEMORY_SEG \ + _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SET_REGISTER \ + _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) +#define VM_GET_REGISTER \ + _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register) +#define VM_SET_SEGMENT_DESCRIPTOR \ + _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_GET_SEGMENT_DESCRIPTOR \ + _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_INJECT_EXCEPTION \ + _IOW('v', 
IOCNUM_INJECT_EXCEPTION, struct vm_exception) +#define VM_LAPIC_IRQ \ + _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_LOCAL_IRQ \ + _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq) +#define VM_LAPIC_MSI \ + _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi) +#define VM_IOAPIC_ASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_DEASSERT_IRQ \ + _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PULSE_IRQ \ + _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq) +#define VM_IOAPIC_PINCOUNT \ + _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int) +#define VM_ISA_ASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_DEASSERT_IRQ \ + _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq) +#define VM_ISA_PULSE_IRQ \ + _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq) +#define VM_ISA_SET_IRQ_TRIGGER \ + _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger) +#define VM_SET_CAPABILITY \ + _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability) +#define VM_GET_CAPABILITY \ + _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability) +#define VM_BIND_PPTDEV \ + _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev) +#define VM_UNBIND_PPTDEV \ + _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev) +#define VM_MAP_PPTDEV_MMIO \ + _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio) +#define VM_PPTDEV_MSI \ + _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi) +#define VM_PPTDEV_MSIX \ + _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) +#define VM_INJECT_NMI \ + _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) +#define VM_STATS \ + _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) +#define VM_STAT_DESC \ + _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) +#define VM_SET_X2APIC_STATE \ + _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_X2APIC_STATE \ + _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) +#define VM_GET_HPET_CAPABILITIES \ + _IOR('v', 
IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_GET_GPA_PMAP \ + _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) +#define VM_GLA2GPA \ + _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_ACTIVATE_CPU \ + _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) +#define VM_GET_CPUS \ + _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SET_INTINFO \ + _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define VM_GET_INTINFO \ + _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) +#define VM_RTC_WRITE \ + _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) +#define VM_RTC_READ \ + _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) +#define VM_RTC_SETTIME \ + _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) +#define VM_RTC_GETTIME \ + _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) +#define VM_RESTART_INSTRUCTION \ + _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) +#endif diff --git a/vmm_instruction_emul.h b/vmm_instruction_emul.h new file mode 100644 index 0000000..5e7127f --- /dev/null +++ b/vmm_instruction_emul.h @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMM_INSTRUCTION_EMUL_H_ +#define _VMM_INSTRUCTION_EMUL_H_ + +#include <sys/mman.h> + +/* + * Callback functions to read and write memory regions. + */ +typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int rsize, void *arg); + +typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, + uint64_t wval, int wsize, void *arg); + +/* + * Emulate the decoded 'vie' instruction. + * + * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region + * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the + * callback functions. + * + * 'void *vm' should be 'struct vm *' when called from kernel context and + * 'struct vmctx *' when called from user context. + * + */ +int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, + struct vm_guest_paging *paging, mem_region_read_t mrr, + mem_region_write_t mrw, void *mrarg); + +int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, + uint64_t val, int size); + +/* + * Returns 1 if an alignment check exception should be injected and 0 otherwise. + */ +int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, + uint64_t rflags, uint64_t gla); + +/* Returns 1 if the 'gla' is not canonical and 0 otherwise.
*/ +int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); + +uint64_t vie_size2mask(int size); + +int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, + uint64_t *gla); + +#ifdef _KERNEL +/* + * APIs to fetch and decode the instruction from nested page fault handler. + * + * 'vie' must be initialized before calling 'vmm_fetch_instruction()' + */ +int vmm_fetch_instruction(struct vm *vm, int cpuid, + struct vm_guest_paging *guest_paging, + uint64_t rip, int inst_length, struct vie *vie, + int *is_fault); + +/* + * Translate the guest linear address 'gla' to a guest physical address. + * + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred + */ +int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); + +/* + * Decode the instruction fetched into 'vie' so it can be emulated. + * + * 'gla' is the guest linear address provided by the hardware assist + * that caused the nested page table fault. It is used to verify that + * the software instruction decoding is in agreement with the hardware. + * + * Some hardware assists do not provide the 'gla' to the hypervisor. + * To skip the 'gla' verification for this or any other reason pass + * in VIE_INVALID_GLA instead. + */ +#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ +int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, + enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); +#endif /* _KERNEL */ + +#endif /* _VMM_INSTRUCTION_EMUL_H_ */