Base bhyve checkin

git https://github.com/freebsd/freebsd
132e393b177792999244c04f7b08753a98b0c4c8
commit 04180a5a81
Michael Steil, 2015-06-09 21:25:36 -07:00
126 changed files with 49630 additions and 0 deletions

bhyve/Makefile (new file, 50 lines)

@@ -0,0 +1,50 @@
#
# $FreeBSD$
#
PROG= bhyve
DEBUG_FLAGS= -g -O0
MAN= bhyve.8
SRCS= \
atkbdc.c \
acpi.c \
bhyverun.c \
block_if.c \
consport.c \
dbgport.c \
inout.c \
ioapic.c \
mem.c \
mevent.c \
mptbl.c \
pci_ahci.c \
pci_emul.c \
pci_hostbridge.c \
pci_irq.c \
pci_lpc.c \
pci_passthru.c \
pci_virtio_block.c \
pci_virtio_net.c \
pci_virtio_rnd.c \
pci_uart.c \
pm.c \
post.c \
rtc.c \
smbiostbl.c \
task_switch.c \
uart_emul.c \
virtio.c \
xmsr.c \
spinup_ap.c
.PATH: ${.CURDIR}/../../sys/amd64/vmm
SRCS+= vmm_instruction_emul.c
LIBADD= vmmapi md pthread
WARNS?= 2
.include <bsd.prog.mk>

bhyve/acpi.c (new file, 1012 lines)

File diff suppressed because it is too large.

bhyve/acpi.h (new file, 54 lines)

@@ -0,0 +1,54 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _ACPI_H_
#define _ACPI_H_
#define SCI_INT 9
#define SMI_CMD 0xb2
#define BHYVE_ACPI_ENABLE 0xa0
#define BHYVE_ACPI_DISABLE 0xa1
#define PM1A_EVT_ADDR 0x400
#define PM1A_CNT_ADDR 0x404
#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
struct vmctx;
int acpi_build(struct vmctx *ctx, int ncpu);
void dsdt_line(const char *fmt, ...);
void dsdt_fixed_ioport(uint16_t iobase, uint16_t length);
void dsdt_fixed_irq(uint8_t irq);
void dsdt_fixed_mem32(uint32_t base, uint32_t length);
void dsdt_indent(int levels);
void dsdt_unindent(int levels);
void sci_init(struct vmctx *ctx);
#endif /* _ACPI_H_ */

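The helpers declared in acpi.h are how device models append ASL to the generated DSDT. As a minimal sketch (not part of this commit; the device name TST0 and the resource choices are purely illustrative), a fixed-resource device entry could be emitted like this:

static void
example_dsdt_entry(void)
{
	dsdt_line("");
	dsdt_line("Device (TST0)");
	dsdt_line("{");
	dsdt_indent(1);
	dsdt_line("Name (_HID, EisaId (\"PNP0C02\"))");
	dsdt_line("Name (_CRS, ResourceTemplate ()");
	dsdt_line("{");
	dsdt_indent(1);
	dsdt_fixed_ioport(PM1A_EVT_ADDR, 8);	/* illustrative port range */
	dsdt_fixed_irq(SCI_INT);		/* the SCI, IRQ 9 */
	dsdt_unindent(1);
	dsdt_line("})");
	dsdt_unindent(1);
	dsdt_line("}");
}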
bhyve/ahci.h (new file, 322 lines)

@@ -0,0 +1,322 @@
/*-
* Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
* Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _AHCI_H_
#define _AHCI_H_
/* ATA register defines */
#define ATA_DATA 0 /* (RW) data */
#define ATA_FEATURE 1 /* (W) feature */
#define ATA_F_DMA 0x01 /* enable DMA */
#define ATA_F_OVL 0x02 /* enable overlap */
#define ATA_COUNT 2 /* (W) sector count */
#define ATA_SECTOR 3 /* (RW) sector # */
#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */
#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */
#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */
#define ATA_D_LBA 0x40 /* use LBA addressing */
#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
#define ATA_COMMAND 7 /* (W) command */
#define ATA_ERROR 8 /* (R) error */
#define ATA_E_ILI 0x01 /* illegal length */
#define ATA_E_NM 0x02 /* no media */
#define ATA_E_ABORT 0x04 /* command aborted */
#define ATA_E_MCR 0x08 /* media change request */
#define ATA_E_IDNF 0x10 /* ID not found */
#define ATA_E_MC 0x20 /* media changed */
#define ATA_E_UNC 0x40 /* uncorrectable data */
#define ATA_E_ICRC 0x80 /* UDMA crc error */
#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
#define ATA_IREASON 9 /* (R) interrupt reason */
#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
#define ATA_I_IN 0x02 /* read (1) | write (0) */
#define ATA_I_RELEASE 0x04 /* released bus (1) */
#define ATA_I_TAGMASK 0xf8 /* tag mask */
#define ATA_STATUS 10 /* (R) status */
#define ATA_ALTSTAT 11 /* (R) alternate status */
#define ATA_S_ERROR 0x01 /* error */
#define ATA_S_INDEX 0x02 /* index */
#define ATA_S_CORR 0x04 /* data corrected */
#define ATA_S_DRQ 0x08 /* data request */
#define ATA_S_DSC 0x10 /* drive seek completed */
#define ATA_S_SERVICE 0x10 /* drive needs service */
#define ATA_S_DWF 0x20 /* drive write fault */
#define ATA_S_DMA 0x20 /* DMA ready */
#define ATA_S_READY 0x40 /* drive ready */
#define ATA_S_BUSY 0x80 /* busy */
#define ATA_CONTROL 12 /* (W) control */
#define ATA_A_IDS 0x02 /* disable interrupts */
#define ATA_A_RESET 0x04 /* RESET controller */
#define ATA_A_4BIT 0x08 /* 4 head bits */
#define ATA_A_HOB 0x80 /* High Order Byte enable */
/* SATA register defines */
#define ATA_SSTATUS 13
#define ATA_SS_DET_MASK 0x0000000f
#define ATA_SS_DET_NO_DEVICE 0x00000000
#define ATA_SS_DET_DEV_PRESENT 0x00000001
#define ATA_SS_DET_PHY_ONLINE 0x00000003
#define ATA_SS_DET_PHY_OFFLINE 0x00000004
#define ATA_SS_SPD_MASK 0x000000f0
#define ATA_SS_SPD_NO_SPEED 0x00000000
#define ATA_SS_SPD_GEN1 0x00000010
#define ATA_SS_SPD_GEN2 0x00000020
#define ATA_SS_SPD_GEN3 0x00000030
#define ATA_SS_IPM_MASK 0x00000f00
#define ATA_SS_IPM_NO_DEVICE 0x00000000
#define ATA_SS_IPM_ACTIVE 0x00000100
#define ATA_SS_IPM_PARTIAL 0x00000200
#define ATA_SS_IPM_SLUMBER 0x00000600
#define ATA_SS_IPM_DEVSLEEP 0x00000800
#define ATA_SERROR 14
#define ATA_SE_DATA_CORRECTED 0x00000001
#define ATA_SE_COMM_CORRECTED 0x00000002
#define ATA_SE_DATA_ERR 0x00000100
#define ATA_SE_COMM_ERR 0x00000200
#define ATA_SE_PROT_ERR 0x00000400
#define ATA_SE_HOST_ERR 0x00000800
#define ATA_SE_PHY_CHANGED 0x00010000
#define ATA_SE_PHY_IERROR 0x00020000
#define ATA_SE_COMM_WAKE 0x00040000
#define ATA_SE_DECODE_ERR 0x00080000
#define ATA_SE_PARITY_ERR 0x00100000
#define ATA_SE_CRC_ERR 0x00200000
#define ATA_SE_HANDSHAKE_ERR 0x00400000
#define ATA_SE_LINKSEQ_ERR 0x00800000
#define ATA_SE_TRANSPORT_ERR 0x01000000
#define ATA_SE_UNKNOWN_FIS 0x02000000
#define ATA_SE_EXCHANGED 0x04000000
#define ATA_SCONTROL 15
#define ATA_SC_DET_MASK 0x0000000f
#define ATA_SC_DET_IDLE 0x00000000
#define ATA_SC_DET_RESET 0x00000001
#define ATA_SC_DET_DISABLE 0x00000004
#define ATA_SC_SPD_MASK 0x000000f0
#define ATA_SC_SPD_NO_SPEED 0x00000000
#define ATA_SC_SPD_SPEED_GEN1 0x00000010
#define ATA_SC_SPD_SPEED_GEN2 0x00000020
#define ATA_SC_SPD_SPEED_GEN3 0x00000030
#define ATA_SC_IPM_MASK 0x00000f00
#define ATA_SC_IPM_NONE 0x00000000
#define ATA_SC_IPM_DIS_PARTIAL 0x00000100
#define ATA_SC_IPM_DIS_SLUMBER 0x00000200
#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400
#define ATA_SACTIVE 16
#define AHCI_MAX_PORTS 32
#define AHCI_MAX_SLOTS 32
#define AHCI_MAX_IRQS 16
/* SATA AHCI v1.0 register defines */
#define AHCI_CAP 0x00
#define AHCI_CAP_NPMASK 0x0000001f
#define AHCI_CAP_SXS 0x00000020
#define AHCI_CAP_EMS 0x00000040
#define AHCI_CAP_CCCS 0x00000080
#define AHCI_CAP_NCS 0x00001F00
#define AHCI_CAP_NCS_SHIFT 8
#define AHCI_CAP_PSC 0x00002000
#define AHCI_CAP_SSC 0x00004000
#define AHCI_CAP_PMD 0x00008000
#define AHCI_CAP_FBSS 0x00010000
#define AHCI_CAP_SPM 0x00020000
#define AHCI_CAP_SAM 0x00080000
#define AHCI_CAP_ISS 0x00F00000
#define AHCI_CAP_ISS_SHIFT 20
#define AHCI_CAP_SCLO 0x01000000
#define AHCI_CAP_SAL 0x02000000
#define AHCI_CAP_SALP 0x04000000
#define AHCI_CAP_SSS 0x08000000
#define AHCI_CAP_SMPS 0x10000000
#define AHCI_CAP_SSNTF 0x20000000
#define AHCI_CAP_SNCQ 0x40000000
#define AHCI_CAP_64BIT 0x80000000
#define AHCI_GHC 0x04
#define AHCI_GHC_AE 0x80000000
#define AHCI_GHC_MRSM 0x00000004
#define AHCI_GHC_IE 0x00000002
#define AHCI_GHC_HR 0x00000001
#define AHCI_IS 0x08
#define AHCI_PI 0x0c
#define AHCI_VS 0x10
#define AHCI_CCCC 0x14
#define AHCI_CCCC_TV_MASK 0xffff0000
#define AHCI_CCCC_TV_SHIFT 16
#define AHCI_CCCC_CC_MASK 0x0000ff00
#define AHCI_CCCC_CC_SHIFT 8
#define AHCI_CCCC_INT_MASK 0x000000f8
#define AHCI_CCCC_INT_SHIFT 3
#define AHCI_CCCC_EN 0x00000001
#define AHCI_CCCP 0x18
#define AHCI_EM_LOC 0x1C
#define AHCI_EM_CTL 0x20
#define AHCI_EM_MR 0x00000001
#define AHCI_EM_TM 0x00000100
#define AHCI_EM_RST 0x00000200
#define AHCI_EM_LED 0x00010000
#define AHCI_EM_SAFTE 0x00020000
#define AHCI_EM_SES2 0x00040000
#define AHCI_EM_SGPIO 0x00080000
#define AHCI_EM_SMB 0x01000000
#define AHCI_EM_XMT 0x02000000
#define AHCI_EM_ALHD 0x04000000
#define AHCI_EM_PM 0x08000000
#define AHCI_CAP2 0x24
#define AHCI_CAP2_BOH 0x00000001
#define AHCI_CAP2_NVMP 0x00000002
#define AHCI_CAP2_APST 0x00000004
#define AHCI_CAP2_SDS 0x00000008
#define AHCI_CAP2_SADM 0x00000010
#define AHCI_CAP2_DESO 0x00000020
#define AHCI_OFFSET 0x100
#define AHCI_STEP 0x80
#define AHCI_P_CLB 0x00
#define AHCI_P_CLBU 0x04
#define AHCI_P_FB 0x08
#define AHCI_P_FBU 0x0c
#define AHCI_P_IS 0x10
#define AHCI_P_IE 0x14
#define AHCI_P_IX_DHR 0x00000001
#define AHCI_P_IX_PS 0x00000002
#define AHCI_P_IX_DS 0x00000004
#define AHCI_P_IX_SDB 0x00000008
#define AHCI_P_IX_UF 0x00000010
#define AHCI_P_IX_DP 0x00000020
#define AHCI_P_IX_PC 0x00000040
#define AHCI_P_IX_MP 0x00000080
#define AHCI_P_IX_PRC 0x00400000
#define AHCI_P_IX_IPM 0x00800000
#define AHCI_P_IX_OF 0x01000000
#define AHCI_P_IX_INF 0x04000000
#define AHCI_P_IX_IF 0x08000000
#define AHCI_P_IX_HBD 0x10000000
#define AHCI_P_IX_HBF 0x20000000
#define AHCI_P_IX_TFE 0x40000000
#define AHCI_P_IX_CPD 0x80000000
#define AHCI_P_CMD 0x18
#define AHCI_P_CMD_ST 0x00000001
#define AHCI_P_CMD_SUD 0x00000002
#define AHCI_P_CMD_POD 0x00000004
#define AHCI_P_CMD_CLO 0x00000008
#define AHCI_P_CMD_FRE 0x00000010
#define AHCI_P_CMD_CCS_MASK 0x00001f00
#define AHCI_P_CMD_CCS_SHIFT 8
#define AHCI_P_CMD_ISS 0x00002000
#define AHCI_P_CMD_FR 0x00004000
#define AHCI_P_CMD_CR 0x00008000
#define AHCI_P_CMD_CPS 0x00010000
#define AHCI_P_CMD_PMA 0x00020000
#define AHCI_P_CMD_HPCP 0x00040000
#define AHCI_P_CMD_MPSP 0x00080000
#define AHCI_P_CMD_CPD 0x00100000
#define AHCI_P_CMD_ESP 0x00200000
#define AHCI_P_CMD_FBSCP 0x00400000
#define AHCI_P_CMD_APSTE 0x00800000
#define AHCI_P_CMD_ATAPI 0x01000000
#define AHCI_P_CMD_DLAE 0x02000000
#define AHCI_P_CMD_ALPE 0x04000000
#define AHCI_P_CMD_ASP 0x08000000
#define AHCI_P_CMD_ICC_MASK 0xf0000000
#define AHCI_P_CMD_NOOP 0x00000000
#define AHCI_P_CMD_ACTIVE 0x10000000
#define AHCI_P_CMD_PARTIAL 0x20000000
#define AHCI_P_CMD_SLUMBER 0x60000000
#define AHCI_P_CMD_DEVSLEEP 0x80000000
#define AHCI_P_TFD 0x20
#define AHCI_P_SIG 0x24
#define AHCI_P_SSTS 0x28
#define AHCI_P_SCTL 0x2c
#define AHCI_P_SERR 0x30
#define AHCI_P_SACT 0x34
#define AHCI_P_CI 0x38
#define AHCI_P_SNTF 0x3C
#define AHCI_P_FBS 0x40
#define AHCI_P_FBS_EN 0x00000001
#define AHCI_P_FBS_DEC 0x00000002
#define AHCI_P_FBS_SDE 0x00000004
#define AHCI_P_FBS_DEV 0x00000f00
#define AHCI_P_FBS_DEV_SHIFT 8
#define AHCI_P_FBS_ADO 0x0000f000
#define AHCI_P_FBS_ADO_SHIFT 12
#define AHCI_P_FBS_DWE 0x000f0000
#define AHCI_P_FBS_DWE_SHIFT 16
#define AHCI_P_DEVSLP 0x44
#define AHCI_P_DEVSLP_ADSE 0x00000001
#define AHCI_P_DEVSLP_DSP 0x00000002
#define AHCI_P_DEVSLP_DETO 0x000003fc
#define AHCI_P_DEVSLP_DETO_SHIFT 2
#define AHCI_P_DEVSLP_MDAT 0x00007c00
#define AHCI_P_DEVSLP_MDAT_SHIFT 10
#define AHCI_P_DEVSLP_DITO 0x01ff8000
#define AHCI_P_DEVSLP_DITO_SHIFT 15
#define AHCI_P_DEVSLP_DM 0x0e000000
#define AHCI_P_DEVSLP_DM_SHIFT 25
/* Just to be sure, if building as module. */
#if MAXPHYS < 512 * 1024
#undef MAXPHYS
#define MAXPHYS (512 * 1024)
#endif
/* Pessimistic prognosis on number of required S/G entries */
#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8))
/* Command list. 32 commands. First, 1Kbyte aligned. */
#define AHCI_CL_OFFSET 0
#define AHCI_CL_SIZE 32
/* Command tables. Up to 32 commands, each 128-byte aligned. */
#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS)
#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16)
/* Total main work area. */
#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots)
#endif /* _AHCI_H_ */

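Most of these AHCI definitions are mask/shift pairs over 32-bit registers. A small sketch (not from this commit) of extracting fields from a CAP register value with the macros above:

#include <stdint.h>
#include <stdio.h>

static void
ahci_decode_cap(uint32_t cap)
{
	/* Both fields are 0's-based counts per the AHCI spec. */
	int nports = (cap & AHCI_CAP_NPMASK) + 1;
	int nslots = ((cap & AHCI_CAP_NCS) >> AHCI_CAP_NCS_SHIFT) + 1;

	printf("%d ports, %d command slots, NCQ %ssupported\n",
	    nports, nslots, (cap & AHCI_CAP_SNCQ) ? "" : "not ");
}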
bhyve/atkbdc.c (new file, 90 lines)

@@ -0,0 +1,90 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include "inout.h"
#include "pci_lpc.h"
#define KBD_DATA_PORT 0x60
#define KBD_STS_CTL_PORT 0x64
#define KBD_SYS_FLAG 0x4
#define KBDC_RESET 0xfe
static int
atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 1)
return (-1);
*eax = 0;
return (0);
}
static int
atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg)
{
int error, retval;
if (bytes != 1)
return (-1);
retval = 0;
if (in) {
*eax = KBD_SYS_FLAG; /* system passed POST */
} else {
switch (*eax) {
case KBDC_RESET: /* Pulse "reset" line. */
error = vm_suspend(ctx, VM_SUSPEND_RESET);
assert(error == 0 || errno == EALREADY);
break;
}
}
return (retval);
}
INOUT_PORT(atkbdc, KBD_DATA_PORT, IOPORT_F_INOUT, atkbdc_data_handler);
SYSRES_IO(KBD_DATA_PORT, 1);
INOUT_PORT(atkbdc, KBD_STS_CTL_PORT, IOPORT_F_INOUT,
atkbdc_sts_ctl_handler);
SYSRES_IO(KBD_STS_CTL_PORT, 1);

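atkbdc.c shows the linker-set registration pattern used throughout bhyve: INOUT_PORT() (from inout.h) records a handler for an I/O port at load time, and SYSRES_IO() (from pci_lpc.h) marks the range as a system resource. A hypothetical handler for another one-byte port would follow the same shape (the port number and behavior here are invented for illustration):

static int
dbg_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
    uint32_t *eax, void *arg)
{
	if (bytes != 1)
		return (-1);
	if (in)
		*eax = 0;
	else
		fprintf(stderr, "debug write 0x%02x\n", *eax);
	return (0);
}
INOUT_PORT(dbg, 0x80, IOPORT_F_INOUT, dbg_port_handler);
SYSRES_IO(0x80, 1);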
bhyve/bhyve.8 (new file, 325 lines)

@@ -0,0 +1,325 @@
.\" Copyright (c) 2013 Peter Grehan
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
.\"
.Dd September 17, 2014
.Dt BHYVE 8
.Os
.Sh NAME
.Nm bhyve
.Nd "run a guest operating system inside a virtual machine"
.Sh SYNOPSIS
.Nm
.Op Fl abehuwxACHPWY
.Op Fl c Ar numcpus
.Op Fl g Ar gdbport
.Op Fl l Ar lpcdev Ns Op , Ns Ar conf
.Op Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
.Op Fl p Ar vcpu:hostcpu
.Op Fl s Ar slot,emulation Ns Op , Ns Ar conf
.Op Fl U Ar uuid
.Ar vmname
.Sh DESCRIPTION
.Nm
is a hypervisor that runs guest operating systems inside a
virtual machine.
.Pp
Parameters such as the number of virtual CPUs, amount of guest memory, and
I/O connectivity can be specified with command-line parameters.
.Pp
The guest operating system must be loaded with
.Xr bhyveload 8
or a similar boot loader before running
.Nm .
.Pp
.Nm
runs until the guest operating system reboots or an unhandled hypervisor
exit is detected.
.Sh OPTIONS
.Bl -tag -width 10n
.It Fl a
The guest's local APIC is configured in xAPIC mode.
xAPIC mode is the default setting, so this option is redundant.
It will be deprecated in a future version.
.It Fl A
Generate ACPI tables.
Required for
.Fx Ns /amd64
guests.
.It Fl b
Enable a low-level console device supported by
.Fx
kernels compiled with
.Cd "device bvmconsole" .
This option will be deprecated in a future version.
.It Fl c Ar numcpus
Number of guest virtual CPUs.
The default is 1 and the maximum is 16.
.It Fl C
Include guest memory in core file.
.It Fl e
Force
.Nm
to exit when a guest issues an access to an I/O port that is not emulated.
This is intended for debug purposes.
.It Fl g Ar gdbport
For
.Fx
kernels compiled with
.Cd "device bvmdebug" ,
allow a remote kernel kgdb to be relayed to the guest kernel gdb stub
via a local IPv4 address and this port.
This option will be deprecated in a future version.
.It Fl h
Print help message and exit.
.It Fl H
Yield the virtual CPU thread when a HLT instruction is detected.
If this option is not specified, virtual CPUs will use 100% of a host CPU.
.It Fl l Ar lpcdev Ns Op , Ns Ar conf
Allow devices behind the LPC PCI-ISA bridge to be configured.
The only supported devices are the TTY-class devices,
.Li com1
and
.Li com2 .
.It Fl m Ar size Ns Op Ar K|k|M|m|G|g|T|t
Guest physical memory size in bytes.
This must be the same size that was given to
.Xr bhyveload 8 .
.Pp
The size argument may be suffixed with one of K, M, G or T (either upper
or lower case) to indicate a multiple of kilobytes, megabytes, gigabytes,
or terabytes.
If no suffix is given, the value is assumed to be in megabytes.
.It Fl p Ar vcpu:hostcpu
Pin guest's virtual CPU
.Em vcpu
to
.Em hostcpu .
.It Fl P
Force the guest virtual CPU to exit when a PAUSE instruction is detected.
.It Fl s Ar slot,emulation Ns Op , Ns Ar conf
Configure a virtual PCI slot and function.
.Pp
.Nm bhyve
provides PCI bus emulation and virtual devices that can be attached to
slots on the bus.
There are 32 available slots, with the option of providing up to 8 functions
per slot.
.Bl -tag -width 10n
.It Ar slot
.Ar pcislot[:function]
.Ar bus:pcislot:function
.Pp
The
.Ar pcislot
value is 0 to 31.
The optional function value is 0 to 7.
The optional
.Ar bus
value is 0 to 255.
If not specified, the function value defaults to 0.
If not specified, the bus value defaults to 0.
.It Ar emulation
.Bl -tag -width 10n
.It Li hostbridge | Li amd_hostbridge
.Pp
Provide a simple host bridge.
This is usually configured at slot 0, and is required by most guest
operating systems.
The
.Li amd_hostbridge
emulation is identical but uses a PCI vendor ID of
.Li AMD .
.It Li passthru
PCI pass-through device.
.It Li virtio-net
Virtio network interface.
.It Li virtio-blk
Virtio block storage interface.
.It Li virtio-rnd
Virtio RNG interface.
.It Li ahci-cd
AHCI controller attached to an ATAPI CD/DVD.
.It Li ahci-hd
AHCI controller attached to a SATA hard-drive.
.It Li uart
PCI 16550 serial device.
.It Li lpc
LPC PCI-ISA bridge with COM1 and COM2 16550 serial ports.
The LPC bridge emulation can only be configured on bus 0.
.El
.It Op Ar conf
This optional parameter describes the backend for device emulations.
If
.Ar conf
is not specified, the device emulation has no backend and can be
considered unconnected.
.Pp
Network devices:
.Bl -tag -width 10n
.It Ar tapN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx
.It Ar vmnetN Ns Op , Ns Ar mac=xx:xx:xx:xx:xx:xx
.Pp
If
.Ar mac
is not specified, the MAC address is derived from a fixed OUI and the
remaining bytes from an MD5 hash of the slot and function numbers and
the device name.
.Pp
The MAC address is an ASCII string in
.Xr ethers 5
format.
.El
.Pp
Block storage devices:
.Bl -tag -width 10n
.It Pa /filename Ns Oo , Ns Ar block-device-options Oc
.It Pa /dev/xxx Ns Oo , Ns Ar block-device-options Oc
.El
.Pp
The
.Ar block-device-options
are:
.Bl -tag -width 8n
.It Li nocache
Open the file with
.Dv O_DIRECT .
.It Li direct
Open the file using
.Dv O_SYNC .
.It Li ro
Force the file to be opened read-only.
.It Li sectorsize= Ns Ar logical Ns Oo / Ns Ar physical Oc
Specify the logical and physical sector sizes of the emulated disk.
The physical sector size is optional and is equal to the logical sector size
if not explicitly specified.
.El
.Pp
TTY devices:
.Bl -tag -width 10n
.It Li stdio
Connect the serial port to the standard input and output of
the bhyve process.
.It Pa /dev/xxx
Use the host TTY device for serial port I/O.
.El
.Pp
Pass-through devices:
.Bl -tag -width 10n
.It Ns Ar bus Ns / Ns Ar slot Ns / Ns Ar function
Connect to a PCI device on the host at the selector described by
.Ar bus ,
.Ar slot ,
and
.Ar function
numbers.
.El
.Pp
The host device must have been reserved at boot-time using the
.Va pptdev
loader variable as described in
.Xr vmm 4 .
.El
.It Fl u
RTC keeps UTC time.
.It Fl U Ar uuid
Set the universally unique identifier
.Pq UUID
in the guest's System Management BIOS System Information structure.
By default a UUID is generated from the host's hostname and
.Ar vmname .
.It Fl w
Ignore accesses to unimplemented Model Specific Registers (MSRs).
This is intended for debug purposes.
.It Fl W
Force virtio PCI device emulations to use MSI interrupts instead of MSI-X
interrupts.
.It Fl x
The guest's local APIC is configured in x2APIC mode.
.It Fl Y
Disable MPtable generation.
.It Ar vmname
Alphanumeric name of the guest.
This should be the same as that created by
.Xr bhyveload 8 .
.El
.Sh EXAMPLES
The guest operating system must have been loaded with
.Xr bhyveload 8
or a similar boot loader before
.Nm
can be run.
.Pp
To run a virtual machine with 1GB of memory, two virtual CPUs, a virtio
block device backed by the
.Pa /my/image
filesystem image, and a serial port for the console:
.Bd -literal -offset indent
bhyve -c 2 -s 0,hostbridge -s 1,lpc -s 2,virtio-blk,/my/image \\
-l com1,stdio -A -H -P -m 1G vm1
.Ed
.Pp
Run a 24GB single-CPU virtual machine with three network ports, one of which
has a MAC address specified:
.Bd -literal -offset indent
bhyve -s 0,hostbridge -s 1,lpc -s 2:0,virtio-net,tap0 \\
-s 2:1,virtio-net,tap1 \\
-s 2:2,virtio-net,tap2,mac=00:be:fa:76:45:00 \\
-s 3,virtio-blk,/my/image -l com1,stdio \\
-A -H -P -m 24G bigvm
.Ed
.Pp
Run an 8GB quad-CPU virtual machine with 8 AHCI SATA disks, an AHCI ATAPI
CD-ROM, a single virtio network port, an AMD hostbridge, and the console
port connected to an
.Xr nmdm 4
null-modem device:
.Bd -literal -offset indent
bhyve -c 4 \\
-s 0,amd_hostbridge -s 1,lpc \\
-s 1:0,ahci-hd,/images/disk.1 \\
-s 1:1,ahci-hd,/images/disk.2 \\
-s 1:2,ahci-hd,/images/disk.3 \\
-s 1:3,ahci-hd,/images/disk.4 \\
-s 1:4,ahci-hd,/images/disk.5 \\
-s 1:5,ahci-hd,/images/disk.6 \\
-s 1:6,ahci-hd,/images/disk.7 \\
-s 1:7,ahci-hd,/images/disk.8 \\
-s 2,ahci-cd,/images/install.iso \\
-s 3,virtio-net,tap0 \\
-l com1,/dev/nmdm0A \\
-A -H -P -m 8G bigvm
.Ed
.Sh SEE ALSO
.Xr bhyve 4 ,
.Xr nmdm 4 ,
.Xr vmm 4 ,
.Xr ethers 5 ,
.Xr bhyvectl 8 ,
.Xr bhyveload 8
.Sh HISTORY
.Nm
first appeared in
.Fx 10.0 .
.Sh AUTHORS
.An Neel Natu Aq Mt neel@freebsd.org
.An Peter Grehan Aq Mt grehan@freebsd.org

bhyve/bhyverun.c (new file, 892 lines)

@@ -0,0 +1,892 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <machine/atomic.h>
#include <machine/segments.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "bhyverun.h"
#include "acpi.h"
#include "inout.h"
#include "dbgport.h"
#include "ioapic.h"
#include "mem.h"
#include "mevent.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "smbiostbl.h"
#include "xmsr.h"
#include "spinup_ap.h"
#include "rtc.h"
#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
#define MB (1024UL * 1024)
#define GB (1024UL * MB)
typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu);
char *vmname;
int guest_ncpus;
char *guest_uuid_str;
static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
static int virtio_msix = 1;
static int x2apic_mode = 0; /* default is xAPIC */
static int strictio;
static int strictmsr = 1;
static int acpi;
static char *progname;
static const int BSP = 0;
static cpuset_t cpumask;
static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
static struct vm_exit vmexit[VM_MAXCPU];
struct bhyvestats {
uint64_t vmexit_bogus;
uint64_t vmexit_bogus_switch;
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
uint64_t vmexit_inst_emul;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
} stats;
struct mt_vmm_info {
pthread_t mt_thr;
struct vmctx *mt_ctx;
int mt_vcpu;
} mt_vmm_info[VM_MAXCPU];
static cpuset_t *vcpumap[VM_MAXCPU] = { NULL };
static void
usage(int code)
{
fprintf(stderr,
"Usage: %s [-abehuwxACHPWY] [-c vcpus] [-g <gdb port>] [-l <lpc>]\n"
" %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n"
" -a: local apic is in xAPIC mode (deprecated)\n"
" -A: create ACPI tables\n"
" -c: # cpus (default 1)\n"
" -C: include guest memory in core file\n"
" -e: exit on unhandled I/O access\n"
" -g: gdb port\n"
" -h: help\n"
" -H: vmexit from the guest on hlt\n"
" -l: LPC device configuration\n"
" -m: memory size in MB\n"
" -p: pin 'vcpu' to 'hostcpu'\n"
" -P: vmexit from the guest on pause\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
" -u: RTC keeps UTC time\n"
" -U: uuid\n"
" -w: ignore unimplemented MSRs\n"
" -W: force virtio to use single-vector MSI\n"
" -x: local apic is in x2APIC mode\n"
" -Y: disable MPtable generation\n",
progname, (int)strlen(progname), "");
exit(code);
}
static int
pincpu_parse(const char *opt)
{
int vcpu, pcpu;
if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
fprintf(stderr, "invalid format: %s\n", opt);
return (-1);
}
if (vcpu < 0 || vcpu >= VM_MAXCPU) {
fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n",
vcpu, VM_MAXCPU - 1);
return (-1);
}
if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
fprintf(stderr, "hostcpu '%d' outside valid range from "
"0 to %d\n", pcpu, CPU_SETSIZE - 1);
return (-1);
}
if (vcpumap[vcpu] == NULL) {
if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) {
perror("malloc");
return (-1);
}
CPU_ZERO(vcpumap[vcpu]);
}
CPU_SET(pcpu, vcpumap[vcpu]);
return (0);
}
void
vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
int errcode)
{
struct vmctx *ctx;
int error, restart_instruction;
ctx = arg;
restart_instruction = 1;
error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
restart_instruction);
assert(error == 0);
}
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
return (vm_map_gpa(ctx, gaddr, len));
}
int
fbsdrun_vmexit_on_pause(void)
{
return (guest_vmexit_on_pause);
}
int
fbsdrun_vmexit_on_hlt(void)
{
return (guest_vmexit_on_hlt);
}
int
fbsdrun_virtio_msix(void)
{
return (virtio_msix);
}
static void *
fbsdrun_start_thread(void *param)
{
char tname[MAXCOMLEN + 1];
struct mt_vmm_info *mtp;
int vcpu;
mtp = param;
vcpu = mtp->mt_vcpu;
snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
pthread_set_name_np(mtp->mt_thr, tname);
vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
/* not reached */
exit(1);
return (NULL);
}
void
fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
{
int error;
assert(fromcpu == BSP);
/*
* The 'newcpu' must be activated in the context of 'fromcpu'. If
* vm_activate_cpu() is delayed until newcpu's pthread starts running
* then vmm.ko is out-of-sync with bhyve and this can create a race
* with vm_suspend().
*/
error = vm_activate_cpu(ctx, newcpu);
assert(error == 0);
CPU_SET_ATOMIC(newcpu, &cpumask);
/*
* Set up the vmexit struct to allow execution to start
* at the given RIP
*/
vmexit[newcpu].rip = rip;
vmexit[newcpu].inst_length = 0;
mt_vmm_info[newcpu].mt_ctx = ctx;
mt_vmm_info[newcpu].mt_vcpu = newcpu;
error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
fbsdrun_start_thread, &mt_vmm_info[newcpu]);
assert(error == 0);
}
static int
fbsdrun_deletecpu(struct vmctx *ctx, int vcpu)
{
if (!CPU_ISSET(vcpu, &cpumask)) {
fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
exit(1);
}
CPU_CLR_ATOMIC(vcpu, &cpumask);
return (CPU_EMPTY(&cpumask));
}
static int
vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
uint32_t eax)
{
#if BHYVE_DEBUG
/*
* put guest-driven debug here
*/
#endif
return (VMEXIT_CONTINUE);
}
static int
vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
int bytes, port, in, out, string;
int vcpu;
vcpu = *pvcpu;
port = vme->u.inout.port;
bytes = vme->u.inout.bytes;
string = vme->u.inout.string;
in = vme->u.inout.in;
out = !in;
/* Extra-special case of host notifications */
if (out && port == GUEST_NIO_PORT) {
error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
return (error);
}
error = emulate_inout(ctx, vcpu, vme, strictio);
if (error) {
fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
in ? "in" : "out",
bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
port, vme->rip);
return (VMEXIT_ABORT);
} else {
return (VMEXIT_CONTINUE);
}
}
static int
vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
uint64_t val;
uint32_t eax, edx;
int error;
val = 0;
error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
if (error != 0) {
fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
vme->u.msr.code, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
eax = val;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
assert(error == 0);
edx = val >> 32;
error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
assert(error == 0);
return (VMEXIT_CONTINUE);
}
static int
vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int error;
error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
if (error != 0) {
fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
vme->u.msr.code, vme->u.msr.wval, *pvcpu);
if (strictmsr) {
vm_inject_gp(ctx, *pvcpu);
return (VMEXIT_CONTINUE);
}
}
return (VMEXIT_CONTINUE);
}
static int
vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
int newcpu;
int retval = VMEXIT_CONTINUE;
newcpu = spinup_ap(ctx, *pvcpu,
vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
return (retval);
}
#define DEBUG_EPT_MISCONFIG
#ifdef DEBUG_EPT_MISCONFIG
#define EXIT_REASON_EPT_MISCONFIG 49
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
#define VMCS_IDENT(x) ((x) | 0x80000000)
static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif
static int
vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tVMX\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
fprintf(stderr, "\tqualification\t0x%016lx\n",
vmexit->u.vmx.exit_qualification);
fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
#ifdef DEBUG_EPT_MISCONFIG
if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
vm_get_register(ctx, *pvcpu,
VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
&ept_misconfig_gpa);
vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
&ept_misconfig_ptenum);
fprintf(stderr, "\tEPT misconfiguration:\n");
fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
ept_misconfig_ptenum, ept_misconfig_pte[0],
ept_misconfig_pte[1], ept_misconfig_pte[2],
ept_misconfig_pte[3]);
}
#endif /* DEBUG_EPT_MISCONFIG */
return (VMEXIT_ABORT);
}
static int
vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
fprintf(stderr, "vm exit[%d]\n", *pvcpu);
fprintf(stderr, "\treason\t\tSVM\n");
fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode);
fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1);
fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2);
return (VMEXIT_ABORT);
}
static int
vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_bogus++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_hlt++;
/*
* Just continue execution with the next instruction. We use
* the HLT VM exit as a way to be friendly with the host
* scheduler.
*/
return (VMEXIT_CONTINUE);
}
static int
vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
stats.vmexit_pause++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
assert(vmexit->inst_length == 0);
stats.vmexit_mtrap++;
return (VMEXIT_CONTINUE);
}
static int
vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
int err, i;
struct vie *vie;
stats.vmexit_inst_emul++;
vie = &vmexit->u.inst_emul.vie;
err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
vie, &vmexit->u.inst_emul.paging);
if (err) {
if (err == ESRCH) {
fprintf(stderr, "Unhandled memory access to 0x%lx\n",
vmexit->u.inst_emul.gpa);
}
fprintf(stderr, "Failed to emulate instruction [");
for (i = 0; i < vie->num_valid; i++) {
fprintf(stderr, "0x%02x%s", vie->inst[i],
i != (vie->num_valid - 1) ? " " : "");
}
fprintf(stderr, "] at 0x%lx\n", vmexit->rip);
return (VMEXIT_ABORT);
}
return (VMEXIT_CONTINUE);
}
static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
static int
vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
enum vm_suspend_how how;
how = vmexit->u.suspended.how;
fbsdrun_deletecpu(ctx, *pvcpu);
if (*pvcpu != BSP) {
pthread_mutex_lock(&resetcpu_mtx);
pthread_cond_signal(&resetcpu_cond);
pthread_mutex_unlock(&resetcpu_mtx);
pthread_exit(NULL);
}
pthread_mutex_lock(&resetcpu_mtx);
while (!CPU_EMPTY(&cpumask)) {
pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
}
pthread_mutex_unlock(&resetcpu_mtx);
switch (how) {
case VM_SUSPEND_RESET:
exit(0);
case VM_SUSPEND_POWEROFF:
exit(1);
case VM_SUSPEND_HALT:
exit(2);
case VM_SUSPEND_TRIPLEFAULT:
exit(3);
default:
fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
exit(100);
}
return (0); /* NOTREACHED */
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
[VM_EXITCODE_INOUT] = vmexit_inout,
[VM_EXITCODE_INOUT_STR] = vmexit_inout,
[VM_EXITCODE_VMX] = vmexit_vmx,
[VM_EXITCODE_SVM] = vmexit_svm,
[VM_EXITCODE_BOGUS] = vmexit_bogus,
[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
[VM_EXITCODE_MTRAP] = vmexit_mtrap,
[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
};
static void
vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
{
int error, rc, prevcpu;
enum vm_exitcode exitcode;
cpuset_t active_cpus;
if (vcpumap[vcpu] != NULL) {
error = pthread_setaffinity_np(pthread_self(),
sizeof(cpuset_t), vcpumap[vcpu]);
assert(error == 0);
}
error = vm_active_cpus(ctx, &active_cpus);
assert(CPU_ISSET(vcpu, &active_cpus));
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
assert(error == 0);
while (1) {
error = vm_run(ctx, vcpu, &vmexit[vcpu]);
if (error != 0)
break;
prevcpu = vcpu;
exitcode = vmexit[vcpu].exitcode;
if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
exitcode);
exit(1);
}
rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
switch (rc) {
case VMEXIT_CONTINUE:
break;
case VMEXIT_ABORT:
abort();
default:
exit(1);
}
}
fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}
static int
num_vcpus_allowed(struct vmctx *ctx)
{
int tmp, error;
error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
/*
* The guest is allowed to spinup more than one processor only if the
* UNRESTRICTED_GUEST capability is available.
*/
if (error == 0)
return (VM_MAXCPU);
else
return (1);
}
void
fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
{
int err, tmp;
if (fbsdrun_vmexit_on_hlt()) {
err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
if (err < 0) {
fprintf(stderr, "VM exit on HLT not supported\n");
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_HLT] = vmexit_hlt;
}
if (fbsdrun_vmexit_on_pause()) {
/*
* pause exit support required for this mode
*/
err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
if (err < 0) {
fprintf(stderr,
"SMP mux requested, no pause support\n");
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
if (cpu == BSP)
handler[VM_EXITCODE_PAUSE] = vmexit_pause;
}
if (x2apic_mode)
err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
else
err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
if (err) {
fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
exit(1);
}
vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
}
int
main(int argc, char *argv[])
{
int c, error, gdb_port, err, bvmcons;
int dump_guest_memory, max_vcpus, mptgen;
int rtc_localtime;
struct vmctx *ctx;
uint64_t rip;
size_t memsize;
bvmcons = 0;
dump_guest_memory = 0;
progname = basename(argv[0]);
gdb_port = 0;
guest_ncpus = 1;
memsize = 256 * MB;
mptgen = 1;
rtc_localtime = 1;
while ((c = getopt(argc, argv, "abehuwxACHIPWYp:g:c:s:m:l:U:")) != -1) {
switch (c) {
case 'a':
x2apic_mode = 0;
break;
case 'A':
acpi = 1;
break;
case 'b':
bvmcons = 1;
break;
case 'p':
if (pincpu_parse(optarg) != 0) {
errx(EX_USAGE, "invalid vcpu pinning "
"configuration '%s'", optarg);
}
break;
case 'c':
guest_ncpus = atoi(optarg);
break;
case 'C':
dump_guest_memory = 1;
break;
case 'g':
gdb_port = atoi(optarg);
break;
case 'l':
if (lpc_device_parse(optarg) != 0) {
errx(EX_USAGE, "invalid lpc device "
"configuration '%s'", optarg);
}
break;
case 's':
if (pci_parse_slot(optarg) != 0)
exit(1);
else
break;
case 'm':
error = vm_parse_memsize(optarg, &memsize);
if (error)
errx(EX_USAGE, "invalid memsize '%s'", optarg);
break;
case 'H':
guest_vmexit_on_hlt = 1;
break;
case 'I':
/*
* The "-I" option was used to add an ioapic to the
* virtual machine.
*
* An ioapic is now provided unconditionally for each
* virtual machine and this option is now deprecated.
*/
break;
case 'P':
guest_vmexit_on_pause = 1;
break;
case 'e':
strictio = 1;
break;
case 'u':
rtc_localtime = 0;
break;
case 'U':
guest_uuid_str = optarg;
break;
case 'w':
strictmsr = 0;
break;
case 'W':
virtio_msix = 0;
break;
case 'x':
x2apic_mode = 1;
break;
case 'Y':
mptgen = 0;
break;
case 'h':
usage(0);
default:
usage(1);
}
}
argc -= optind;
argv += optind;
if (argc != 1)
usage(1);
vmname = argv[0];
ctx = vm_open(vmname);
if (ctx == NULL) {
perror("vm_open");
exit(1);
}
if (guest_ncpus < 1) {
fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
exit(1);
}
max_vcpus = num_vcpus_allowed(ctx);
if (guest_ncpus > max_vcpus) {
fprintf(stderr, "%d vCPUs requested but only %d available\n",
guest_ncpus, max_vcpus);
exit(1);
}
fbsdrun_set_capabilities(ctx, BSP);
if (dump_guest_memory)
vm_set_memflags(ctx, VM_MEM_F_INCORE);
err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
if (err) {
fprintf(stderr, "Unable to setup memory (%d)\n", err);
exit(1);
}
error = init_msr();
if (error) {
fprintf(stderr, "init_msr error %d", error);
exit(1);
}
init_mem();
init_inout();
pci_irq_init(ctx);
ioapic_init(ctx);
rtc_init(ctx, rtc_localtime);
sci_init(ctx);
/*
* Exit if a device emulation finds an error in its initialization
*/
if (init_pci(ctx) != 0)
exit(1);
if (gdb_port != 0)
init_dbgport(gdb_port);
if (bvmcons)
init_bvmcons();
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
/*
* build the guest tables, MP etc.
*/
if (mptgen) {
error = mptable_build(ctx, guest_ncpus);
if (error)
exit(1);
}
error = smbios_build(ctx);
assert(error == 0);
if (acpi) {
error = acpi_build(ctx, guest_ncpus);
assert(error == 0);
}
/*
* Change the proc title to include the VM name.
*/
setproctitle("%s", vmname);
/*
* Add CPU 0
*/
fbsdrun_addcpu(ctx, BSP, BSP, rip);
/*
* Head off to the main event dispatch loop
*/
mevent_dispatch();
exit(1);
}

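The handler[] table keyed by exit code is the dispatch core of vm_loop(): every pass through vm_run() indexes the table with vmexit[vcpu].exitcode. As an illustrative sketch (not part of the commit; VM_EXITCODE_EXAMPLE is invented), an additional exit handler would be wired up the same way:

static int
vmexit_example(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
{
	fprintf(stderr, "example exit on vcpu %d at rip 0x%lx\n",
	    *pvcpu, vme->rip);
	return (VMEXIT_CONTINUE);	/* let vm_loop() resume the vcpu */
}

/* ... and in the table: [VM_EXITCODE_EXAMPLE] = vmexit_example, */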
bhyve/bhyverun.h (new file, 55 lines)

@@ -0,0 +1,55 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _FBSDRUN_H_
#define _FBSDRUN_H_
#ifndef CTASSERT /* Allow lint to override */
#define CTASSERT(x) _CTASSERT(x, __LINE__)
#define _CTASSERT(x, y) __CTASSERT(x, y)
#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
#endif
#define VMEXIT_CONTINUE (0)
#define VMEXIT_ABORT (-1)
struct vmctx;
extern int guest_ncpus;
extern char *guest_uuid_str;
extern char *vmname;
void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len);
void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu);
void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip);
int fbsdrun_muxed(void);
int fbsdrun_vmexit_on_hlt(void);
int fbsdrun_vmexit_on_pause(void);
int fbsdrun_disable_x2apic(void);
int fbsdrun_virtio_msix(void);
#endif

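The CTASSERT macro above turns a compile-time predicate into a typedef of a negative-size array when the predicate is false, so a violation fails the build rather than surfacing at run time. A one-line usage sketch (illustrative):

/* Fails to compile if the two exit statuses ever collide. */
CTASSERT(VMEXIT_CONTINUE != VMEXIT_ABORT);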
bhyve/block_if.c (new file, 822 lines)

@@ -0,0 +1,822 @@
/*-
* Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <unistd.h>
#include <machine/atomic.h>
#include "bhyverun.h"
#include "mevent.h"
#include "block_if.h"
#define BLOCKIF_SIG 0xb109b109
#define BLOCKIF_NUMTHR 8
#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR)
enum blockop {
BOP_READ,
BOP_WRITE,
BOP_FLUSH,
BOP_DELETE
};
enum blockstat {
BST_FREE,
BST_BLOCK,
BST_PEND,
BST_BUSY,
BST_DONE
};
struct blockif_elem {
TAILQ_ENTRY(blockif_elem) be_link;
struct blockif_req *be_req;
enum blockop be_op;
enum blockstat be_status;
pthread_t be_tid;
off_t be_block;
};
struct blockif_ctxt {
int bc_magic;
int bc_fd;
int bc_ischr;
int bc_isgeom;
int bc_candelete;
int bc_rdonly;
off_t bc_size;
int bc_sectsz;
int bc_psectsz;
int bc_psectoff;
int bc_closing;
pthread_t bc_btid[BLOCKIF_NUMTHR];
pthread_mutex_t bc_mtx;
pthread_cond_t bc_cond;
/* Request elements and free/pending/busy queues */
TAILQ_HEAD(, blockif_elem) bc_freeq;
TAILQ_HEAD(, blockif_elem) bc_pendq;
TAILQ_HEAD(, blockif_elem) bc_busyq;
struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
};
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
struct blockif_sig_elem {
pthread_mutex_t bse_mtx;
pthread_cond_t bse_cond;
int bse_pending;
struct blockif_sig_elem *bse_next;
};
static struct blockif_sig_elem *blockif_bse_head;
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
enum blockop op)
{
struct blockif_elem *be, *tbe;
off_t off;
int i;
be = TAILQ_FIRST(&bc->bc_freeq);
assert(be != NULL);
assert(be->be_status == BST_FREE);
TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
be->be_req = breq;
be->be_op = op;
switch (op) {
case BOP_READ:
case BOP_WRITE:
case BOP_DELETE:
off = breq->br_offset;
for (i = 0; i < breq->br_iovcnt; i++)
off += breq->br_iov[i].iov_len;
break;
default:
off = OFF_MAX;
}
be->be_block = off;
TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
if (tbe->be_block == breq->br_offset)
break;
}
if (tbe == NULL) {
TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
if (tbe->be_block == breq->br_offset)
break;
}
}
if (tbe == NULL)
be->be_status = BST_PEND;
else
be->be_status = BST_BLOCK;
TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
return (be->be_status == BST_PEND);
}
static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
struct blockif_elem *be;
TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
if (be->be_status == BST_PEND)
break;
assert(be->be_status == BST_BLOCK);
}
if (be == NULL)
return (0);
TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
be->be_status = BST_BUSY;
be->be_tid = t;
TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
*bep = be;
return (1);
}
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
struct blockif_elem *tbe;
if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
else
TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
if (tbe->be_req->br_offset == be->be_block)
tbe->be_status = BST_PEND;
}
be->be_tid = 0;
be->be_status = BST_FREE;
be->be_req = NULL;
TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
struct blockif_req *br;
off_t arg[2];
ssize_t clen, len, off, boff, voff;
int i, err;
br = be->be_req;
if (br->br_iovcnt <= 1)
buf = NULL;
err = 0;
switch (be->be_op) {
case BOP_READ:
if (buf == NULL) {
if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
br->br_offset)) < 0)
err = errno;
else
br->br_resid -= len;
break;
}
i = 0;
off = voff = 0;
while (br->br_resid > 0) {
len = MIN(br->br_resid, MAXPHYS);
if (pread(bc->bc_fd, buf, len, br->br_offset +
off) < 0) {
err = errno;
break;
}
boff = 0;
do {
clen = MIN(len - boff, br->br_iov[i].iov_len -
voff);
memcpy(br->br_iov[i].iov_base + voff,
buf + boff, clen);
if (clen < br->br_iov[i].iov_len - voff)
voff += clen;
else {
i++;
voff = 0;
}
boff += clen;
} while (boff < len);
off += len;
br->br_resid -= len;
}
break;
case BOP_WRITE:
if (bc->bc_rdonly) {
err = EROFS;
break;
}
if (buf == NULL) {
if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
br->br_offset)) < 0)
err = errno;
else
br->br_resid -= len;
break;
}
i = 0;
off = voff = 0;
while (br->br_resid > 0) {
len = MIN(br->br_resid, MAXPHYS);
boff = 0;
do {
clen = MIN(len - boff, br->br_iov[i].iov_len -
voff);
memcpy(buf + boff,
br->br_iov[i].iov_base + voff, clen);
if (clen < br->br_iov[i].iov_len - voff)
voff += clen;
else {
i++;
voff = 0;
}
boff += clen;
} while (boff < len);
if (pwrite(bc->bc_fd, buf, len, br->br_offset +
off) < 0) {
err = errno;
break;
}
off += len;
br->br_resid -= len;
}
break;
case BOP_FLUSH:
if (bc->bc_ischr) {
if (ioctl(bc->bc_fd, DIOCGFLUSH))
err = errno;
} else if (fsync(bc->bc_fd))
err = errno;
break;
case BOP_DELETE:
if (!bc->bc_candelete)
err = EOPNOTSUPP;
else if (bc->bc_rdonly)
err = EROFS;
else if (bc->bc_ischr) {
arg[0] = br->br_offset;
arg[1] = br->br_resid;
if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
err = errno;
else
br->br_resid = 0;
} else
err = EOPNOTSUPP;
break;
default:
err = EINVAL;
break;
}
be->be_status = BST_DONE;
(*br->br_callback)(br, err);
}
static void *
blockif_thr(void *arg)
{
struct blockif_ctxt *bc;
struct blockif_elem *be;
pthread_t t;
uint8_t *buf;
bc = arg;
if (bc->bc_isgeom)
buf = malloc(MAXPHYS);
else
buf = NULL;
t = pthread_self();
pthread_mutex_lock(&bc->bc_mtx);
for (;;) {
while (blockif_dequeue(bc, t, &be)) {
pthread_mutex_unlock(&bc->bc_mtx);
blockif_proc(bc, be, buf);
pthread_mutex_lock(&bc->bc_mtx);
blockif_complete(bc, be);
}
/* Check ctxt status here to see if exit requested */
if (bc->bc_closing)
break;
pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
}
pthread_mutex_unlock(&bc->bc_mtx);
if (buf)
free(buf);
pthread_exit(NULL);
return (NULL);
}
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
struct blockif_sig_elem *bse;
for (;;) {
/*
* Process the entire list even if not intended for
* this thread.
*/
do {
bse = blockif_bse_head;
if (bse == NULL)
return;
} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
(uintptr_t)bse,
(uintptr_t)bse->bse_next));
pthread_mutex_lock(&bse->bse_mtx);
bse->bse_pending = 0;
pthread_cond_signal(&bse->bse_cond);
pthread_mutex_unlock(&bse->bse_mtx);
}
}
static void
blockif_init(void)
{
mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
(void) signal(SIGCONT, SIG_IGN);
}
struct blockif_ctxt *
blockif_open(const char *optstr, const char *ident)
{
char tname[MAXCOMLEN + 1];
char name[MAXPATHLEN];
char *nopt, *xopts, *cp;
struct blockif_ctxt *bc;
struct stat sbuf;
struct diocgattr_arg arg;
off_t size, psectsz, psectoff;
int extra, fd, i, sectsz;
int nocache, sync, ro, candelete, geom, ssopt, pssopt;
pthread_once(&blockif_once, blockif_init);
fd = -1;
ssopt = 0;
nocache = 0;
sync = 0;
ro = 0;
/*
* The first element in the optstring is always a pathname.
* Optional elements follow
*/
nopt = xopts = strdup(optstr);
while (xopts != NULL) {
cp = strsep(&xopts, ",");
if (cp == nopt) /* file or device pathname */
continue;
else if (!strcmp(cp, "nocache"))
nocache = 1;
else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
sync = 1;
else if (!strcmp(cp, "ro"))
ro = 1;
else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
;
else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
pssopt = ssopt;
else {
fprintf(stderr, "Invalid device option \"%s\"\n", cp);
goto err;
}
}
extra = 0;
if (nocache)
extra |= O_DIRECT;
if (sync)
extra |= O_SYNC;
fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
if (fd < 0 && !ro) {
/* The r/w open failed; retry read-only */
fd = open(nopt, O_RDONLY | extra);
ro = 1;
}
if (fd < 0) {
perror("Could not open backing file");
goto err;
}
if (fstat(fd, &sbuf) < 0) {
perror("Could not stat backing file");
goto err;
}
/*
* Deal with raw devices
*/
size = sbuf.st_size;
sectsz = DEV_BSIZE;
psectsz = psectoff = 0;
candelete = geom = 0;
if (S_ISCHR(sbuf.st_mode)) {
if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
perror("Could not fetch dev blk/sector size");
goto err;
}
assert(size != 0);
assert(sectsz != 0);
if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
arg.len = sizeof(arg.value.i);
if (ioctl(fd, DIOCGATTR, &arg) == 0)
candelete = arg.value.i;
if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
geom = 1;
} else
psectsz = sbuf.st_blksize;
if (ssopt != 0) {
if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
ssopt > pssopt) {
fprintf(stderr, "Invalid sector size %d/%d\n",
ssopt, pssopt);
goto err;
}
/*
* Some backend drivers (e.g. cd0, ada0) require that the I/O
* size be a multiple of the device's sector size.
*
* Validate that the emulated sector size complies with this
* requirement.
*/
if (S_ISCHR(sbuf.st_mode)) {
if (ssopt < sectsz || (ssopt % sectsz) != 0) {
fprintf(stderr, "Sector size %d incompatible "
"with underlying device sector size %d\n",
ssopt, sectsz);
goto err;
}
}
sectsz = ssopt;
psectsz = pssopt;
psectoff = 0;
}
bc = calloc(1, sizeof(struct blockif_ctxt));
if (bc == NULL) {
perror("calloc");
goto err;
}
bc->bc_magic = BLOCKIF_SIG;
bc->bc_fd = fd;
bc->bc_ischr = S_ISCHR(sbuf.st_mode);
bc->bc_isgeom = geom;
bc->bc_candelete = candelete;
bc->bc_rdonly = ro;
bc->bc_size = size;
bc->bc_sectsz = sectsz;
bc->bc_psectsz = psectsz;
bc->bc_psectoff = psectoff;
pthread_mutex_init(&bc->bc_mtx, NULL);
pthread_cond_init(&bc->bc_cond, NULL);
TAILQ_INIT(&bc->bc_freeq);
TAILQ_INIT(&bc->bc_pendq);
TAILQ_INIT(&bc->bc_busyq);
for (i = 0; i < BLOCKIF_MAXREQ; i++) {
bc->bc_reqs[i].be_status = BST_FREE;
TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
}
for (i = 0; i < BLOCKIF_NUMTHR; i++) {
pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
pthread_set_name_np(bc->bc_btid[i], tname);
}
return (bc);
err:
if (fd >= 0)
close(fd);
return (NULL);
}
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
enum blockop op)
{
int err;
err = 0;
pthread_mutex_lock(&bc->bc_mtx);
if (!TAILQ_EMPTY(&bc->bc_freeq)) {
/*
* Enqueue and inform the block i/o thread
* that there is work available
*/
if (blockif_enqueue(bc, breq, op))
pthread_cond_signal(&bc->bc_cond);
} else {
/*
* Callers are not allowed to enqueue more than
* the specified blockif queue limit. Return an
* error to indicate that the queue length has been
* exceeded.
*/
err = E2BIG;
}
pthread_mutex_unlock(&bc->bc_mtx);
return (err);
}
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_READ));
}
int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_WRITE));
}
int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_FLUSH));
}
int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (blockif_request(bc, breq, BOP_DELETE));
}
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
struct blockif_elem *be;
assert(bc->bc_magic == BLOCKIF_SIG);
pthread_mutex_lock(&bc->bc_mtx);
/*
* Check pending requests.
*/
TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
if (be->be_req == breq)
break;
}
if (be != NULL) {
/*
* Found it.
*/
blockif_complete(bc, be);
pthread_mutex_unlock(&bc->bc_mtx);
return (0);
}
/*
* Check in-flight requests.
*/
TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
if (be->be_req == breq)
break;
}
if (be == NULL) {
/*
* Didn't find it.
*/
pthread_mutex_unlock(&bc->bc_mtx);
return (EINVAL);
}
/*
* Interrupt the processing thread to force it to return
* prematurely via its normal callback path.
*/
while (be->be_status == BST_BUSY) {
struct blockif_sig_elem bse, *old_head;
pthread_mutex_init(&bse.bse_mtx, NULL);
pthread_cond_init(&bse.bse_cond, NULL);
bse.bse_pending = 1;
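/*
 * Lock-free push of this element onto the global signal list;
 * the CAS loop retries if another thread races the list head.
 */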
do {
old_head = blockif_bse_head;
bse.bse_next = old_head;
} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
(uintptr_t)old_head,
(uintptr_t)&bse));
pthread_kill(be->be_tid, SIGCONT);
pthread_mutex_lock(&bse.bse_mtx);
while (bse.bse_pending)
pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
pthread_mutex_unlock(&bse.bse_mtx);
}
pthread_mutex_unlock(&bc->bc_mtx);
/*
* The processing thread has been interrupted. Since it's not
* clear if the callback has been invoked yet, return EBUSY.
*/
return (EBUSY);
}
int
blockif_close(struct blockif_ctxt *bc)
{
void *jval;
int err, i;
err = 0;
assert(bc->bc_magic == BLOCKIF_SIG);
/*
* Stop the block i/o thread
*/
pthread_mutex_lock(&bc->bc_mtx);
bc->bc_closing = 1;
pthread_mutex_unlock(&bc->bc_mtx);
pthread_cond_broadcast(&bc->bc_cond);
for (i = 0; i < BLOCKIF_NUMTHR; i++)
pthread_join(bc->bc_btid[i], &jval);
/* XXX Cancel queued i/o's ??? */
/*
* Release resources
*/
bc->bc_magic = 0;
close(bc->bc_fd);
free(bc);
return (0);
}
/*
* Return virtual C/H/S values for a given block. Use the algorithm
* outlined in the VHD specification to calculate values.
*/
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
off_t sectors; /* total sectors of the block dev */
off_t hcyl; /* cylinders times heads */
uint16_t secpt; /* sectors per track */
uint8_t heads;
assert(bc->bc_magic == BLOCKIF_SIG);
sectors = bc->bc_size / bc->bc_sectsz;
/* Clamp the size to the largest possible with CHS */
if (sectors > 65535UL*16*255)
sectors = 65535UL*16*255;
if (sectors >= 65536UL*16*63) {
secpt = 255;
heads = 16;
hcyl = sectors / secpt;
} else {
secpt = 17;
hcyl = sectors / secpt;
heads = (hcyl + 1023) / 1024;
if (heads < 4)
heads = 4;
if (hcyl >= (heads * 1024) || heads > 16) {
secpt = 31;
heads = 16;
hcyl = sectors / secpt;
}
if (hcyl >= (heads * 1024)) {
secpt = 63;
heads = 16;
hcyl = sectors / secpt;
}
}
*c = hcyl / heads;
*h = heads;
*s = secpt;
}
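/*
 * Worked example (editor's note, not in the original source): a 1 GiB
 * image with 512-byte sectors holds 2097152 sectors. The initial
 * 17 sectors/track guess would need more than 16 heads, so both
 * fallbacks fire and the result is C/H/S = 2080/16/63
 * (2080 * 16 * 63 = 2096640 addressable sectors).
 */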
/*
* Accessors
*/
off_t
blockif_size(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_size);
}
int
blockif_sectsz(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_sectsz);
}
void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
assert(bc->bc_magic == BLOCKIF_SIG);
*size = bc->bc_psectsz;
*off = bc->bc_psectoff;
}
int
blockif_queuesz(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (BLOCKIF_MAXREQ - 1);
}
int
blockif_is_ro(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_rdonly);
}
int
blockif_candelete(struct blockif_ctxt *bc)
{
assert(bc->bc_magic == BLOCKIF_SIG);
return (bc->bc_candelete);
}

70
bhyve/block_if.h Normal file
View file

@@ -0,0 +1,70 @@
/*-
* Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* The block API to be used by bhyve block-device emulations. The routines
* are thread safe, with no assumptions about the context of the completion
* callback - it may occur in the caller's context, or asynchronously in
* another thread.
*/
#ifndef _BLOCK_IF_H_
#define _BLOCK_IF_H_
#include <sys/uio.h>
#include <sys/unistd.h>
#define BLOCKIF_IOV_MAX 33 /* not practical to be IOV_MAX */
struct blockif_req {
struct iovec br_iov[BLOCKIF_IOV_MAX];
int br_iovcnt;
off_t br_offset;
ssize_t br_resid;
void (*br_callback)(struct blockif_req *req, int err);
void *br_param;
};
struct blockif_ctxt;
struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
off_t blockif_size(struct blockif_ctxt *bc);
void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
uint8_t *s);
int blockif_sectsz(struct blockif_ctxt *bc);
void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off);
int blockif_queuesz(struct blockif_ctxt *bc);
int blockif_is_ro(struct blockif_ctxt *bc);
int blockif_candelete(struct blockif_ctxt *bc);
int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
int blockif_close(struct blockif_ctxt *bc);
#endif /* _BLOCK_IF_H_ */
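The completion-context caveat in the comment above matters in practice: a
device model must not assume the callback runs on its own thread. A minimal
sketch of a synchronous read built on this API (editor's illustration, not
part of the commit; sync_wait, sync_done, and sync_read_sector are
hypothetical names):

#include <pthread.h>
#include <string.h>

#include "block_if.h"

struct sync_wait {
	pthread_mutex_t mtx;
	pthread_cond_t cv;
	int done;
	int err;
};

/*
 * Completion callback: it may run in the submitter's context or in a
 * blockif worker thread, so it only touches its own private state.
 */
static void
sync_done(struct blockif_req *req, int err)
{
	struct sync_wait *sw = req->br_param;

	pthread_mutex_lock(&sw->mtx);
	sw->err = err;
	sw->done = 1;
	pthread_cond_signal(&sw->cv);
	pthread_mutex_unlock(&sw->mtx);
}

static int
sync_read_sector(struct blockif_ctxt *bc, void *buf)
{
	struct blockif_req req;
	struct sync_wait sw;

	pthread_mutex_init(&sw.mtx, NULL);
	pthread_cond_init(&sw.cv, NULL);
	sw.done = 0;

	memset(&req, 0, sizeof(req));
	req.br_iov[0].iov_base = buf;
	req.br_iov[0].iov_len = blockif_sectsz(bc);
	req.br_iovcnt = 1;
	req.br_offset = 0;
	req.br_callback = sync_done;
	req.br_param = &sw;

	if (blockif_read(bc, &req) != 0)	/* E2BIG: queue full */
		return (-1);

	pthread_mutex_lock(&sw.mtx);
	while (!sw.done)
		pthread_cond_wait(&sw.cv, &sw.mtx);
	pthread_mutex_unlock(&sw.mtx);

	return (sw.err == 0 ? 0 : -1);
}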

153
bhyve/consport.c Normal file
View file

@@ -0,0 +1,153 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/select.h>
#include <stdio.h>
#include <stdlib.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include "inout.h"
#include "pci_lpc.h"
#define BVM_CONSOLE_PORT 0x220
#define BVM_CONS_SIG ('b' << 8 | 'v')
static struct termios tio_orig, tio_new;
static void
ttyclose(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
}
static void
ttyopen(void)
{
tcgetattr(STDIN_FILENO, &tio_orig);
cfmakeraw(&tio_new);
tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
atexit(ttyclose);
}
static bool
tty_char_available(void)
{
fd_set rfds;
struct timeval tv;
FD_ZERO(&rfds);
FD_SET(STDIN_FILENO, &rfds);
tv.tv_sec = 0;
tv.tv_usec = 0;
if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
return (true);
} else {
return (false);
}
}
static int
ttyread(void)
{
char rb;
if (tty_char_available()) {
read(STDIN_FILENO, &rb, 1);
return (rb & 0xff);
} else {
return (-1);
}
}
static void
ttywrite(unsigned char wb)
{
(void) write(STDOUT_FILENO, &wb, 1);
}
static int
console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
static int opened;
if (bytes == 2 && in) {
*eax = BVM_CONS_SIG;
return (0);
}
/*
* Guests might probe this port to look for old ISA devices
* using single-byte reads. Return 0xff for those.
*/
if (bytes == 1 && in) {
*eax = 0xff;
return (0);
}
if (bytes != 4)
return (-1);
if (!opened) {
ttyopen();
opened = 1;
}
if (in)
*eax = ttyread();
else
ttywrite(*eax);
return (0);
}
SYSRES_IO(BVM_CONSOLE_PORT, 4);
static struct inout_port consport = {
"bvmcons",
BVM_CONSOLE_PORT,
1,
IOPORT_F_INOUT,
console_handler
};
void
init_bvmcons(void)
{
register_inout(&consport);
}

142
bhyve/dbgport.c Normal file
View file

@@ -0,0 +1,142 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include "inout.h"
#include "dbgport.h"
#include "pci_lpc.h"
#define BVM_DBG_PORT 0x224
#define BVM_DBG_SIG ('B' << 8 | 'V')
static int listen_fd, conn_fd;
static struct sockaddr_in sin;
static int
dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
char ch;
int nwritten, nread, printonce;
if (bytes == 2 && in) {
*eax = BVM_DBG_SIG;
return (0);
}
if (bytes != 4)
return (-1);
again:
printonce = 0;
while (conn_fd < 0) {
if (!printonce) {
printf("Waiting for connection from gdb\r\n");
printonce = 1;
}
conn_fd = accept(listen_fd, NULL, NULL);
if (conn_fd >= 0)
fcntl(conn_fd, F_SETFL, O_NONBLOCK);
else if (errno != EINTR)
perror("accept");
}
if (in) {
nread = read(conn_fd, &ch, 1);
if (nread == -1 && errno == EAGAIN)
*eax = -1;
else if (nread == 1)
*eax = ch;
else {
close(conn_fd);
conn_fd = -1;
goto again;
}
} else {
ch = *eax;
nwritten = write(conn_fd, &ch, 1);
if (nwritten != 1) {
close(conn_fd);
conn_fd = -1;
goto again;
}
}
return (0);
}
static struct inout_port dbgport = {
"bvmdbg",
BVM_DBG_PORT,
1,
IOPORT_F_INOUT,
dbg_handler
};
SYSRES_IO(BVM_DBG_PORT, 4);
void
init_dbgport(int sport)
{
conn_fd = -1;
if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(sport);
if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(listen_fd, 1) < 0) {
perror("listen");
exit(1);
}
register_inout(&dbgport);
}

34
bhyve/dbgport.h Normal file
View file

@@ -0,0 +1,34 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _DBGPORT_H_
#define _DBGPORT_H_
void init_dbgport(int port);
#endif

297
bhyve/inout.c Normal file
View file

@@ -0,0 +1,297 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/_iovec.h>
#include <sys/mman.h>
#include <x86/psl.h>
#include <x86/segments.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <vmmapi.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "bhyverun.h"
#include "inout.h"
SET_DECLARE(inout_port_set, struct inout_port);
#define MAX_IOPORTS (1 << 16)
#define VERIFY_IOPORT(port, size) \
assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS)
static struct {
const char *name;
int flags;
inout_func_t handler;
void *arg;
} inout_handlers[MAX_IOPORTS];
static int
default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (in) {
switch (bytes) {
case 4:
*eax = 0xffffffff;
break;
case 2:
*eax = 0xffff;
break;
case 1:
*eax = 0xff;
break;
}
}
return (0);
}
static void
register_default_iohandler(int start, int size)
{
struct inout_port iop;
VERIFY_IOPORT(start, size);
bzero(&iop, sizeof(iop));
iop.name = "default";
iop.port = start;
iop.size = size;
iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT;
iop.handler = default_inout;
register_inout(&iop);
}
int
emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
{
int addrsize, bytes, flags, in, port, prot, rep;
uint32_t eax, val;
inout_func_t handler;
void *arg;
int error, fault, retval;
enum vm_reg_name idxreg;
uint64_t gla, index, iterations, count;
struct vm_inout_str *vis;
struct iovec iov[2];
bytes = vmexit->u.inout.bytes;
in = vmexit->u.inout.in;
port = vmexit->u.inout.port;
assert(port < MAX_IOPORTS);
assert(bytes == 1 || bytes == 2 || bytes == 4);
handler = inout_handlers[port].handler;
if (strict && handler == default_inout)
return (-1);
flags = inout_handlers[port].flags;
arg = inout_handlers[port].arg;
if (in) {
if (!(flags & IOPORT_F_IN))
return (-1);
} else {
if (!(flags & IOPORT_F_OUT))
return (-1);
}
retval = 0;
if (vmexit->u.inout.string) {
vis = &vmexit->u.inout_str;
rep = vis->inout.rep;
addrsize = vis->addrsize;
prot = in ? PROT_WRITE : PROT_READ;
assert(addrsize == 2 || addrsize == 4 || addrsize == 8);
/* Index register */
idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
index = vis->index & vie_size2mask(addrsize);
/* Count register */
count = vis->count & vie_size2mask(addrsize);
/* Limit number of back-to-back in/out emulations to 16 */
iterations = MIN(count, 16);
while (iterations > 0) {
assert(retval == 0);
if (vie_calculate_gla(vis->paging.cpu_mode,
vis->seg_name, &vis->seg_desc, index, bytes,
addrsize, prot, &gla)) {
vm_inject_gp(ctx, vcpu);
break;
}
error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
bytes, prot, iov, nitems(iov), &fault);
if (error) {
retval = -1; /* Unrecoverable error */
break;
} else if (fault) {
retval = 0; /* Resume guest to handle fault */
break;
}
if (vie_alignment_check(vis->paging.cpl, bytes,
vis->cr0, vis->rflags, gla)) {
vm_inject_ac(ctx, vcpu, 0);
break;
}
val = 0;
if (!in)
vm_copyin(ctx, vcpu, iov, &val, bytes);
retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
if (retval != 0)
break;
if (in)
vm_copyout(ctx, vcpu, &val, iov, bytes);
/* Update index */
if (vis->rflags & PSL_D)
index -= bytes;
else
index += bytes;
count--;
iterations--;
}
/* Update index register */
error = vie_update_register(ctx, vcpu, idxreg, index, addrsize);
assert(error == 0);
/*
* Update count register only if the instruction had a repeat
* prefix.
*/
if (rep) {
error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX,
count, addrsize);
assert(error == 0);
}
/* Restart the instruction if more iterations remain */
if (retval == 0 && count != 0) {
error = vm_restart_instruction(ctx, vcpu);
assert(error == 0);
}
} else {
eax = vmexit->u.inout.eax;
val = eax & vie_size2mask(bytes);
retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
if (retval == 0 && in) {
eax &= ~vie_size2mask(bytes);
eax |= val & vie_size2mask(bytes);
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
eax);
assert(error == 0);
}
}
return (retval);
}
void
init_inout(void)
{
struct inout_port **iopp, *iop;
/*
* Set up the default handler for all ports
*/
register_default_iohandler(0, MAX_IOPORTS);
/*
* Overwrite with specified handlers
*/
SET_FOREACH(iopp, inout_port_set) {
iop = *iopp;
assert(iop->port < MAX_IOPORTS);
inout_handlers[iop->port].name = iop->name;
inout_handlers[iop->port].flags = iop->flags;
inout_handlers[iop->port].handler = iop->handler;
inout_handlers[iop->port].arg = NULL;
}
}
int
register_inout(struct inout_port *iop)
{
int i;
VERIFY_IOPORT(iop->port, iop->size);
/*
* Verify that the new registration is not overwriting an already
* allocated i/o range.
*/
if ((iop->flags & IOPORT_F_DEFAULT) == 0) {
for (i = iop->port; i < iop->port + iop->size; i++) {
if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0)
return (-1);
}
}
for (i = iop->port; i < iop->port + iop->size; i++) {
inout_handlers[i].name = iop->name;
inout_handlers[i].flags = iop->flags;
inout_handlers[i].handler = iop->handler;
inout_handlers[i].arg = iop->arg;
}
return (0);
}
int
unregister_inout(struct inout_port *iop)
{
VERIFY_IOPORT(iop->port, iop->size);
assert(inout_handlers[iop->port].name == iop->name);
register_default_iohandler(iop->port, iop->size);
return (0);
}

79
bhyve/inout.h Normal file
View file

@@ -0,0 +1,79 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _INOUT_H_
#define _INOUT_H_
#include <sys/linker_set.h>
struct vmctx;
struct vm_exit;
/*
* inout emulation handlers return 0 on success and -1 on failure.
*/
typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
int bytes, uint32_t *eax, void *arg);
struct inout_port {
const char *name;
int port;
int size;
int flags;
inout_func_t handler;
void *arg;
};
#define IOPORT_F_IN 0x1
#define IOPORT_F_OUT 0x2
#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT)
/*
* The following flags are used internally and must not be used by
* device models.
*/
#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */
#define INOUT_PORT(name, port, flags, handler) \
static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
#name, \
(port), \
1, \
(flags), \
(handler), \
0 \
}; \
DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
void init_inout(void);
int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit,
int strict);
int register_inout(struct inout_port *iop);
int unregister_inout(struct inout_port *iop);
void init_bvmcons(void);
#endif /* _INOUT_H_ */
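A sketch of how a device model claims a port with the linker-set macro above
(editor's illustration, not part of the commit; the port number 0x300 and the
"scratch" names are hypothetical):

#include <stdint.h>

#include "inout.h"

/* Hypothetical read-only scratch port. */
static int
scratch_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
    uint32_t *eax, void *arg)
{
	if (!in)
		return (-1);	/* reject writes */
	*eax = 0x42;		/* arbitrary value */
	return (0);
}
INOUT_PORT(scratch, 0x300, IOPORT_F_IN, scratch_handler);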

74
bhyve/ioapic.c Normal file
View file

@@ -0,0 +1,74 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "ioapic.h"
/*
* Assign PCI INTx interrupts to I/O APIC pins in a round-robin
* fashion. Note that we have no idea what the HPET is using, but the
* HPET is also programmable whereas this is intended for hardwired
* PCI interrupts.
*
* This assumes a single I/O APIC where pins >= 16 are permitted for
* PCI devices.
*/
static int pci_pins;
void
ioapic_init(struct vmctx *ctx)
{
if (vm_ioapic_pincount(ctx, &pci_pins) < 0) {
pci_pins = 0;
return;
}
/* Ignore the first 16 pins. */
if (pci_pins <= 16) {
pci_pins = 0;
return;
}
pci_pins -= 16;
}
int
ioapic_pci_alloc_irq(void)
{
static int last_pin;
if (pci_pins == 0)
return (-1);
return (16 + (last_pin++ % pci_pins));
}

39
bhyve/ioapic.h Normal file
View file

@@ -0,0 +1,39 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IOAPIC_H_
#define _IOAPIC_H_
/*
* Allocate a PCI IRQ from the I/O APIC.
*/
void ioapic_init(struct vmctx *ctx);
int ioapic_pci_alloc_irq(void);
#endif

291
bhyve/mem.c Normal file
View file

@@ -0,0 +1,291 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Memory ranges are represented with an RB tree. On insertion, the range
* is checked for overlaps. On lookup, the key has the same base and limit
* so it can be searched within the range.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/tree.h>
#include <sys/errno.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <pthread.h>
#include "mem.h"
struct mmio_rb_range {
RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
struct mem_range mr_param;
uint64_t mr_base;
uint64_t mr_end;
};
struct mmio_rb_tree;
RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback;
/*
* Per-vCPU cache. Since most accesses from a vCPU will be to
* consecutive addresses in a range, it makes sense to cache the
* result of a lookup.
*/
static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
static pthread_rwlock_t mmio_rwlock;
static int
mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
{
if (a->mr_end < b->mr_base)
return (-1);
else if (a->mr_base > b->mr_end)
return (1);
return (0);
}
static int
mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr,
struct mmio_rb_range **entry)
{
struct mmio_rb_range find, *res;
find.mr_base = find.mr_end = addr;
res = RB_FIND(mmio_rb_tree, rbt, &find);
if (res != NULL) {
*entry = res;
return (0);
}
return (ENOENT);
}
static int
mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new)
{
struct mmio_rb_range *overlap;
overlap = RB_INSERT(mmio_rb_tree, rbt, new);
if (overlap != NULL) {
#ifdef RB_DEBUG
printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
new->mr_base, new->mr_end,
overlap->mr_base, overlap->mr_end);
#endif
return (EEXIST);
}
return (0);
}
#if 0
static void
mmio_rb_dump(struct mmio_rb_tree *rbt)
{
struct mmio_rb_range *np;
pthread_rwlock_rdlock(&mmio_rwlock);
RB_FOREACH(np, mmio_rb_tree, rbt) {
printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
np->mr_param.name);
}
pthread_rwlock_unlock(&mmio_rwlock);
}
#endif
RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
static int
mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
{
int error;
struct mem_range *mr = arg;
error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
rval, mr->arg1, mr->arg2);
return (error);
}
static int
mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
{
int error;
struct mem_range *mr = arg;
error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
&wval, mr->arg1, mr->arg2);
return (error);
}
int
emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
struct vm_guest_paging *paging)
{
struct mmio_rb_range *entry;
int err, immutable;
pthread_rwlock_rdlock(&mmio_rwlock);
/*
* First check the per-vCPU cache
*/
if (mmio_hint[vcpu] &&
paddr >= mmio_hint[vcpu]->mr_base &&
paddr <= mmio_hint[vcpu]->mr_end) {
entry = mmio_hint[vcpu];
} else
entry = NULL;
if (entry == NULL) {
if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) {
/* Update the per-vCPU cache */
mmio_hint[vcpu] = entry;
} else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
pthread_rwlock_unlock(&mmio_rwlock);
return (ESRCH);
}
}
assert(entry != NULL);
/*
* An 'immutable' memory range is guaranteed to be never removed
* so there is no need to hold 'mmio_rwlock' while calling the
* handler.
*
* XXX writes to the PCIR_COMMAND register can cause register_mem()
* to be called. If the guest is using PCI extended config space
* to modify the PCIR_COMMAND register then register_mem() can
* deadlock on 'mmio_rwlock'. However by registering the extended
* config space window as 'immutable' the deadlock can be avoided.
*/
immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
if (immutable)
pthread_rwlock_unlock(&mmio_rwlock);
err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
mem_read, mem_write, &entry->mr_param);
if (!immutable)
pthread_rwlock_unlock(&mmio_rwlock);
return (err);
}
static int
register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
{
struct mmio_rb_range *entry, *mrp;
int err;
err = 0;
mrp = malloc(sizeof(struct mmio_rb_range));
if (mrp != NULL) {
mrp->mr_param = *memp;
mrp->mr_base = memp->base;
mrp->mr_end = memp->base + memp->size - 1;
pthread_rwlock_wrlock(&mmio_rwlock);
if (mmio_rb_lookup(rbt, memp->base, &entry) != 0)
err = mmio_rb_add(rbt, mrp);
pthread_rwlock_unlock(&mmio_rwlock);
if (err)
free(mrp);
} else
err = ENOMEM;
return (err);
}
int
register_mem(struct mem_range *memp)
{
return (register_mem_int(&mmio_rb_root, memp));
}
int
register_mem_fallback(struct mem_range *memp)
{
return (register_mem_int(&mmio_rb_fallback, memp));
}
int
unregister_mem(struct mem_range *memp)
{
struct mem_range *mr;
struct mmio_rb_range *entry = NULL;
int err, i;
pthread_rwlock_wrlock(&mmio_rwlock);
err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry);
if (err == 0) {
mr = &entry->mr_param;
assert(mr->name == memp->name);
assert(mr->base == memp->base && mr->size == memp->size);
assert((mr->flags & MEM_F_IMMUTABLE) == 0);
RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
/* Flush the per-vCPU cache */
for (i = 0; i < VM_MAXCPU; i++) {
if (mmio_hint[i] == entry)
mmio_hint[i] = NULL;
}
}
pthread_rwlock_unlock(&mmio_rwlock);
if (entry)
free(entry);
return (err);
}
void
init_mem(void)
{
RB_INIT(&mmio_rb_root);
RB_INIT(&mmio_rb_fallback);
pthread_rwlock_init(&mmio_rwlock, NULL);
}

61
bhyve/mem.h Normal file
View file

@@ -0,0 +1,61 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEM_H_
#define _MEM_H_
#include <sys/linker_set.h>
struct vmctx;
typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
int size, uint64_t *val, void *arg1, long arg2);
struct mem_range {
const char *name;
int flags;
mem_func_t handler;
void *arg1;
long arg2;
uint64_t base;
uint64_t size;
};
#define MEM_F_READ 0x1
#define MEM_F_WRITE 0x2
#define MEM_F_RW 0x3
#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */
void init_mem(void);
int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,
struct vm_guest_paging *paging);
int register_mem(struct mem_range *memp);
int register_mem_fallback(struct mem_range *memp);
int unregister_mem(struct mem_range *memp);
#endif /* _MEM_H_ */
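A sketch of registering an MMIO range against this API (editor's
illustration, not part of the commit; the base address and the "regs" names
are hypothetical). register_mem() fails with EEXIST if the range overlaps an
existing registration:

#include <stdint.h>
#include <string.h>

#include "mem.h"

/* Hypothetical 4 KB register window: reads as zero, writes ignored. */
static int
regs_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
    int size, uint64_t *val, void *arg1, long arg2)
{
	if (dir == MEM_F_READ)
		*val = 0;
	return (0);
}

static int
regs_attach(void)
{
	struct mem_range mr;

	memset(&mr, 0, sizeof(mr));
	mr.name = "regs";
	mr.flags = MEM_F_RW;
	mr.handler = regs_handler;
	mr.base = 0xd0000000UL;
	mr.size = 4096;

	return (register_mem(&mr));
}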

456
bhyve/mevent.c Normal file
View file

@@ -0,0 +1,456 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Micro event library for FreeBSD, designed for a single i/o thread
* using kqueue, and having events be persistent by default.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <pthread.h>
#include <pthread_np.h>
#include "mevent.h"
#define MEVENT_MAX 64
#define MEV_ADD 1
#define MEV_ENABLE 2
#define MEV_DISABLE 3
#define MEV_DEL_PENDING 4
extern char *vmname;
static pthread_t mevent_tid;
static int mevent_timid = 43;
static int mevent_pipefd[2];
static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER;
struct mevent {
void (*me_func)(int, enum ev_type, void *);
#define me_msecs me_fd
int me_fd;
int me_timid;
enum ev_type me_type;
void *me_param;
int me_cq;
int me_state;
int me_closefd;
LIST_ENTRY(mevent) me_list;
};
static LIST_HEAD(listhead, mevent) global_head, change_head;
static void
mevent_qlock(void)
{
pthread_mutex_lock(&mevent_lmutex);
}
static void
mevent_qunlock(void)
{
pthread_mutex_unlock(&mevent_lmutex);
}
static void
mevent_pipe_read(int fd, enum ev_type type, void *param)
{
char buf[MEVENT_MAX];
int status;
/*
* Drain the pipe read side. The fd is non-blocking so this is
* safe to do.
*/
do {
status = read(fd, buf, sizeof(buf));
} while (status == MEVENT_MAX);
}
static void
mevent_notify(void)
{
char c;
/*
* If calling from outside the i/o thread, write a byte on the
* pipe to force the i/o thread to exit the blocking kevent call.
*/
if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) {
write(mevent_pipefd[1], &c, 1);
}
}
static int
mevent_kq_filter(struct mevent *mevp)
{
int retval;
retval = 0;
if (mevp->me_type == EVF_READ)
retval = EVFILT_READ;
if (mevp->me_type == EVF_WRITE)
retval = EVFILT_WRITE;
if (mevp->me_type == EVF_TIMER)
retval = EVFILT_TIMER;
if (mevp->me_type == EVF_SIGNAL)
retval = EVFILT_SIGNAL;
return (retval);
}
static int
mevent_kq_flags(struct mevent *mevp)
{
int ret;
switch (mevp->me_state) {
case MEV_ADD:
ret = EV_ADD; /* implicitly enabled */
break;
case MEV_ENABLE:
ret = EV_ENABLE;
break;
case MEV_DISABLE:
ret = EV_DISABLE;
break;
case MEV_DEL_PENDING:
ret = EV_DELETE;
break;
default:
assert(0);
break;
}
return (ret);
}
static int
mevent_kq_fflags(struct mevent *mevp)
{
/* XXX nothing yet, perhaps EV_EOF for reads ? */
return (0);
}
static int
mevent_build(int mfd, struct kevent *kev)
{
struct mevent *mevp, *tmpp;
int i;
i = 0;
mevent_qlock();
LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) {
if (mevp->me_closefd) {
/*
* A close of the file descriptor will remove the
* event
*/
close(mevp->me_fd);
} else {
if (mevp->me_type == EVF_TIMER) {
kev[i].ident = mevp->me_timid;
kev[i].data = mevp->me_msecs;
} else {
kev[i].ident = mevp->me_fd;
kev[i].data = 0;
}
kev[i].filter = mevent_kq_filter(mevp);
kev[i].flags = mevent_kq_flags(mevp);
kev[i].fflags = mevent_kq_fflags(mevp);
kev[i].udata = mevp;
i++;
}
mevp->me_cq = 0;
LIST_REMOVE(mevp, me_list);
if (mevp->me_state == MEV_DEL_PENDING) {
free(mevp);
} else {
LIST_INSERT_HEAD(&global_head, mevp, me_list);
}
assert(i < MEVENT_MAX);
}
mevent_qunlock();
return (i);
}
static void
mevent_handle(struct kevent *kev, int numev)
{
struct mevent *mevp;
int i;
for (i = 0; i < numev; i++) {
mevp = kev[i].udata;
/* XXX check for EV_ERROR ? */
(*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param);
}
}
struct mevent *
mevent_add(int tfd, enum ev_type type,
void (*func)(int, enum ev_type, void *), void *param)
{
struct mevent *lp, *mevp;
if (tfd < 0 || func == NULL) {
return (NULL);
}
mevp = NULL;
mevent_qlock();
/*
* Verify that the fd/type tuple is not present in any list
*/
LIST_FOREACH(lp, &global_head, me_list) {
if (type != EVF_TIMER && lp->me_fd == tfd &&
lp->me_type == type) {
goto exit;
}
}
LIST_FOREACH(lp, &change_head, me_list) {
if (type != EVF_TIMER && lp->me_fd == tfd &&
lp->me_type == type) {
goto exit;
}
}
/*
* Allocate an entry, populate it, and add it to the change list.
*/
mevp = calloc(1, sizeof(struct mevent));
if (mevp == NULL) {
goto exit;
}
if (type == EVF_TIMER) {
mevp->me_msecs = tfd;
mevp->me_timid = mevent_timid++;
} else
mevp->me_fd = tfd;
mevp->me_type = type;
mevp->me_func = func;
mevp->me_param = param;
LIST_INSERT_HEAD(&change_head, mevp, me_list);
mevp->me_cq = 1;
mevp->me_state = MEV_ADD;
mevent_notify();
exit:
mevent_qunlock();
return (mevp);
}
static int
mevent_update(struct mevent *evp, int newstate)
{
/*
* It's not possible to enable/disable a deleted event
*/
if (evp->me_state == MEV_DEL_PENDING)
return (EINVAL);
/*
* No update needed if state isn't changing
*/
if (evp->me_state == newstate)
return (0);
mevent_qlock();
evp->me_state = newstate;
/*
* Place the entry onto the changed list if not already there.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
mevent_qunlock();
return (0);
}
int
mevent_enable(struct mevent *evp)
{
return (mevent_update(evp, MEV_ENABLE));
}
int
mevent_disable(struct mevent *evp)
{
return (mevent_update(evp, MEV_DISABLE));
}
static int
mevent_delete_event(struct mevent *evp, int closefd)
{
mevent_qlock();
/*
* Place the entry onto the changed list if not already there, and
* mark as to be deleted.
*/
if (evp->me_cq == 0) {
evp->me_cq = 1;
LIST_REMOVE(evp, me_list);
LIST_INSERT_HEAD(&change_head, evp, me_list);
mevent_notify();
}
evp->me_state = MEV_DEL_PENDING;
if (closefd)
evp->me_closefd = 1;
mevent_qunlock();
return (0);
}
int
mevent_delete(struct mevent *evp)
{
return (mevent_delete_event(evp, 0));
}
int
mevent_delete_close(struct mevent *evp)
{
return (mevent_delete_event(evp, 1));
}
static void
mevent_set_name(void)
{
pthread_set_name_np(mevent_tid, "mevent");
}
void
mevent_dispatch(void)
{
struct kevent changelist[MEVENT_MAX];
struct kevent eventlist[MEVENT_MAX];
struct mevent *pipev;
int mfd;
int numev;
int ret;
mevent_tid = pthread_self();
mevent_set_name();
mfd = kqueue();
assert(mfd > 0);
/*
* Open the pipe that will be used for other threads to force
* the blocking kqueue call to exit by writing to it. Set the
* descriptor to non-blocking.
*/
ret = pipe(mevent_pipefd);
if (ret < 0) {
perror("pipe");
exit(1);
}
/*
* Add internal event handler for the pipe write fd
*/
pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL);
assert(pipev != NULL);
for (;;) {
/*
* Build changelist if required.
* XXX the changelist can be put into the blocking call
* to eliminate the extra syscall. Currently better for
* debug.
*/
numev = mevent_build(mfd, changelist);
if (numev) {
ret = kevent(mfd, changelist, numev, NULL, 0, NULL);
if (ret == -1) {
perror("Error return from kevent change");
}
}
/*
* Block awaiting events
*/
ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL);
if (ret == -1 && errno != EINTR) {
perror("Error return from kevent monitor");
}
/*
* Handle reported events
*/
mevent_handle(eventlist, ret);
}
}

51
bhyve/mevent.h Normal file
View file

@@ -0,0 +1,51 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MEVENT_H_
#define _MEVENT_H_
enum ev_type {
EVF_READ,
EVF_WRITE,
EVF_TIMER,
EVF_SIGNAL
};
struct mevent;
struct mevent *mevent_add(int fd, enum ev_type type,
void (*func)(int, enum ev_type, void *),
void *param);
int mevent_enable(struct mevent *evp);
int mevent_disable(struct mevent *evp);
int mevent_delete(struct mevent *evp);
int mevent_delete_close(struct mevent *evp);
void mevent_dispatch(void);
#endif /* _MEVENT_H_ */
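A minimal sketch of the intended usage (editor's illustration, not part of
the commit; stdin_ready and watch_stdin are hypothetical names); mevent_test.c
below is the fuller demonstration:

#include <unistd.h>

#include "mevent.h"

/*
 * Events are persistent by default: this callback fires every time
 * stdin becomes readable until the event is deleted.
 */
static void
stdin_ready(int fd, enum ev_type type, void *param)
{
	char c;

	(void) read(fd, &c, 1);
}

static void
watch_stdin(void)
{
	(void) mevent_add(STDIN_FILENO, EVF_READ, stdin_ready, NULL);
	mevent_dispatch();	/* runs the kqueue loop; never returns */
}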

256
bhyve/mevent_test.c Normal file
View file

@@ -0,0 +1,256 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Test program for the micro event library. Set up a simple TCP echo
* service.
*
* cc mevent_test.c mevent.c -lpthread
*/
#include <sys/types.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <machine/cpufunc.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#include "mevent.h"
#define TEST_PORT 4321
static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER;
static struct mevent *tevp;
char *vmname = "test vm";
#define MEVENT_ECHO
/* Number of timer events to capture */
#define TEVSZ 4096
uint64_t tevbuf[TEVSZ];
static void
timer_print(void)
{
uint64_t min, max, diff, sum, tsc_freq;
size_t len;
int j;
min = UINT64_MAX;
max = 0;
sum = 0;
len = sizeof(tsc_freq);
sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0);
for (j = 1; j < TEVSZ; j++) {
/* Convert a tsc diff into microseconds */
diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq;
sum += diff;
if (min > diff)
min = diff;
if (max < diff)
max = diff;
}
printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max,
sum/(TEVSZ - 1));
}
static void
timer_callback(int fd, enum ev_type type, void *param)
{
static int i;
if (i >= TEVSZ)
abort();
tevbuf[i++] = rdtsc();
if (i == TEVSZ) {
mevent_delete(tevp);
timer_print();
}
}
#ifdef MEVENT_ECHO
struct esync {
pthread_mutex_t e_mt;
pthread_cond_t e_cond;
};
static void
echoer_callback(int fd, enum ev_type type, void *param)
{
struct esync *sync = param;
pthread_mutex_lock(&sync->e_mt);
pthread_cond_signal(&sync->e_cond);
pthread_mutex_unlock(&sync->e_mt);
}
static void *
echoer(void *param)
{
struct esync sync;
struct mevent *mev;
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
pthread_mutex_init(&sync.e_mt, NULL);
pthread_cond_init(&sync.e_cond, NULL);
pthread_mutex_lock(&sync.e_mt);
mev = mevent_add(fd, EVF_READ, echoer_callback, &sync);
if (mev == NULL) {
printf("Could not allocate echoer event\n");
exit(1);
}
while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) {
len = read(fd, buf, sizeof(buf));
if (len > 0) {
write(fd, buf, len);
write(0, buf, len);
} else {
break;
}
}
mevent_delete_close(mev);
pthread_mutex_unlock(&sync.e_mt);
pthread_mutex_destroy(&sync.e_mt);
pthread_cond_destroy(&sync.e_cond);
return (NULL);
}
#else
static void *
echoer(void *param)
{
char buf[128];
int fd = (int)(uintptr_t) param;
int len;
while ((len = read(fd, buf, sizeof(buf))) > 0) {
write(1, buf, len);
}
return (NULL);
}
#endif /* MEVENT_ECHO */
static void
acceptor_callback(int fd, enum ev_type type, void *param)
{
pthread_mutex_lock(&accept_mutex);
pthread_cond_signal(&accept_condvar);
pthread_mutex_unlock(&accept_mutex);
}
static void *
acceptor(void *param)
{
struct sockaddr_in sin;
pthread_t tid;
int news;
int s;
if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
perror("socket");
exit(1);
}
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
sin.sin_port = htons(TEST_PORT);
if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
perror("bind");
exit(1);
}
if (listen(s, 1) < 0) {
perror("listen");
exit(1);
}
(void) mevent_add(s, EVF_READ, acceptor_callback, NULL);
pthread_mutex_lock(&accept_mutex);
while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) {
news = accept(s, NULL, NULL);
if (news < 0) {
perror("accept error");
} else {
static int first = 1;
if (first) {
/*
* Start a timer
*/
first = 0;
tevp = mevent_add(1, EVF_TIMER, timer_callback,
NULL);
}
printf("incoming connection, spawning thread\n");
pthread_create(&tid, NULL, echoer,
(void *)(uintptr_t)news);
}
}
return (NULL);
}
int
main(void)
{
pthread_t tid;
pthread_create(&tid, NULL, acceptor, NULL);
mevent_dispatch();
}

377
bhyve/mptbl.c Normal file
View file

@@ -0,0 +1,377 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/errno.h>
#include <x86/mptable.h>
#include <stdio.h>
#include <string.h>
#include "acpi.h"
#include "bhyverun.h"
#include "mptbl.h"
#include "pci_emul.h"
#define MPTABLE_BASE 0xF0000
/* floating pointer length + maximum length of configuration table */
#define MPTABLE_MAX_LENGTH (65536 + 16)
#define LAPIC_PADDR 0xFEE00000
#define LAPIC_VERSION 16
#define IOAPIC_PADDR 0xFEC00000
#define IOAPIC_VERSION 0x11
#define MP_SPECREV 4
#define MPFP_SIG "_MP_"
/* Configuration header defines */
#define MPCH_SIG "PCMP"
#define MPCH_OEMID "BHyVe "
#define MPCH_OEMID_LEN 8
#define MPCH_PRODID "Hypervisor "
#define MPCH_PRODID_LEN 12
/* Processor entry defines */
#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
#define MPEP_SIG_MODEL 26
#define MPEP_SIG_STEPPING 5
#define MPEP_SIG \
((MPEP_SIG_FAMILY << 8) | \
(MPEP_SIG_MODEL << 4) | \
(MPEP_SIG_STEPPING))
#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
/* Number of local intr entries */
#define MPEII_NUM_LOCAL_IRQ 2
/* Bus entry defines */
#define MPE_NUM_BUSES 2
#define MPE_BUSNAME_LEN 6
#define MPE_BUSNAME_ISA "ISA "
#define MPE_BUSNAME_PCI "PCI "
static void *oem_tbl_start;
static int oem_tbl_size;
static uint8_t
mpt_compute_checksum(void *base, size_t len)
{
uint8_t *bytes;
uint8_t sum;
for (bytes = base, sum = 0; len > 0; len--) {
sum += *bytes++;
}
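/*
 * Return the value that makes the bytes of the region, including the
 * checksum itself, sum to zero mod 256.
 */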
return (256 - sum);
}
static void
mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
{
memset(mpfp, 0, sizeof(*mpfp));
memcpy(mpfp->signature, MPFP_SIG, 4);
mpfp->pap = gpa + sizeof(*mpfp);
mpfp->length = 1;
mpfp->spec_rev = MP_SPECREV;
mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp));
}
static void
mpt_build_mpch(mpcth_t mpch)
{
memset(mpch, 0, sizeof(*mpch));
memcpy(mpch->signature, MPCH_SIG, 4);
mpch->spec_rev = MP_SPECREV;
memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
mpch->apic_address = LAPIC_PADDR;
}
static void
mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu)
{
int i;
for (i = 0; i < ncpu; i++) {
memset(mpep, 0, sizeof(*mpep));
mpep->type = MPCT_ENTRY_PROCESSOR;
		mpep->apic_id = i;	/* XXX */
mpep->apic_version = LAPIC_VERSION;
mpep->cpu_flags = PROCENTRY_FLAG_EN;
if (i == 0)
mpep->cpu_flags |= PROCENTRY_FLAG_BP;
mpep->cpu_signature = MPEP_SIG;
mpep->feature_flags = MPEP_FEATURES;
mpep++;
}
}
static void
mpt_build_localint_entries(int_entry_ptr mpie)
{
/* Hardcode LINT0 as ExtINT on all CPUs. */
memset(mpie, 0, sizeof(*mpie));
mpie->type = MPCT_ENTRY_LOCAL_INT;
mpie->int_type = INTENTRY_TYPE_EXTINT;
mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
INTENTRY_FLAGS_TRIGGER_CONFORM;
mpie->dst_apic_id = 0xff;
mpie->dst_apic_int = 0;
mpie++;
/* Hardcode LINT1 as NMI on all CPUs. */
memset(mpie, 0, sizeof(*mpie));
mpie->type = MPCT_ENTRY_LOCAL_INT;
mpie->int_type = INTENTRY_TYPE_NMI;
mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
INTENTRY_FLAGS_TRIGGER_CONFORM;
mpie->dst_apic_id = 0xff;
mpie->dst_apic_int = 1;
}
static void
mpt_build_bus_entries(bus_entry_ptr mpeb)
{
memset(mpeb, 0, sizeof(*mpeb));
mpeb->type = MPCT_ENTRY_BUS;
mpeb->bus_id = 0;
memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
mpeb++;
memset(mpeb, 0, sizeof(*mpeb));
mpeb->type = MPCT_ENTRY_BUS;
mpeb->bus_id = 1;
memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
}
static void
mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
{
memset(mpei, 0, sizeof(*mpei));
mpei->type = MPCT_ENTRY_IOAPIC;
mpei->apic_id = id;
mpei->apic_version = IOAPIC_VERSION;
mpei->apic_flags = IOAPICENTRY_FLAG_EN;
mpei->apic_address = IOAPIC_PADDR;
}
static int
mpt_count_ioint_entries(void)
{
int bus, count;
count = 0;
for (bus = 0; bus <= PCI_BUSMAX; bus++)
count += pci_count_lintr(bus);
/*
	 * Always include entries for the first 16 pins along with an entry
* for each active PCI INTx pin.
*/
return (16 + count);
}
static void
mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
void *arg)
{
int_entry_ptr *mpiep, mpie;
mpiep = arg;
mpie = *mpiep;
memset(mpie, 0, sizeof(*mpie));
/*
* This is always after another I/O interrupt entry, so cheat
* and fetch the I/O APIC ID from the prior entry.
*/
mpie->type = MPCT_ENTRY_INT;
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_id = bus;
mpie->src_bus_irq = slot << 2 | (pin - 1);
mpie->dst_apic_id = mpie[-1].dst_apic_id;
mpie->dst_apic_int = ioapic_irq;
*mpiep = mpie + 1;
}
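
The src_bus_irq encoding above follows the MP specification's convention for
PCI buses: the device (slot) number sits in bits 2-6 and the zero-based INTx
pin in bits 0-1. A quick standalone check (illustrative only):

#include <assert.h>
#include <stdint.h>

static uint8_t
mp_pci_src_irq(int slot, int pin)	/* pin: 1 = INTA# .. 4 = INTD# */
{
	return (slot << 2 | (pin - 1));
}

int
main(void)
{
	assert(mp_pci_src_irq(3, 2) == 0x0d);	/* slot 3, INTB# */
	assert(mp_pci_src_irq(0, 1) == 0x00);	/* slot 0, INTA# */
	return (0);
}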
static void
mpt_build_ioint_entries(int_entry_ptr mpie, int id)
{
int pin, bus;
/*
	 * The following config is taken from the kernel's mptable.c,
	 * mptable_parse_default_config_ints(...); for now just use the
	 * default config and tweak it later if needed.
*/
/* First, generate the first 16 pins. */
for (pin = 0; pin < 16; pin++) {
memset(mpie, 0, sizeof(*mpie));
mpie->type = MPCT_ENTRY_INT;
mpie->src_bus_id = 1;
mpie->dst_apic_id = id;
/*
* All default configs route IRQs from bus 0 to the first 16
* pins of the first I/O APIC with an APIC ID of 2.
*/
mpie->dst_apic_int = pin;
switch (pin) {
case 0:
/* Pin 0 is an ExtINT pin. */
mpie->int_type = INTENTRY_TYPE_EXTINT;
break;
case 2:
/* IRQ 0 is routed to pin 2. */
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_irq = 0;
break;
case SCI_INT:
/* ACPI SCI is level triggered and active-lo. */
mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO |
INTENTRY_FLAGS_TRIGGER_LEVEL;
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_irq = SCI_INT;
break;
default:
/* All other pins are identity mapped. */
mpie->int_type = INTENTRY_TYPE_INT;
mpie->src_bus_irq = pin;
break;
}
mpie++;
}
/* Next, generate entries for any PCI INTx interrupts. */
for (bus = 0; bus <= PCI_BUSMAX; bus++)
pci_walk_lintr(bus, mpt_generate_pci_int, &mpie);
}
void
mptable_add_oemtbl(void *tbl, int tblsz)
{
oem_tbl_start = tbl;
oem_tbl_size = tblsz;
}
int
mptable_build(struct vmctx *ctx, int ncpu)
{
mpcth_t mpch;
bus_entry_ptr mpeb;
io_apic_entry_ptr mpei;
proc_entry_ptr mpep;
mpfps_t mpfp;
int_entry_ptr mpie;
int ioints, bus;
char *curraddr;
char *startaddr;
startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH);
if (startaddr == NULL) {
fprintf(stderr, "mptable requires mapped mem\n");
return (ENOMEM);
}
/*
* There is no way to advertise multiple PCI hierarchies via MPtable
* so require that there is no PCI hierarchy with a non-zero bus
* number.
*/
for (bus = 1; bus <= PCI_BUSMAX; bus++) {
if (pci_bus_configured(bus)) {
fprintf(stderr, "MPtable is incompatible with "
"multiple PCI hierarchies.\r\n");
fprintf(stderr, "MPtable generation can be disabled "
"by passing the -Y option to bhyve(8).\r\n");
return (EINVAL);
}
}
curraddr = startaddr;
mpfp = (mpfps_t)curraddr;
mpt_build_mpfp(mpfp, MPTABLE_BASE);
curraddr += sizeof(*mpfp);
mpch = (mpcth_t)curraddr;
mpt_build_mpch(mpch);
curraddr += sizeof(*mpch);
mpep = (proc_entry_ptr)curraddr;
mpt_build_proc_entries(mpep, ncpu);
curraddr += sizeof(*mpep) * ncpu;
mpch->entry_count += ncpu;
mpeb = (bus_entry_ptr) curraddr;
mpt_build_bus_entries(mpeb);
curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
mpch->entry_count += MPE_NUM_BUSES;
mpei = (io_apic_entry_ptr)curraddr;
mpt_build_ioapic_entries(mpei, 0);
curraddr += sizeof(*mpei);
mpch->entry_count++;
mpie = (int_entry_ptr) curraddr;
ioints = mpt_count_ioint_entries();
mpt_build_ioint_entries(mpie, 0);
curraddr += sizeof(*mpie) * ioints;
mpch->entry_count += ioints;
mpie = (int_entry_ptr)curraddr;
mpt_build_localint_entries(mpie);
curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ;
mpch->entry_count += MPEII_NUM_LOCAL_IRQ;
if (oem_tbl_start) {
mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
mpch->oem_table_size = oem_tbl_size;
memcpy(curraddr, oem_tbl_start, oem_tbl_size);
}
mpch->base_table_length = curraddr - (char *)mpch;
mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length);
return (0);
}
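
For orientation, the layout mptable_build() leaves at guest physical address
MPTABLE_BASE is strictly sequential; a summary reconstructed from the code
above:

/*
 * MPTABLE_BASE (0xF0000):
 *   floating pointer (mpfps_t), with pap pointing just past itself
 *   configuration table header (mpcth_t)
 *   ncpu processor entries
 *   2 bus entries (PCI = bus 0, ISA = bus 1)
 *   1 I/O APIC entry (APIC ID 0)
 *   16 + n I/O interrupt entries (the ISA pins plus n PCI INTx routes)
 *   2 local interrupt entries (LINT0 = ExtINT, LINT1 = NMI)
 *   optional OEM table, referenced by oem_table_pointer but not counted
 *   in base_table_length
 */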

35
bhyve/mptbl.h Normal file
View file

@ -0,0 +1,35 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _MPTBL_H_
#define _MPTBL_H_
int mptable_build(struct vmctx *ctx, int ncpu);
void mptable_add_oemtbl(void *tbl, int tblsz);
#endif /* _MPTBL_H_ */

2346
bhyve/pci_ahci.c Normal file

File diff suppressed because it is too large Load diff

2108
bhyve/pci_emul.c Normal file

File diff suppressed because it is too large Load diff

283
bhyve/pci_emul.h Normal file
View file

@ -0,0 +1,283 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _PCI_EMUL_H_
#define _PCI_EMUL_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/_pthreadtypes.h>
#include <dev/pci/pcireg.h>
#include <assert.h>
#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
struct vmctx;
struct pci_devinst;
struct memory_region;
struct pci_devemu {
char *pe_emu; /* Name of device emulation */
/* instance creation */
int (*pe_init)(struct vmctx *, struct pci_devinst *,
char *opts);
/* ACPI DSDT enumeration */
void (*pe_write_dsdt)(struct pci_devinst *);
/* config space read/write callbacks */
int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t val);
int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int offset,
int bytes, uint32_t *retval);
/* BAR read/write callbacks */
void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value);
uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
struct pci_devinst *pi, int baridx,
uint64_t offset, int size);
};
#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
enum pcibar_type {
PCIBAR_NONE,
PCIBAR_IO,
PCIBAR_MEM32,
PCIBAR_MEM64,
PCIBAR_MEMHI64
};
struct pcibar {
enum pcibar_type type; /* io or memory */
uint64_t size;
uint64_t addr;
};
#define PI_NAMESZ 40
struct msix_table_entry {
uint64_t addr;
uint32_t msg_data;
uint32_t vector_control;
} __packed;
/*
* In case the structure is modified to hold extra information, use a define
* for the size that should be emulated.
*/
#define MSIX_TABLE_ENTRY_SIZE 16
#define MAX_MSIX_TABLE_ENTRIES 2048
#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8)
enum lintr_stat {
IDLE,
ASSERTED,
PENDING
};
struct pci_devinst {
struct pci_devemu *pi_d;
struct vmctx *pi_vmctx;
uint8_t pi_bus, pi_slot, pi_func;
char pi_name[PI_NAMESZ];
int pi_bar_getsize;
int pi_prevcap;
int pi_capend;
struct {
int8_t pin;
enum lintr_stat state;
int pirq_pin;
int ioapic_irq;
pthread_mutex_t lock;
} pi_lintr;
struct {
int enabled;
uint64_t addr;
uint64_t msg_data;
int maxmsgnum;
} pi_msi;
struct {
int enabled;
int table_bar;
int pba_bar;
uint32_t table_offset;
int table_count;
uint32_t pba_offset;
int pba_size;
int function_mask;
struct msix_table_entry *table; /* allocated at runtime */
} pi_msix;
void *pi_arg; /* devemu-private data */
u_char pi_cfgdata[PCI_REGMAX + 1];
struct pcibar pi_bar[PCI_BARMAX + 1];
};
struct msicap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t addrlo;
uint32_t addrhi;
uint16_t msgdata;
} __packed;
struct msixcap {
uint8_t capid;
uint8_t nextptr;
uint16_t msgctrl;
uint32_t table_info; /* bar index and offset within it */
uint32_t pba_info; /* bar index and offset within it */
} __packed;
struct pciecap {
uint8_t capid;
uint8_t nextptr;
uint16_t pcie_capabilities;
uint32_t dev_capabilities; /* all devices */
uint16_t dev_control;
uint16_t dev_status;
uint32_t link_capabilities; /* devices with links */
uint16_t link_control;
uint16_t link_status;
uint32_t slot_capabilities; /* ports with slots */
uint16_t slot_control;
uint16_t slot_status;
uint16_t root_control; /* root ports */
uint16_t root_capabilities;
uint32_t root_status;
uint32_t dev_capabilities2; /* all devices */
uint16_t dev_control2;
uint16_t dev_status2;
uint32_t link_capabilities2; /* devices with links */
uint16_t link_control2;
uint16_t link_status2;
uint32_t slot_capabilities2; /* ports with slots */
uint16_t slot_control2;
uint16_t slot_status2;
} __packed;
typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin,
int ioapic_irq, void *arg);
int init_pci(struct vmctx *ctx);
void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
int bytes, uint32_t val);
void pci_callback(void);
int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
enum pcibar_type type, uint64_t size);
int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
uint64_t hostbase, enum pcibar_type type, uint64_t size);
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
void pci_generate_msi(struct pci_devinst *pi, int msgnum);
void pci_generate_msix(struct pci_devinst *pi, int msgnum);
void pci_lintr_assert(struct pci_devinst *pi);
void pci_lintr_deassert(struct pci_devinst *pi);
void pci_lintr_request(struct pci_devinst *pi);
int pci_msi_enabled(struct pci_devinst *pi);
int pci_msix_enabled(struct pci_devinst *pi);
int pci_msix_table_bar(struct pci_devinst *pi);
int pci_msix_pba_bar(struct pci_devinst *pi);
int pci_msi_msgnum(struct pci_devinst *pi);
int pci_parse_slot(char *opt);
void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
uint64_t value);
uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
int pci_count_lintr(int bus);
void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
void pci_write_dsdt(void);
uint64_t pci_ecfg_base(void);
int pci_bus_configured(int bus);
static __inline void
pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
{
assert(offset <= PCI_REGMAX);
*(uint8_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
*(uint16_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline void
pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
*(uint32_t *)(pi->pi_cfgdata + offset) = val;
}
static __inline uint8_t
pci_get_cfgdata8(struct pci_devinst *pi, int offset)
{
assert(offset <= PCI_REGMAX);
return (*(uint8_t *)(pi->pi_cfgdata + offset));
}
static __inline uint16_t
pci_get_cfgdata16(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
return (*(uint16_t *)(pi->pi_cfgdata + offset));
}
static __inline uint32_t
pci_get_cfgdata32(struct pci_devinst *pi, int offset)
{
assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
return (*(uint32_t *)(pi->pi_cfgdata + offset));
}
#endif /* _PCI_EMUL_H_ */
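
Two sizing details in this header are easy to misread: an MSI-X table entry
is exactly 16 bytes (8-byte address, 4-byte message data, 4-byte vector
control), and PBA_SIZE() sizes the pending-bit array in bytes, one bit per
vector rounded up to whole 64-bit words. A small sketch of the arithmetic,
assuming roundup2() from FreeBSD's <sys/param.h>:

#include <sys/param.h>	/* roundup2 */
#include <assert.h>

#define PBA_SIZE(msgnum)	(roundup2((msgnum), 64) / 8)

int
main(void)
{
	assert(PBA_SIZE(1) == 8);	/* one vector still takes a full word */
	assert(PBA_SIZE(64) == 8);
	assert(PBA_SIZE(65) == 16);	/* spills into a second word */
	return (0);
}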

70
bhyve/pci_hostbridge.c Normal file
View file

@ -0,0 +1,70 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "pci_emul.h"
static int
pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
/* config space */
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_BRIDGE);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT);
return (0);
}
static int
pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
(void) pci_hostbridge_init(ctx, pi, opts);
pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */
return (0);
}
struct pci_devemu pci_de_amd_hostbridge = {
.pe_emu = "amd_hostbridge",
.pe_init = pci_amd_hostbridge_init,
};
PCI_EMUL_SET(pci_de_amd_hostbridge);
struct pci_devemu pci_de_hostbridge = {
.pe_emu = "hostbridge",
.pe_init = pci_hostbridge_init,
};
PCI_EMUL_SET(pci_de_hostbridge);

346
bhyve/pci_irq.c Normal file
View file

@ -0,0 +1,346 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <machine/vmm.h>
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <vmmapi.h>
#include "acpi.h"
#include "inout.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
/*
* Implement an 8 pin PCI interrupt router compatible with the router
* present on Intel's ICH10 chip.
*/
/* Fields in each PIRQ register. */
#define PIRQ_DIS 0x80
#define PIRQ_IRQ 0x0f
/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */
#define PERMITTED_IRQS 0xdef8
#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0)
/* IRQ count to disable an IRQ. */
#define IRQ_DISABLED 0xff
static struct pirq {
uint8_t reg;
int use_count;
int active_count;
pthread_mutex_t lock;
} pirqs[8];
static u_char irq_counts[16];
static int pirq_cold = 1;
/*
* Returns true if this pin is enabled with a valid IRQ. Setting the
 * register to a reserved IRQ causes interrupts not to be asserted, as
 * if the pin were disabled.
*/
static bool
pirq_valid_irq(int reg)
{
if (reg & PIRQ_DIS)
return (false);
return (IRQ_PERMITTED(reg & PIRQ_IRQ));
}
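
A standalone sanity check of the PERMITTED_IRQS mask (illustrative only):
0xdef8 sets exactly bits 3-7, 9-12 and 14-15, matching the comment above.

#include <assert.h>

#define PERMITTED_IRQS	0xdef8
#define IRQ_PERMITTED(irq)	(((1U << (irq)) & PERMITTED_IRQS) != 0)

int
main(void)
{
	assert(IRQ_PERMITTED(3) && IRQ_PERMITTED(12) && IRQ_PERMITTED(15));
	assert(!IRQ_PERMITTED(0) && !IRQ_PERMITTED(8) && !IRQ_PERMITTED(13));
	return (0);
}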
uint8_t
pirq_read(int pin)
{
assert(pin > 0 && pin <= nitems(pirqs));
return (pirqs[pin - 1].reg);
}
void
pirq_write(struct vmctx *ctx, int pin, uint8_t val)
{
struct pirq *pirq;
assert(pin > 0 && pin <= nitems(pirqs));
pirq = &pirqs[pin - 1];
pthread_mutex_lock(&pirq->lock);
if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) {
if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ);
if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
}
pthread_mutex_unlock(&pirq->lock);
}
void
pci_irq_reserve(int irq)
{
assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
irq_counts[irq] = IRQ_DISABLED;
}
void
pci_irq_use(int irq)
{
assert(irq >= 0 && irq < nitems(irq_counts));
assert(pirq_cold);
assert(irq_counts[irq] != IRQ_DISABLED);
irq_counts[irq]++;
}
void
pci_irq_init(struct vmctx *ctx)
{
int i;
for (i = 0; i < nitems(pirqs); i++) {
pirqs[i].reg = PIRQ_DIS;
pirqs[i].use_count = 0;
pirqs[i].active_count = 0;
pthread_mutex_init(&pirqs[i].lock, NULL);
}
for (i = 0; i < nitems(irq_counts); i++) {
if (IRQ_PERMITTED(i))
irq_counts[i] = 0;
else
irq_counts[i] = IRQ_DISABLED;
}
}
void
pci_irq_assert(struct pci_devinst *pi)
{
struct pirq *pirq;
if (pi->pi_lintr.pirq_pin > 0) {
assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
pthread_mutex_lock(&pirq->lock);
pirq->active_count++;
if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) {
vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
pi->pi_lintr.ioapic_irq);
pthread_mutex_unlock(&pirq->lock);
return;
}
pthread_mutex_unlock(&pirq->lock);
}
vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
}
void
pci_irq_deassert(struct pci_devinst *pi)
{
struct pirq *pirq;
if (pi->pi_lintr.pirq_pin > 0) {
assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
pthread_mutex_lock(&pirq->lock);
pirq->active_count--;
if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) {
vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
pi->pi_lintr.ioapic_irq);
pthread_mutex_unlock(&pirq->lock);
return;
}
pthread_mutex_unlock(&pirq->lock);
}
vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
}
int
pirq_alloc_pin(struct vmctx *ctx)
{
int best_count, best_irq, best_pin, irq, pin;
pirq_cold = 0;
/* First, find the least-used PIRQ pin. */
best_pin = 0;
best_count = pirqs[0].use_count;
for (pin = 1; pin < nitems(pirqs); pin++) {
if (pirqs[pin].use_count < best_count) {
best_pin = pin;
best_count = pirqs[pin].use_count;
}
}
pirqs[best_pin].use_count++;
/* Second, route this pin to an IRQ. */
if (pirqs[best_pin].reg == PIRQ_DIS) {
best_irq = -1;
best_count = 0;
for (irq = 0; irq < nitems(irq_counts); irq++) {
if (irq_counts[irq] == IRQ_DISABLED)
continue;
if (best_irq == -1 || irq_counts[irq] < best_count) {
best_irq = irq;
best_count = irq_counts[irq];
}
}
assert(best_irq >= 0);
irq_counts[best_irq]++;
pirqs[best_pin].reg = best_irq;
vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
}
return (best_pin + 1);
}
int
pirq_irq(int pin)
{
assert(pin > 0 && pin <= nitems(pirqs));
return (pirqs[pin - 1].reg & PIRQ_IRQ);
}
/* XXX: Generate $PIR table. */
static void
pirq_dsdt(void)
{
char *irq_prs, *old;
int irq, pin;
irq_prs = NULL;
for (irq = 0; irq < nitems(irq_counts); irq++) {
if (!IRQ_PERMITTED(irq))
continue;
if (irq_prs == NULL)
asprintf(&irq_prs, "%d", irq);
else {
old = irq_prs;
asprintf(&irq_prs, "%s,%d", old, irq);
free(old);
}
}
/*
* A helper method to validate a link register's value. This
* duplicates pirq_valid_irq().
*/
dsdt_line("");
dsdt_line("Method (PIRV, 1, NotSerialized)");
dsdt_line("{");
dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS);
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ);
dsdt_line(" If (LLess (Local0, 0x03))");
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" If (LEqual (Local0, 0x08))");
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" If (LEqual (Local0, 0x0D))");
dsdt_line(" {");
dsdt_line(" Return (0x00)");
dsdt_line(" }");
dsdt_line(" Return (0x01)");
dsdt_line("}");
for (pin = 0; pin < nitems(pirqs); pin++) {
dsdt_line("");
dsdt_line("Device (LNK%c)", 'A' + pin);
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))");
dsdt_line(" Name (_UID, 0x%02X)", pin + 1);
dsdt_line(" Method (_STA, 0, NotSerialized)");
dsdt_line(" {");
dsdt_line(" If (PIRV (PIR%c))", 'A' + pin);
dsdt_line(" {");
dsdt_line(" Return (0x0B)");
dsdt_line(" }");
dsdt_line(" Else");
dsdt_line(" {");
dsdt_line(" Return (0x09)");
dsdt_line(" }");
dsdt_line(" }");
dsdt_line(" Name (_PRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
dsdt_line(" {%s}", irq_prs);
dsdt_line(" })");
dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1);
dsdt_line(" {");
dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
dsdt_line(" {}");
dsdt_line(" })");
dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)",
pin + 1, 'A' + pin);
dsdt_line(" Method (_CRS, 0, NotSerialized)");
dsdt_line(" {");
dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin,
PIRQ_DIS | PIRQ_IRQ);
dsdt_line(" If (PIRV (Local0))");
dsdt_line(" {");
dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line(" Else");
dsdt_line(" {");
dsdt_line(" Store (0x00, CIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line(" Return (CB%02X)", pin + 1);
dsdt_line(" }");
dsdt_line(" Method (_DIS, 0, NotSerialized)");
dsdt_line(" {");
dsdt_line(" Store (0x80, PIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line(" Method (_SRS, 1, NotSerialized)");
dsdt_line(" {");
dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin);
dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin);
dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin);
dsdt_line(" }");
dsdt_line("}");
}
free(irq_prs);
}
LPC_DSDT(pirq_dsdt);

45
bhyve/pci_irq.h Normal file
View file

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2014 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef __PCI_IRQ_H__
#define __PCI_IRQ_H__
struct pci_devinst;
struct vmctx;
void pci_irq_assert(struct pci_devinst *pi);
void pci_irq_deassert(struct pci_devinst *pi);
void pci_irq_init(struct vmctx *ctx);
void pci_irq_reserve(int irq);
void pci_irq_use(int irq);
int pirq_alloc_pin(struct vmctx *ctx);
int pirq_irq(int pin);
uint8_t pirq_read(int pin);
void pirq_write(struct vmctx *ctx, int pin, uint8_t val);
#endif

429
bhyve/pci_lpc.c Normal file
View file

@ -0,0 +1,429 @@
/*-
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vmmapi.h>
#include "acpi.h"
#include "inout.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "uart_emul.h"
#define IO_ICU1 0x20
#define IO_ICU2 0xA0
SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt);
SET_DECLARE(lpc_sysres_set, struct lpc_sysres);
#define ELCR_PORT 0x4d0
SYSRES_IO(ELCR_PORT, 2);
#define IO_TIMER1_PORT 0x40
#define NMISC_PORT 0x61
SYSRES_IO(NMISC_PORT, 1);
static struct pci_devinst *lpc_bridge;
#define LPC_UART_NUM 2
static struct lpc_uart_softc {
struct uart_softc *uart_softc;
const char *opts;
int iobase;
int irq;
int enabled;
} lpc_uart_softc[LPC_UART_NUM];
static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
/*
* LPC device configuration is in the following form:
* <lpc_device_name>[,<options>]
 * e.g. "com1,stdio"
*/
int
lpc_device_parse(const char *opts)
{
int unit, error;
char *str, *cpy, *lpcdev;
error = -1;
str = cpy = strdup(opts);
lpcdev = strsep(&str, ",");
if (lpcdev != NULL) {
for (unit = 0; unit < LPC_UART_NUM; unit++) {
if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) {
lpc_uart_softc[unit].opts = str;
error = 0;
goto done;
}
}
}
done:
if (error)
free(cpy);
return (error);
}
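
A hedged sketch of the parse: strsep(3) splits the option string at the
first comma, leaving the device name and the backend options (the real
function then stashes the options in lpc_uart_softc):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[] = "com1,stdio";	/* the documented form */
	char *str = buf;
	char *lpcdev = strsep(&str, ",");

	printf("device=%s opts=%s\n", lpcdev, str);	/* device=com1 opts=stdio */
	return (0);
}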
static void
lpc_uart_intr_assert(void *arg)
{
struct lpc_uart_softc *sc = arg;
assert(sc->irq >= 0);
vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq);
}
static void
lpc_uart_intr_deassert(void *arg)
{
/*
	 * The COM devices on the LPC bus generate edge-triggered interrupts,
* so nothing more to do here.
*/
}
static int
lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int offset;
struct lpc_uart_softc *sc = arg;
offset = port - sc->iobase;
switch (bytes) {
case 1:
if (in)
*eax = uart_read(sc->uart_softc, offset);
else
uart_write(sc->uart_softc, offset, *eax);
break;
case 2:
if (in) {
*eax = uart_read(sc->uart_softc, offset);
*eax |= uart_read(sc->uart_softc, offset + 1) << 8;
} else {
uart_write(sc->uart_softc, offset, *eax);
uart_write(sc->uart_softc, offset + 1, *eax >> 8);
}
break;
default:
return (-1);
}
return (0);
}
static int
lpc_init(void)
{
struct lpc_uart_softc *sc;
struct inout_port iop;
const char *name;
int unit, error;
/* COM1 and COM2 */
for (unit = 0; unit < LPC_UART_NUM; unit++) {
sc = &lpc_uart_softc[unit];
name = lpc_uart_names[unit];
if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) {
fprintf(stderr, "Unable to allocate resources for "
"LPC device %s\n", name);
return (-1);
}
pci_irq_reserve(sc->irq);
sc->uart_softc = uart_init(lpc_uart_intr_assert,
lpc_uart_intr_deassert, sc);
if (uart_set_backend(sc->uart_softc, sc->opts) != 0) {
fprintf(stderr, "Unable to initialize backend '%s' "
"for LPC device %s\n", sc->opts, name);
return (-1);
}
bzero(&iop, sizeof(struct inout_port));
iop.name = name;
iop.port = sc->iobase;
iop.size = UART_IO_BAR_SIZE;
iop.flags = IOPORT_F_INOUT;
iop.handler = lpc_uart_io_handler;
iop.arg = sc;
error = register_inout(&iop);
assert(error == 0);
sc->enabled = 1;
}
return (0);
}
static void
pci_lpc_write_dsdt(struct pci_devinst *pi)
{
struct lpc_dsdt **ldpp, *ldp;
dsdt_line("");
dsdt_line("Device (ISA)");
dsdt_line("{");
dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func);
dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)");
dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)");
dsdt_line(" {");
dsdt_line(" Offset (0x60),");
dsdt_line(" PIRA, 8,");
dsdt_line(" PIRB, 8,");
dsdt_line(" PIRC, 8,");
dsdt_line(" PIRD, 8,");
dsdt_line(" Offset (0x68),");
dsdt_line(" PIRE, 8,");
dsdt_line(" PIRF, 8,");
dsdt_line(" PIRG, 8,");
dsdt_line(" PIRH, 8");
dsdt_line(" }");
dsdt_line("");
dsdt_indent(1);
SET_FOREACH(ldpp, lpc_dsdt_set) {
ldp = *ldpp;
ldp->handler();
}
dsdt_line("");
dsdt_line("Device (PIC)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(IO_ICU1, 2);
dsdt_fixed_ioport(IO_ICU2, 2);
dsdt_fixed_irq(2);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
dsdt_line("");
dsdt_line("Device (TIMR)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(IO_TIMER1_PORT, 4);
dsdt_fixed_irq(0);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
dsdt_unindent(1);
dsdt_line("}");
}
static void
pci_lpc_sysres_dsdt(void)
{
struct lpc_sysres **lspp, *lsp;
dsdt_line("");
dsdt_line("Device (SIO)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
SET_FOREACH(lspp, lpc_sysres_set) {
lsp = *lspp;
switch (lsp->type) {
case LPC_SYSRES_IO:
dsdt_fixed_ioport(lsp->base, lsp->length);
break;
case LPC_SYSRES_MEM:
dsdt_fixed_mem32(lsp->base, lsp->length);
break;
}
}
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
}
LPC_DSDT(pci_lpc_sysres_dsdt);
static void
pci_lpc_uart_dsdt(void)
{
struct lpc_uart_softc *sc;
int unit;
for (unit = 0; unit < LPC_UART_NUM; unit++) {
sc = &lpc_uart_softc[unit];
if (!sc->enabled)
continue;
dsdt_line("");
dsdt_line("Device (%s)", lpc_uart_names[unit]);
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))");
dsdt_line(" Name (_UID, %d)", unit + 1);
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE);
dsdt_fixed_irq(sc->irq);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
}
}
LPC_DSDT(pci_lpc_uart_dsdt);
static int
pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t val)
{
int pirq_pin;
if (bytes == 1) {
pirq_pin = 0;
if (coff >= 0x60 && coff <= 0x63)
pirq_pin = coff - 0x60 + 1;
if (coff >= 0x68 && coff <= 0x6b)
pirq_pin = coff - 0x68 + 5;
if (pirq_pin != 0) {
pirq_write(ctx, pirq_pin, val);
pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin));
return (0);
}
}
return (-1);
}
static void
pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
}
static uint64_t
pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
return (0);
}
#define LPC_DEV 0x7000
#define LPC_VENDOR 0x8086
static int
pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
/*
* Do not allow more than one LPC bridge to be configured.
*/
if (lpc_bridge != NULL) {
fprintf(stderr, "Only one LPC bridge is allowed.\n");
return (-1);
}
/*
* Enforce that the LPC can only be configured on bus 0. This
* simplifies the ACPI DSDT because it can provide a decode for
* all legacy i/o ports behind bus 0.
*/
if (pi->pi_bus != 0) {
fprintf(stderr, "LPC bridge can be present only on bus 0.\n");
return (-1);
}
if (lpc_init() != 0)
return (-1);
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV);
pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA);
lpc_bridge = pi;
return (0);
}
char *
lpc_pirq_name(int pin)
{
char *name;
if (lpc_bridge == NULL)
return (NULL);
asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1);
return (name);
}
void
lpc_pirq_routed(void)
{
int pin;
if (lpc_bridge == NULL)
return;
for (pin = 0; pin < 4; pin++)
pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1));
for (pin = 0; pin < 4; pin++)
pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5));
}
struct pci_devemu pci_de_lpc = {
.pe_emu = "lpc",
.pe_init = pci_lpc_init,
.pe_write_dsdt = pci_lpc_write_dsdt,
.pe_cfgwrite = pci_lpc_cfgwrite,
.pe_barwrite = pci_lpc_write,
.pe_barread = pci_lpc_read
};
PCI_EMUL_SET(pci_de_lpc);

72
bhyve/pci_lpc.h Normal file
View file

@ -0,0 +1,72 @@
/*-
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _LPC_H_
#define _LPC_H_
#include <sys/linker_set.h>
typedef void (*lpc_write_dsdt_t)(void);
struct lpc_dsdt {
lpc_write_dsdt_t handler;
};
#define LPC_DSDT(handler) \
static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \
(handler), \
}; \
DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__))
enum lpc_sysres_type {
LPC_SYSRES_IO,
LPC_SYSRES_MEM
};
struct lpc_sysres {
enum lpc_sysres_type type;
uint32_t base;
uint32_t length;
};
#define LPC_SYSRES(type, base, length) \
static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \
(type), \
(base), \
(length) \
}; \
DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__))
#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length)
#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length)
int lpc_device_parse(const char *opt);
char *lpc_pirq_name(int pin);
void lpc_pirq_routed(void);
#endif

790
bhyve/pci_passthru.c Normal file
View file

@ -0,0 +1,790 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>
#include <dev/io/iodev.h>
#include <dev/pci/pcireg.h>
#include <machine/iodev.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "pci_emul.h"
#include "mem.h"
#ifndef _PATH_DEVPCI
#define _PATH_DEVPCI "/dev/pci"
#endif
#ifndef _PATH_DEVIO
#define _PATH_DEVIO "/dev/io"
#endif
#define LEGACY_SUPPORT 1
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
#define MSIX_CAPLEN 12
static int pcifd = -1;
static int iofd = -1;
struct passthru_softc {
struct pci_devinst *psc_pi;
struct pcibar psc_bar[PCI_BARMAX + 1];
struct {
int capoff;
int msgctrl;
int emulated;
} psc_msi;
struct {
int capoff;
} psc_msix;
struct pcisel psc_sel;
};
static int
msi_caplen(int msgctrl)
{
int len;
len = 10; /* minimum length of msi capability */
if (msgctrl & PCIM_MSICTRL_64BIT)
len += 4;
#if 0
/*
* Ignore the 'mask' and 'pending' bits in the MSI capability.
* We'll let the guest manipulate them directly.
*/
if (msgctrl & PCIM_MSICTRL_VECTOR)
len += 10;
#endif
return (len);
}
static uint32_t
read_config(const struct pcisel *sel, long reg, int width)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
if (ioctl(pcifd, PCIOCREAD, &pi) < 0)
return (0); /* XXX */
else
return (pi.pi_data);
}
static void
write_config(const struct pcisel *sel, long reg, int width, uint32_t data)
{
struct pci_io pi;
bzero(&pi, sizeof(pi));
pi.pi_sel = *sel;
pi.pi_reg = reg;
pi.pi_width = width;
pi.pi_data = data;
(void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */
}
#ifdef LEGACY_SUPPORT
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
int capoff, i;
struct msicap msicap;
u_char *capdata;
pci_populate_msicap(&msicap, msgnum, nextptr);
/*
* XXX
	 * Copy the MSI capability structure into the last 16 bytes of the
* config space. This is wrong because it could shadow something
* useful to the device.
*/
capoff = 256 - roundup(sizeof(msicap), 4);
capdata = (u_char *)&msicap;
for (i = 0; i < sizeof(msicap); i++)
pci_set_cfgdata8(pi, capoff + i, capdata[i]);
return (capoff);
}
#endif /* LEGACY_SUPPORT */
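
Concretely, the offset chosen above works out as follows (a worked example,
not additional code in the commit):

/* sizeof(struct msicap) == 14 (__packed), roundup(14, 4) == 16, */
/* so capoff == 256 - 16 == 0xf0 and the synthetic capability */
/* occupies config bytes 0xf0-0xff. */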
static int
cfginitmsi(struct passthru_softc *sc)
{
int i, ptr, capptr, cap, sts, caplen, table_size;
uint32_t u32;
struct pcisel sel;
struct pci_devinst *pi;
struct msixcap msixcap;
uint32_t *msixcap_ptr;
pi = sc->psc_pi;
sel = sc->psc_sel;
/*
* Parse the capabilities and cache the location of the MSI
* and MSI-X capabilities.
*/
sts = read_config(&sel, PCIR_STATUS, 2);
if (sts & PCIM_STATUS_CAPPRESENT) {
ptr = read_config(&sel, PCIR_CAP_PTR, 1);
while (ptr != 0 && ptr != 0xff) {
cap = read_config(&sel, ptr + PCICAP_ID, 1);
if (cap == PCIY_MSI) {
/*
* Copy the MSI capability into the config
* space of the emulated pci device
*/
sc->psc_msi.capoff = ptr;
sc->psc_msi.msgctrl = read_config(&sel,
ptr + 2, 2);
sc->psc_msi.emulated = 0;
caplen = msi_caplen(sc->psc_msi.msgctrl);
capptr = ptr;
while (caplen > 0) {
u32 = read_config(&sel, capptr, 4);
pci_set_cfgdata32(pi, capptr, u32);
caplen -= 4;
capptr += 4;
}
} else if (cap == PCIY_MSIX) {
/*
* Copy the MSI-X capability
*/
sc->psc_msix.capoff = ptr;
caplen = 12;
				msixcap_ptr = (uint32_t *)&msixcap;
capptr = ptr;
while (caplen > 0) {
u32 = read_config(&sel, capptr, 4);
*msixcap_ptr = u32;
pci_set_cfgdata32(pi, capptr, u32);
caplen -= 4;
capptr += 4;
msixcap_ptr++;
}
}
ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1);
}
}
if (sc->psc_msix.capoff != 0) {
pi->pi_msix.pba_bar =
msixcap.pba_info & PCIM_MSIX_BIR_MASK;
pi->pi_msix.pba_offset =
msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
pi->pi_msix.table_bar =
msixcap.table_info & PCIM_MSIX_BIR_MASK;
pi->pi_msix.table_offset =
msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
/* Allocate the emulated MSI-X table array */
table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
pi->pi_msix.table = calloc(1, table_size);
/* Mask all table entries */
for (i = 0; i < pi->pi_msix.table_count; i++) {
pi->pi_msix.table[i].vector_control |=
PCIM_MSIX_VCTRL_MASK;
}
}
#ifdef LEGACY_SUPPORT
/*
	 * If the passthrough device does not support MSI then craft an
* MSI capability for it. We link the new MSI capability at the
* head of the list of capabilities.
*/
if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
int origptr, msiptr;
origptr = read_config(&sel, PCIR_CAP_PTR, 1);
msiptr = passthru_add_msicap(pi, 1, origptr);
sc->psc_msi.capoff = msiptr;
sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
sc->psc_msi.emulated = 1;
pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
}
#endif
/* Make sure one of the capabilities is present */
if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0)
return (-1);
else
return (0);
}
static uint64_t
msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
struct pci_devinst *pi;
struct msix_table_entry *entry;
uint8_t *src8;
uint16_t *src16;
uint32_t *src32;
uint64_t *src64;
uint64_t data;
size_t entry_offset;
int index;
pi = sc->psc_pi;
if (offset < pi->pi_msix.table_offset)
return (-1);
offset -= pi->pi_msix.table_offset;
index = offset / MSIX_TABLE_ENTRY_SIZE;
if (index >= pi->pi_msix.table_count)
return (-1);
entry = &pi->pi_msix.table[index];
entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
switch(size) {
case 1:
src8 = (uint8_t *)((void *)entry + entry_offset);
data = *src8;
break;
case 2:
src16 = (uint16_t *)((void *)entry + entry_offset);
data = *src16;
break;
case 4:
src32 = (uint32_t *)((void *)entry + entry_offset);
data = *src32;
break;
case 8:
src64 = (uint64_t *)((void *)entry + entry_offset);
data = *src64;
break;
default:
return (-1);
}
return (data);
}
static void
msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc,
uint64_t offset, int size, uint64_t data)
{
struct pci_devinst *pi;
struct msix_table_entry *entry;
uint32_t *dest;
size_t entry_offset;
uint32_t vector_control;
int error, index;
pi = sc->psc_pi;
if (offset < pi->pi_msix.table_offset)
return;
offset -= pi->pi_msix.table_offset;
index = offset / MSIX_TABLE_ENTRY_SIZE;
if (index >= pi->pi_msix.table_count)
return;
entry = &pi->pi_msix.table[index];
entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
/* Only 4 byte naturally-aligned writes are supported */
assert(size == 4);
assert(entry_offset % 4 == 0);
vector_control = entry->vector_control;
dest = (uint32_t *)((void *)entry + entry_offset);
*dest = data;
/* If MSI-X hasn't been enabled, do nothing */
if (pi->pi_msix.enabled) {
/* If the entry is masked, don't set it up */
if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
(vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
error = vm_setup_pptdev_msix(ctx, vcpu,
sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
sc->psc_sel.pc_func, index, entry->addr,
entry->msg_data, entry->vector_control);
}
}
}
static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
int b, s, f;
int error, idx;
size_t len, remaining;
uint32_t table_size, table_offset;
uint32_t pba_size, pba_offset;
vm_paddr_t start;
struct pci_devinst *pi = sc->psc_pi;
assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
b = sc->psc_sel.pc_bus;
s = sc->psc_sel.pc_dev;
f = sc->psc_sel.pc_func;
/*
* If the MSI-X table BAR maps memory intended for
* other uses, it is at least assured that the table
* either resides in its own page within the region,
* or it resides in a page shared with only the PBA.
*/
table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
table_size = pi->pi_msix.table_offset - table_offset;
table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
table_size = roundup2(table_size, 4096);
if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
pba_offset = pi->pi_msix.pba_offset;
pba_size = pi->pi_msix.pba_size;
if (pba_offset >= table_offset + table_size ||
table_offset >= pba_offset + pba_size) {
/*
			 * The PBA can reside in the same BAR as the MSI-X
* tables as long as it does not overlap with any
* naturally aligned page occupied by the tables.
*/
} else {
/* Need to also emulate the PBA, not supported yet */
printf("Unsupported MSI-X configuration: %d/%d/%d\n",
b, s, f);
return (-1);
}
}
idx = pi->pi_msix.table_bar;
start = pi->pi_bar[idx].addr;
remaining = pi->pi_bar[idx].size;
/* Map everything before the MSI-X table */
if (table_offset > 0) {
len = table_offset;
error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
if (error)
return (error);
base += len;
start += len;
remaining -= len;
}
/* Skip the MSI-X table */
base += table_size;
start += table_size;
remaining -= table_size;
/* Map everything beyond the end of the MSI-X table */
if (remaining > 0) {
len = remaining;
error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base);
if (error)
return (error);
}
return (0);
}
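
A worked example of the page-splitting arithmetic above, with hypothetical
numbers (assumes rounddown2()/roundup2() from FreeBSD's <sys/param.h>):

#include <sys/param.h>	/* rounddown2, roundup2 */
#include <assert.h>

int
main(void)
{
	/* Hypothetical device: 8-entry MSI-X table at offset 0x1800 of a 16 KB BAR. */
	unsigned table_offset = rounddown2(0x1800, 4096);
	unsigned table_size = 0x1800 - table_offset + 8 * 16;

	table_size = roundup2(table_size, 4096);
	assert(table_offset == 0x1000 && table_size == 0x1000);
	/*
	 * Result: [0, 0x1000) and [0x2000, 0x4000) map straight through to
	 * the device; the single page holding the table is left emulated.
	 */
	return (0);
}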
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
int i, error;
struct pci_devinst *pi;
struct pci_bar_io bar;
enum pcibar_type bartype;
uint64_t base, size;
pi = sc->psc_pi;
/*
* Initialize BAR registers
*/
for (i = 0; i <= PCI_BARMAX; i++) {
bzero(&bar, sizeof(bar));
bar.pbi_sel = sc->psc_sel;
bar.pbi_reg = PCIR_BAR(i);
if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0)
continue;
if (PCI_BAR_IO(bar.pbi_base)) {
bartype = PCIBAR_IO;
base = bar.pbi_base & PCIM_BAR_IO_BASE;
} else {
switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) {
case PCIM_BAR_MEM_64:
bartype = PCIBAR_MEM64;
break;
default:
bartype = PCIBAR_MEM32;
break;
}
base = bar.pbi_base & PCIM_BAR_MEM_BASE;
}
size = bar.pbi_length;
if (bartype != PCIBAR_IO) {
if (((base | size) & PAGE_MASK) != 0) {
printf("passthru device %d/%d/%d BAR %d: "
"base %#lx or size %#lx not page aligned\n",
sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
sc->psc_sel.pc_func, i, base, size);
return (-1);
}
}
/* Cache information about the "real" BAR */
sc->psc_bar[i].type = bartype;
sc->psc_bar[i].size = size;
sc->psc_bar[i].addr = base;
/* Allocate the BAR in the guest I/O or MMIO space */
error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
if (error)
return (-1);
/* The MSI-X table needs special handling */
if (i == pci_msix_table_bar(pi)) {
error = init_msix_table(ctx, sc, base);
if (error)
return (-1);
} else if (bartype != PCIBAR_IO) {
/* Map the physical BAR in the guest MMIO space */
error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
if (error)
return (-1);
}
/*
* 64-bit BAR takes up two slots so skip the next one.
*/
if (bartype == PCIBAR_MEM64) {
i++;
assert(i <= PCI_BARMAX);
sc->psc_bar[i].type = PCIBAR_MEMHI64;
}
}
return (0);
}
static int
cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func)
{
int error;
struct passthru_softc *sc;
error = 1;
sc = pi->pi_arg;
bzero(&sc->psc_sel, sizeof(struct pcisel));
sc->psc_sel.pc_bus = bus;
sc->psc_sel.pc_dev = slot;
sc->psc_sel.pc_func = func;
if (cfginitmsi(sc) != 0)
goto done;
if (cfginitbar(ctx, sc) != 0)
goto done;
error = 0; /* success */
done:
return (error);
}
static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
int bus, slot, func, error;
struct passthru_softc *sc;
sc = NULL;
error = 1;
if (pcifd < 0) {
pcifd = open(_PATH_DEVPCI, O_RDWR, 0);
if (pcifd < 0)
goto done;
}
if (iofd < 0) {
iofd = open(_PATH_DEVIO, O_RDWR, 0);
if (iofd < 0)
goto done;
}
if (opts == NULL ||
sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3)
goto done;
if (vm_assign_pptdev(ctx, bus, slot, func) != 0)
goto done;
sc = calloc(1, sizeof(struct passthru_softc));
pi->pi_arg = sc;
sc->psc_pi = pi;
/* initialize config space */
if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
goto done;
error = 0; /* success */
done:
if (error) {
free(sc);
vm_unassign_pptdev(ctx, bus, slot, func);
}
return (error);
}
static int
bar_access(int coff)
{
if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
return (1);
else
return (0);
}
static int
msicap_access(struct passthru_softc *sc, int coff)
{
int caplen;
if (sc->psc_msi.capoff == 0)
return (0);
caplen = msi_caplen(sc->psc_msi.msgctrl);
if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
return (1);
else
return (0);
}
static int
msixcap_access(struct passthru_softc *sc, int coff)
{
if (sc->psc_msix.capoff == 0)
return (0);
return (coff >= sc->psc_msix.capoff &&
coff < sc->psc_msix.capoff + MSIX_CAPLEN);
}
static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t *rv)
{
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
	 * PCI BARs and the MSI capability are emulated.
*/
if (bar_access(coff) || msicap_access(sc, coff))
return (-1);
#ifdef LEGACY_SUPPORT
/*
	 * Emulate PCIR_CAP_PTR if this device does not support the MSI
* natively.
*/
if (sc->psc_msi.emulated) {
if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
return (-1);
}
#endif
/* Everything else just read from the device's config space */
*rv = read_config(&sc->psc_sel, coff, bytes);
return (0);
}
static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int coff, int bytes, uint32_t val)
{
int error, msix_table_entries, i;
struct passthru_softc *sc;
sc = pi->pi_arg;
/*
* PCI BARs are emulated
*/
if (bar_access(coff))
return (-1);
/*
* MSI capability is emulated
*/
if (msicap_access(sc, coff)) {
msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
pi->pi_msi.addr, pi->pi_msi.msg_data,
pi->pi_msi.maxmsgnum);
if (error != 0) {
printf("vm_setup_pptdev_msi error %d\r\n", errno);
exit(1);
}
return (0);
}
if (msixcap_access(sc, coff)) {
msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
if (pi->pi_msix.enabled) {
msix_table_entries = pi->pi_msix.table_count;
for (i = 0; i < msix_table_entries; i++) {
error = vm_setup_pptdev_msix(ctx, vcpu,
sc->psc_sel.pc_bus, sc->psc_sel.pc_dev,
sc->psc_sel.pc_func, i,
pi->pi_msix.table[i].addr,
pi->pi_msix.table[i].msg_data,
pi->pi_msix.table[i].vector_control);
if (error) {
printf("vm_setup_pptdev_msix error "
"%d\r\n", errno);
exit(1);
}
}
}
return (0);
}
#ifdef LEGACY_SUPPORT
/*
* If this device does not support MSI natively then we cannot let
* the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that triggers the virtual MSI to the guest.
*/
if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
if (coff == PCIR_COMMAND && bytes == 2)
val &= ~PCIM_CMD_INTxDIS;
}
#endif
write_config(&sc->psc_sel, coff, bytes, val);
return (0);
}
static void
passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size, uint64_t value)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
sc = pi->pi_arg;
if (baridx == pci_msix_table_bar(pi)) {
msix_table_write(ctx, vcpu, sc, offset, size, value);
} else {
assert(pi->pi_bar[baridx].type == PCIBAR_IO);
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_WRITE;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = value;
(void)ioctl(iofd, IODEV_PIO, &pio);
}
}
static uint64_t
passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
uint64_t offset, int size)
{
struct passthru_softc *sc;
struct iodev_pio_req pio;
uint64_t val;
sc = pi->pi_arg;
if (baridx == pci_msix_table_bar(pi)) {
val = msix_table_read(sc, offset, size);
} else {
assert(pi->pi_bar[baridx].type == PCIBAR_IO);
bzero(&pio, sizeof(struct iodev_pio_req));
pio.access = IODEV_PIO_READ;
pio.port = sc->psc_bar[baridx].addr + offset;
pio.width = size;
pio.val = 0;
(void)ioctl(iofd, IODEV_PIO, &pio);
val = pio.val;
}
return (val);
}
struct pci_devemu passthru = {
.pe_emu = "passthru",
.pe_init = passthru_init,
.pe_cfgwrite = passthru_cfgwrite,
.pe_cfgread = passthru_cfgread,
.pe_barwrite = passthru_write,
.pe_barread = passthru_read,
};
PCI_EMUL_SET(passthru);

119
bhyve/pci_uart.c Normal file
View file

@ -0,0 +1,119 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <stdio.h>
#include <assert.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "uart_emul.h"
/*
* Pick the PCI vid/did of a chip with a single uart at
* BAR0 that most versions of FreeBSD can understand:
* the Siig CyberSerial 1-port.
*/
#define COM_VENDOR 0x131f
#define COM_DEV 0x2000
static void
pci_uart_intr_assert(void *arg)
{
struct pci_devinst *pi = arg;
pci_lintr_assert(pi);
}
static void
pci_uart_intr_deassert(void *arg)
{
struct pci_devinst *pi = arg;
pci_lintr_deassert(pi);
}
static void
pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
assert(baridx == 0);
assert(size == 1);
uart_write(pi->pi_arg, offset, value);
}
static uint64_t
pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
uint8_t val;
assert(baridx == 0);
assert(size == 1);
val = uart_read(pi->pi_arg, offset);
return (val);
}
static int
pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct uart_softc *sc;
pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE);
pci_lintr_request(pi);
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV);
pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM);
sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi);
pi->pi_arg = sc;
if (uart_set_backend(sc, opts) != 0) {
fprintf(stderr, "Unable to initialize backend '%s' for "
"pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func);
return (-1);
}
return (0);
}
struct pci_devemu pci_de_com = {
.pe_emu = "uart",
.pe_init = pci_uart_init,
.pe_barwrite = pci_uart_write,
.pe_barread = pci_uart_read
};
PCI_EMUL_SET(pci_de_com);
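/*
 * Illustrative usage (assumed command line; see bhyve(8) for the
 * authoritative syntax): attach this device in PCI slot 2 with the
 * backend on the invoking terminal:
 *
 *	bhyve ... -s 2,uart,stdio vmname
 */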

410
bhyve/pci_virtio_block.c Normal file
View file

@ -0,0 +1,410 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <md5.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "virtio.h"
#include "block_if.h"
#define VTBLK_RINGSZ 64
#define VTBLK_S_OK 0
#define VTBLK_S_IOERR 1
#define VTBLK_S_UNSUPP 2
#define VTBLK_BLK_ID_BYTES 20
/* Capability bits */
#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */
#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */
#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */
#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */
/*
* Host capabilities
*/
#define VTBLK_S_HOSTCAPS \
( VTBLK_F_SEG_MAX | \
VTBLK_F_BLK_SIZE | \
VTBLK_F_FLUSH | \
VTBLK_F_TOPOLOGY | \
VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
/*
* Config space "registers"
*/
struct vtblk_config {
uint64_t vbc_capacity;
uint32_t vbc_size_max;
uint32_t vbc_seg_max;
struct {
uint16_t cylinders;
uint8_t heads;
uint8_t sectors;
} vbc_geometry;
uint32_t vbc_blk_size;
struct {
uint8_t physical_block_exp;
uint8_t alignment_offset;
uint16_t min_io_size;
uint32_t opt_io_size;
} vbc_topology;
uint8_t vbc_writeback;
} __packed;
/*
* Fixed-size block header
*/
struct virtio_blk_hdr {
#define VBH_OP_READ 0
#define VBH_OP_WRITE 1
#define VBH_OP_FLUSH 4
#define VBH_OP_FLUSH_OUT 5
#define VBH_OP_IDENT 8
#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
uint32_t vbh_type;
uint32_t vbh_ioprio;
uint64_t vbh_sector;
} __packed;
/*
* Debug printf
*/
static int pci_vtblk_debug;
#define DPRINTF(params) if (pci_vtblk_debug) printf params
#define WPRINTF(params) printf params
struct pci_vtblk_ioreq {
struct blockif_req io_req;
struct pci_vtblk_softc *io_sc;
uint8_t *io_status;
uint16_t io_idx;
};
/*
* Per-device softc
*/
struct pci_vtblk_softc {
struct virtio_softc vbsc_vs;
pthread_mutex_t vsc_mtx;
struct vqueue_info vbsc_vq;
struct vtblk_config vbsc_cfg;
struct blockif_ctxt *bc;
char vbsc_ident[VTBLK_BLK_ID_BYTES];
struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
};
static void pci_vtblk_reset(void *);
static void pci_vtblk_notify(void *, struct vqueue_info *);
static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
static struct virtio_consts vtblk_vi_consts = {
"vtblk", /* our name */
1, /* we support 1 virtqueue */
sizeof(struct vtblk_config), /* config reg size */
pci_vtblk_reset, /* reset */
pci_vtblk_notify, /* device-wide qnotify */
pci_vtblk_cfgread, /* read PCI config */
pci_vtblk_cfgwrite, /* write PCI config */
NULL, /* apply negotiated features */
VTBLK_S_HOSTCAPS, /* our capabilities */
};
static void
pci_vtblk_reset(void *vsc)
{
struct pci_vtblk_softc *sc = vsc;
DPRINTF(("vtblk: device reset requested !\n"));
vi_reset_dev(&sc->vbsc_vs);
}
static void
pci_vtblk_done(struct blockif_req *br, int err)
{
struct pci_vtblk_ioreq *io = br->br_param;
struct pci_vtblk_softc *sc = io->io_sc;
/* convert errno into a virtio block error return */
if (err == EOPNOTSUPP || err == ENOSYS)
*io->io_status = VTBLK_S_UNSUPP;
else if (err != 0)
*io->io_status = VTBLK_S_IOERR;
else
*io->io_status = VTBLK_S_OK;
/*
* Return the descriptor back to the host.
* We wrote 1 byte (our status) to host.
*/
pthread_mutex_lock(&sc->vsc_mtx);
vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
vq_endchains(&sc->vbsc_vq, 0);
pthread_mutex_unlock(&sc->vsc_mtx);
}
static void
pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
{
struct virtio_blk_hdr *vbh;
struct pci_vtblk_ioreq *io;
int i, n;
int err;
ssize_t iolen;
int writeop, type;
off_t offset;
struct iovec iov[BLOCKIF_IOV_MAX + 2];
uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags);
/*
* The first descriptor will be the read-only fixed header,
* and the last is for status (hence +2 above and below).
* The remaining iov's are the actual data I/O vectors.
*
* XXX - note - this fails on crash dump, which does a
* VIRTIO_BLK_T_FLUSH with a zero transfer length
*/
assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2);
io = &sc->vbsc_ios[idx];
assert((flags[0] & VRING_DESC_F_WRITE) == 0);
assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
vbh = iov[0].iov_base;
memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
io->io_req.br_iovcnt = n - 2;
offset = vbh->vbh_sector * DEV_BSIZE;
io->io_req.br_offset = offset;
io->io_status = iov[--n].iov_base;
assert(iov[n].iov_len == 1);
assert(flags[n] & VRING_DESC_F_WRITE);
/*
* XXX
* The guest should not be setting the BARRIER flag because
* we don't advertise the capability.
*/
type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
writeop = (type == VBH_OP_WRITE);
iolen = 0;
for (i = 1; i < n; i++) {
/*
* - write op implies read-only descriptor,
* - read/ident op implies write-only descriptor,
* therefore test the inverse of the descriptor bit
* to the op.
*/
assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
iolen += iov[i].iov_len;
}
io->io_req.br_resid = iolen;
DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r",
writeop ? "write" : "read/ident", iolen, i - 1, offset));
switch (type) {
case VBH_OP_READ:
err = blockif_read(sc->bc, &io->io_req);
break;
case VBH_OP_WRITE:
err = blockif_write(sc->bc, &io->io_req);
break;
case VBH_OP_FLUSH:
case VBH_OP_FLUSH_OUT:
err = blockif_flush(sc->bc, &io->io_req);
break;
case VBH_OP_IDENT:
/* Assume a single buffer. */
/* A serial number that fills the buffer is not zero-terminated. */
memset(iov[1].iov_base, 0, iov[1].iov_len);
strncpy(iov[1].iov_base, sc->vbsc_ident,
MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
pci_vtblk_done(&io->io_req, 0);
return;
default:
pci_vtblk_done(&io->io_req, EOPNOTSUPP);
return;
}
assert(err == 0);
}
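/*
 * Illustrative layout of an n-descriptor request chain as consumed by
 * pci_vtblk_proc() above:
 *
 *	iov[0]			read-only	struct virtio_blk_hdr
 *	iov[1] .. iov[n-2]	data		read-only for writes,
 *						write-only for reads/ident
 *	iov[n-1]		write-only	1-byte status (VTBLK_S_*)
 */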
static void
pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
{
struct pci_vtblk_softc *sc = vsc;
while (vq_has_descs(vq))
pci_vtblk_proc(sc, vq);
}
static int
pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
char bident[sizeof("XX:X:X")];
struct blockif_ctxt *bctxt;
MD5_CTX mdctx;
u_char digest[16];
struct pci_vtblk_softc *sc;
off_t size;
int i, sectsz, sts, sto;
if (opts == NULL) {
printf("virtio-block: backing device required\n");
return (1);
}
/*
* The supplied backing file has to exist
*/
snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
bctxt = blockif_open(opts, bident);
if (bctxt == NULL) {
perror("Could not open backing file");
return (1);
}
size = blockif_size(bctxt);
sectsz = blockif_sectsz(bctxt);
blockif_psectsz(bctxt, &sts, &sto);
sc = calloc(1, sizeof(struct pci_vtblk_softc));
sc->bc = bctxt;
for (i = 0; i < VTBLK_RINGSZ; i++) {
struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i];
io->io_req.br_callback = pci_vtblk_done;
io->io_req.br_param = io;
io->io_sc = sc;
io->io_idx = i;
}
pthread_mutex_init(&sc->vsc_mtx, NULL);
/* init virtio softc and virtqueues */
vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
/*
* Create an identifier for the backing file. Use parts of the
* md5 sum of the filename
*/
MD5Init(&mdctx);
MD5Update(&mdctx, opts, strlen(opts));
MD5Final(digest, &mdctx);
sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X",
digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
/* setup virtio block config space */
sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
sc->vbsc_cfg.vbc_seg_max = BLOCKIF_IOV_MAX;
sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */
sc->vbsc_cfg.vbc_geometry.heads = 0;
sc->vbsc_cfg.vbc_geometry.sectors = 0;
sc->vbsc_cfg.vbc_blk_size = sectsz;
sc->vbsc_cfg.vbc_topology.physical_block_exp =
(sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0;
sc->vbsc_cfg.vbc_topology.alignment_offset =
(sto != 0) ? ((sts - sto) / sectsz) : 0;
sc->vbsc_cfg.vbc_topology.min_io_size = 0;
sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
sc->vbsc_cfg.vbc_writeback = 0;
/*
* Should we move some of this into virtio.c? Could
* have the device, class, and subdev_0 as fields in
* the virtio constants structure.
*/
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) {
blockif_close(sc->bc);
free(sc);
return (1);
}
vi_set_io_bar(&sc->vbsc_vs, 0);
return (0);
}
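/*
 * Worked example for the topology fields above (assumed sizes): a
 * backing device reporting 512-byte logical sectors (sectsz) on
 * 4096-byte physical sectors (sts) with no offset (sto == 0) yields
 *
 *	physical_block_exp = ffsll(4096 / 512) - 1 = ffsll(8) - 1 = 3
 *
 * i.e. 2^3 = 8 logical blocks per physical block, and an
 * alignment_offset of 0.
 */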
static int
pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
return (1);
}
static int
pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
struct pci_vtblk_softc *sc = vsc;
void *ptr;
/* our caller has already verified offset and size */
ptr = (uint8_t *)&sc->vbsc_cfg + offset;
memcpy(retval, ptr, size);
return (0);
}
struct pci_devemu pci_de_vblk = {
.pe_emu = "virtio-blk",
.pe_init = pci_vtblk_init,
.pe_barwrite = vi_pci_write,
.pe_barread = vi_pci_read
};
PCI_EMUL_SET(pci_de_vblk);

730
bhyve/pci_virtio_net.c Normal file
View file

@ -0,0 +1,730 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <machine/atomic.h>
#include <net/ethernet.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"
#define VTNET_RINGSZ 1024
#define VTNET_MAXSEGS 32
/*
* Host capabilities. Note that we only offer a few of these.
*/
#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
#define VIRTIO_NET_F_GUEST_ANNOUNCE \
(1 << 21) /* guest can send gratuitous pkts */
#define VTNET_S_HOSTCAPS \
( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
VIRTIO_F_NOTIFY_ON_EMPTY)
/*
* PCI config-space "registers"
*/
struct virtio_net_config {
uint8_t mac[6];
uint16_t status;
} __packed;
/*
* Queue definitions.
*/
#define VTNET_RXQ 0
#define VTNET_TXQ 1
#define VTNET_CTLQ 2 /* NB: not yet supported */
#define VTNET_MAXQ 3
/*
* Fixed network header size
*/
struct virtio_net_rxhdr {
uint8_t vrh_flags;
uint8_t vrh_gso_type;
uint16_t vrh_hdr_len;
uint16_t vrh_gso_size;
uint16_t vrh_csum_start;
uint16_t vrh_csum_offset;
uint16_t vrh_bufs;
} __packed;
/*
* Debug printf
*/
static int pci_vtnet_debug;
#define DPRINTF(params) if (pci_vtnet_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtnet_softc {
struct virtio_softc vsc_vs;
struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
pthread_mutex_t vsc_mtx;
struct mevent *vsc_mevp;
int vsc_tapfd;
int vsc_rx_ready;
volatile int resetting; /* set and checked outside lock */
uint64_t vsc_features; /* negotiated features */
struct virtio_net_config vsc_config;
pthread_mutex_t rx_mtx;
int rx_in_progress;
int rx_vhdrlen;
int rx_merge; /* merged rx bufs in use */
pthread_t tx_tid;
pthread_mutex_t tx_mtx;
pthread_cond_t tx_cond;
int tx_in_progress;
};
static void pci_vtnet_reset(void *);
/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);
static struct virtio_consts vtnet_vi_consts = {
"vtnet", /* our name */
VTNET_MAXQ - 1, /* we currently support 2 virtqueues */
sizeof(struct virtio_net_config), /* config reg size */
pci_vtnet_reset, /* reset */
NULL, /* device-wide qnotify -- not used */
pci_vtnet_cfgread, /* read PCI config */
pci_vtnet_cfgwrite, /* write PCI config */
pci_vtnet_neg_features, /* apply negotiated features */
VTNET_S_HOSTCAPS, /* our capabilities */
};
/*
* If the transmit thread is active then stall until it is done.
*/
static void
pci_vtnet_txwait(struct pci_vtnet_softc *sc)
{
pthread_mutex_lock(&sc->tx_mtx);
while (sc->tx_in_progress) {
pthread_mutex_unlock(&sc->tx_mtx);
usleep(10000);
pthread_mutex_lock(&sc->tx_mtx);
}
pthread_mutex_unlock(&sc->tx_mtx);
}
/*
* If the receive thread is active then stall until it is done.
*/
static void
pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
{
pthread_mutex_lock(&sc->rx_mtx);
while (sc->rx_in_progress) {
pthread_mutex_unlock(&sc->rx_mtx);
usleep(10000);
pthread_mutex_lock(&sc->rx_mtx);
}
pthread_mutex_unlock(&sc->rx_mtx);
}
static void
pci_vtnet_reset(void *vsc)
{
struct pci_vtnet_softc *sc = vsc;
DPRINTF(("vtnet: device reset requested !\n"));
sc->resetting = 1;
/*
* Wait for the transmit and receive threads to finish their
* processing.
*/
pci_vtnet_txwait(sc);
pci_vtnet_rxwait(sc);
sc->vsc_rx_ready = 0;
sc->rx_merge = 1;
sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
/* now reset rings, MSI-X vectors, and negotiated capabilities */
vi_reset_dev(&sc->vsc_vs);
sc->resetting = 0;
}
/*
* Called to send a buffer chain out to the tap device
*/
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
int len)
{
static char pad[60]; /* all zero bytes */
if (sc->vsc_tapfd == -1)
return;
/*
* If the frame is shorter than 60 bytes (the minimum Ethernet
* frame size less the FCS), pad it out with an extra zeroed
* segment. The caller guarantees that a spare iov slot is
* always available.
*/
if (len < 60) {
iov[iovcnt].iov_base = pad;
iov[iovcnt].iov_len = 60 - len;
iovcnt++;
}
(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
/*
* Called when there is read activity on the tap file descriptor.
* Each buffer posted by the guest is assumed to be able to contain
* an entire ethernet frame + rx header.
* MP note: the dummybuf is only used for discarding frames, so there
* is no need for it to be per-vtnet or locked.
*/
static uint8_t dummybuf[2048];
static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
struct iovec *riov;
/* XXX short-cut: assume first segment is >= tlen */
assert(iov[0].iov_len >= tlen);
iov[0].iov_len -= tlen;
if (iov[0].iov_len == 0) {
assert(*niov > 1);
*niov -= 1;
riov = &iov[1];
} else {
iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
riov = &iov[0];
}
return (riov);
}
static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
struct iovec iov[VTNET_MAXSEGS], *riov;
struct vqueue_info *vq;
void *vrx;
int len, n;
uint16_t idx;
/*
* Should never be called without a valid tap fd
*/
assert(sc->vsc_tapfd != -1);
/*
* It may, however, be called before the rx ring has been
* set up or while the guest is resetting the device.
*/
if (!sc->vsc_rx_ready || sc->resetting) {
/*
* Drop the packet and try later.
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
return;
}
/*
* Check for available rx buffers
*/
vq = &sc->vsc_queues[VTNET_RXQ];
if (!vq_has_descs(vq)) {
/*
* Drop the packet and try later. Interrupt on
* empty, if that's negotiated.
*/
(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
vq_endchains(vq, 1);
return;
}
do {
/*
* Get descriptor chain.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
/*
* Get a pointer to the rx header, and use the
* data immediately following it for the packet buffer.
*/
vrx = iov[0].iov_base;
riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
len = readv(sc->vsc_tapfd, riov, n);
if (len < 0 && errno == EWOULDBLOCK) {
/*
* No more packets, but still some avail ring
* entries. Interrupt if needed/appropriate.
*/
vq_retchain(vq);
vq_endchains(vq, 0);
return;
}
/*
* The only valid field in the rx packet header is the
* number of buffers if merged rx bufs were negotiated.
*/
memset(vrx, 0, sc->rx_vhdrlen);
if (sc->rx_merge) {
struct virtio_net_rxhdr *vrxh;
vrxh = vrx;
vrxh->vrh_bufs = 1;
}
/*
* Release this chain and handle more chains.
*/
vq_relchain(vq, idx, len + sc->rx_vhdrlen);
} while (vq_has_descs(vq));
/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
vq_endchains(vq, 1);
}
static void
pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
{
struct pci_vtnet_softc *sc = param;
pthread_mutex_lock(&sc->rx_mtx);
sc->rx_in_progress = 1;
pci_vtnet_tap_rx(sc);
sc->rx_in_progress = 0;
pthread_mutex_unlock(&sc->rx_mtx);
}
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
struct pci_vtnet_softc *sc = vsc;
/*
* A qnotify means that the rx process can now begin
*/
if (sc->vsc_rx_ready == 0) {
sc->vsc_rx_ready = 1;
vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
}
}
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
struct iovec iov[VTNET_MAXSEGS + 1];
int i, n;
int plen, tlen;
uint16_t idx;
/*
* Obtain chain of descriptors. The first one is
* really the header descriptor, so we need to sum
* up two lengths: packet length and transfer length.
*/
n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
assert(n >= 1 && n <= VTNET_MAXSEGS);
plen = 0;
tlen = iov[0].iov_len;
for (i = 1; i < n; i++) {
plen += iov[i].iov_len;
tlen += iov[i].iov_len;
}
DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen);
/* chain is processed, release it and set tlen */
vq_relchain(vq, idx, tlen);
}
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
struct pci_vtnet_softc *sc = vsc;
/*
* Any ring entries to process?
*/
if (!vq_has_descs(vq))
return;
/* Signal the tx thread for processing */
pthread_mutex_lock(&sc->tx_mtx);
vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
if (sc->tx_in_progress == 0)
pthread_cond_signal(&sc->tx_cond);
pthread_mutex_unlock(&sc->tx_mtx);
}
/*
* Thread which will handle processing of TX desc
*/
static void *
pci_vtnet_tx_thread(void *param)
{
struct pci_vtnet_softc *sc = param;
struct vqueue_info *vq;
int error;
vq = &sc->vsc_queues[VTNET_TXQ];
/*
* Wait until the tx queue pointers have been initialized and
* the first tx is signaled.
*/
pthread_mutex_lock(&sc->tx_mtx);
error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
assert(error == 0);
for (;;) {
/* note - tx mutex is locked here */
while (sc->resetting || !vq_has_descs(vq)) {
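/*
 * Re-enable guest notifications, then re-check for work after
 * a memory barrier: this closes the race where the guest
 * queues a descriptor between the loop test and this thread
 * going to sleep on the condition variable.
 */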
vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY;
mb();
if (!sc->resetting && vq_has_descs(vq))
break;
sc->tx_in_progress = 0;
error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
assert(error == 0);
}
vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY;
sc->tx_in_progress = 1;
pthread_mutex_unlock(&sc->tx_mtx);
do {
/*
* Run through entries, placing them into
* iovecs and sending when an end-of-packet
* is found
*/
pci_vtnet_proctx(sc, vq);
} while (vq_has_descs(vq));
/*
* Generate an interrupt if needed.
*/
vq_endchains(vq, 1);
pthread_mutex_lock(&sc->tx_mtx);
}
}
#ifdef notyet
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{
DPRINTF(("vtnet: control qnotify!\n\r"));
}
#endif
static int
pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
{
struct ether_addr *ea;
char *tmpstr;
char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
tmpstr = strsep(&mac_str,"=");
if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) {
ea = ether_aton(mac_str);
if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
fprintf(stderr, "Invalid MAC %s\n", mac_str);
return (EINVAL);
} else
memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
}
return (0);
}
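/*
 * Illustrative opts strings accepted by pci_vtnet_init() below, as
 * passed via "-s <slot>,virtio-net,<opts>":
 *
 *	"tap0"				tap backend, generated MAC
 *	"tap0,mac=00:a0:98:12:34:56"	tap backend, fixed MAC
 */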
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
MD5_CTX mdctx;
unsigned char digest[16];
char nstr[80];
char tname[MAXCOMLEN + 1];
struct pci_vtnet_softc *sc;
char *devname;
char *vtopts;
int mac_provided;
sc = calloc(1, sizeof(struct pci_vtnet_softc));
pthread_mutex_init(&sc->vsc_mtx, NULL);
vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
#ifdef notyet
sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
#endif
/*
* Attempt to open the tap device and read the MAC address
* if specified
*/
mac_provided = 0;
sc->vsc_tapfd = -1;
if (opts != NULL) {
char tbuf[80];
int err;
devname = vtopts = strdup(opts);
(void) strsep(&vtopts, ",");
if (vtopts != NULL) {
err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
if (err != 0) {
free(devname);
return (err);
}
mac_provided = 1;
}
strcpy(tbuf, "/dev/");
strlcat(tbuf, devname, sizeof(tbuf));
free(devname);
sc->vsc_tapfd = open(tbuf, O_RDWR);
if (sc->vsc_tapfd == -1) {
WPRINTF(("open of tap device %s failed\n", tbuf));
} else {
/*
* Set non-blocking and register for read
* notifications with the event loop
*/
int opt = 1;
if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
WPRINTF(("tap device O_NONBLOCK failed\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
EVF_READ,
pci_vtnet_tap_callback,
sc);
if (sc->vsc_mevp == NULL) {
WPRINTF(("Could not register event\n"));
close(sc->vsc_tapfd);
sc->vsc_tapfd = -1;
}
}
}
/*
* The default MAC address is the standard NetApp OUI of 00-a0-98,
* followed by an MD5 of the PCI slot/func number and dev name
*/
if (!mac_provided) {
snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
pi->pi_func, vmname);
MD5Init(&mdctx);
MD5Update(&mdctx, nstr, strlen(nstr));
MD5Final(digest, &mdctx);
sc->vsc_config.mac[0] = 0x00;
sc->vsc_config.mac[1] = 0xa0;
sc->vsc_config.mac[2] = 0x98;
sc->vsc_config.mac[3] = digest[0];
sc->vsc_config.mac[4] = digest[1];
sc->vsc_config.mac[5] = digest[2];
}
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
/* Link is up if we managed to open tap device. */
sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0);
/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
/* use BAR 0 to map config regs in IO space */
vi_set_io_bar(&sc->vsc_vs, 0);
sc->resetting = 0;
sc->rx_merge = 1;
sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
sc->rx_in_progress = 0;
pthread_mutex_init(&sc->rx_mtx, NULL);
/*
* Initialize tx semaphore & spawn TX processing thread.
* As of now, only one thread for TX desc processing is
* spawned.
*/
sc->tx_in_progress = 0;
pthread_mutex_init(&sc->tx_mtx, NULL);
pthread_cond_init(&sc->tx_cond, NULL);
pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
pi->pi_func);
pthread_set_name_np(sc->tx_tid, tname);
return (0);
}
static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
struct pci_vtnet_softc *sc = vsc;
void *ptr;
if (offset < 6) {
assert(offset + size <= 6);
/*
* The driver is allowed to change the MAC address
*/
ptr = &sc->vsc_config.mac[offset];
memcpy(ptr, &value, size);
} else {
/* silently ignore other writes */
DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
}
return (0);
}
static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
struct pci_vtnet_softc *sc = vsc;
void *ptr;
ptr = (uint8_t *)&sc->vsc_config + offset;
memcpy(retval, ptr, size);
return (0);
}
static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
struct pci_vtnet_softc *sc = vsc;
sc->vsc_features = negotiated_features;
if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
sc->rx_merge = 0;
/* non-merge rx header is 2 bytes shorter */
sc->rx_vhdrlen -= 2;
}
}
struct pci_devemu pci_de_vnet = {
.pe_emu = "virtio-net",
.pe_init = pci_vtnet_init,
.pe_barwrite = vi_pci_write,
.pe_barread = vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);

189
bhyve/pci_virtio_rnd.c Normal file
View file

@ -0,0 +1,189 @@
/*-
* Copyright (c) 2014 Nahanni Systems Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* virtio entropy device emulation.
* Randomness is sourced from /dev/random which does not block
* once it has been seeded at bootup.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/uio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "virtio.h"
#define VTRND_RINGSZ 64
static int pci_vtrnd_debug;
#define DPRINTF(params) if (pci_vtrnd_debug) printf params
#define WPRINTF(params) printf params
/*
* Per-device softc
*/
struct pci_vtrnd_softc {
struct virtio_softc vrsc_vs;
struct vqueue_info vrsc_vq;
pthread_mutex_t vrsc_mtx;
uint64_t vrsc_cfg;
int vrsc_fd;
};
static void pci_vtrnd_reset(void *);
static void pci_vtrnd_notify(void *, struct vqueue_info *);
static struct virtio_consts vtrnd_vi_consts = {
"vtrnd", /* our name */
1, /* we support 1 virtqueue */
0, /* config reg size */
pci_vtrnd_reset, /* reset */
pci_vtrnd_notify, /* device-wide qnotify */
NULL, /* read virtio config */
NULL, /* write virtio config */
NULL, /* apply negotiated features */
0, /* our capabilities */
};
static void
pci_vtrnd_reset(void *vsc)
{
struct pci_vtrnd_softc *sc;
sc = vsc;
DPRINTF(("vtrnd: device reset requested !\n"));
vi_reset_dev(&sc->vrsc_vs);
}
static void
pci_vtrnd_notify(void *vsc, struct vqueue_info *vq)
{
struct iovec iov;
struct pci_vtrnd_softc *sc;
int len;
uint16_t idx;
sc = vsc;
if (sc->vrsc_fd < 0) {
vq_endchains(vq, 0);
return;
}
while (vq_has_descs(vq)) {
vq_getchain(vq, &idx, &iov, 1, NULL);
len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len);
DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len));
/* Catastrophe if unable to read from /dev/random */
assert(len > 0);
/*
* Release this chain and handle more
*/
vq_relchain(vq, idx, len);
}
vq_endchains(vq, 1); /* Generate interrupt if appropriate. */
}
static int
pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
struct pci_vtrnd_softc *sc;
int fd;
int len;
uint8_t v;
/*
* Should always be able to open /dev/random.
*/
fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
assert(fd >= 0);
/*
* Check that device is seeded and non-blocking.
*/
len = read(fd, &v, sizeof(v));
if (len <= 0) {
WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len));
return (1);
}
sc = calloc(1, sizeof(struct pci_vtrnd_softc));
vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq);
sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx;
sc->vrsc_vq.vq_qsize = VTRND_RINGSZ;
/* keep /dev/random opened while emulating */
sc->vrsc_fd = fd;
/* initialize config space */
pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM);
pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO);
pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY);
pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix()))
return (1);
vi_set_io_bar(&sc->vrsc_vs, 0);
return (0);
}
struct pci_devemu pci_de_vrnd = {
.pe_emu = "virtio-rnd",
.pe_init = pci_vtrnd_init,
.pe_barwrite = vi_pci_write,
.pe_barread = vi_pci_read
};
PCI_EMUL_SET(pci_de_vrnd);

312
bhyve/pm.c Normal file
View file

@ -0,0 +1,312 @@
/*-
* Copyright (c) 2013 Hudson River Trading LLC
* Written by: John H. Baldwin <jhb@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/vmm.h>
#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <vmmapi.h>
#include "acpi.h"
#include "inout.h"
#include "mevent.h"
#include "pci_irq.h"
#include "pci_lpc.h"
static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mevent *power_button;
static sig_t old_power_handler;
/*
* Reset Control register at I/O port 0xcf9. Bit 2 forces a system
* reset when it transitions from 0 to 1. Bit 1 selects the type of
* reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard"
* reset.
*/
static int
reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int error;
static uint8_t reset_control;
if (bytes != 1)
return (-1);
if (in)
*eax = reset_control;
else {
reset_control = *eax;
/* Treat hard and soft resets the same. */
if (reset_control & 0x4) {
error = vm_suspend(ctx, VM_SUSPEND_RESET);
assert(error == 0 || errno == EALREADY);
}
}
return (0);
}
INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler);
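/*
 * Illustrative guest-side sketch: a reboot is requested by writing a
 * value with bit 2 set to port 0xcf9, e.g.
 *
 *	outb(0xcf9, 0x6);	// hard reset requested
 *
 * which the handler above maps to vm_suspend(VM_SUSPEND_RESET).
 */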
/*
* ACPI's SCI is a level-triggered interrupt.
*/
static int sci_active;
static void
sci_assert(struct vmctx *ctx)
{
if (sci_active)
return;
vm_isa_assert_irq(ctx, SCI_INT, SCI_INT);
sci_active = 1;
}
static void
sci_deassert(struct vmctx *ctx)
{
if (!sci_active)
return;
vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT);
sci_active = 0;
}
/*
* Power Management 1 Event Registers
*
* The only power management event supported is a power button upon
* receiving SIGTERM.
*/
static uint16_t pm1_enable, pm1_status;
#define PM1_TMR_STS 0x0001
#define PM1_BM_STS 0x0010
#define PM1_GBL_STS 0x0020
#define PM1_PWRBTN_STS 0x0100
#define PM1_SLPBTN_STS 0x0200
#define PM1_RTC_STS 0x0400
#define PM1_WAK_STS 0x8000
#define PM1_TMR_EN 0x0001
#define PM1_GBL_EN 0x0020
#define PM1_PWRBTN_EN 0x0100
#define PM1_SLPBTN_EN 0x0200
#define PM1_RTC_EN 0x0400
static void
sci_update(struct vmctx *ctx)
{
int need_sci;
/* See if the SCI should be active or not. */
need_sci = 0;
if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS))
need_sci = 1;
if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS))
need_sci = 1;
if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS))
need_sci = 1;
if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS))
need_sci = 1;
if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS))
need_sci = 1;
if (need_sci)
sci_assert(ctx);
else
sci_deassert(ctx);
}
static int
pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 2)
return (-1);
pthread_mutex_lock(&pm_lock);
if (in)
*eax = pm1_status;
else {
/*
* Writes are only permitted to clear certain bits by
* writing 1 to those flags.
*/
pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS |
PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS));
sci_update(ctx);
}
pthread_mutex_unlock(&pm_lock);
return (0);
}
static int
pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
if (bytes != 2)
return (-1);
pthread_mutex_lock(&pm_lock);
if (in)
*eax = pm1_enable;
else {
/*
* Only permit certain bits to be set. We never use
* the global lock, but ACPI-CA whines profusely if it
* can't set GBL_EN.
*/
pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN);
sci_update(ctx);
}
pthread_mutex_unlock(&pm_lock);
return (0);
}
INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler);
INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler);
static void
power_button_handler(int signal, enum ev_type type, void *arg)
{
struct vmctx *ctx;
ctx = arg;
pthread_mutex_lock(&pm_lock);
if (!(pm1_status & PM1_PWRBTN_STS)) {
pm1_status |= PM1_PWRBTN_STS;
sci_update(ctx);
}
pthread_mutex_unlock(&pm_lock);
}
/*
* Power Management 1 Control Register
*
* This is mostly unimplemented except that we wish to handle writes
* that set SLP_EN to handle S5 (soft power off).
*/
static uint16_t pm1_control;
#define PM1_SCI_EN 0x0001
#define PM1_SLP_TYP 0x1c00
#define PM1_SLP_EN 0x2000
#define PM1_ALWAYS_ZERO 0xc003
static int
pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
int error;
if (bytes != 2)
return (-1);
if (in)
*eax = pm1_control;
else {
/*
* Various bits are write-only or reserved, so force them
* to zero in pm1_control. Always preserve SCI_EN as OSPM
* can never change it.
*/
pm1_control = (pm1_control & PM1_SCI_EN) |
(*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO));
/*
* If SLP_EN is set, check for S5. Bhyve's _S5_ method
* says that '5' should be stored in SLP_TYP for S5.
*/
if (*eax & PM1_SLP_EN) {
if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) {
error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
assert(error == 0 || errno == EALREADY);
}
}
}
return (0);
}
INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler);
SYSRES_IO(PM1A_EVT_ADDR, 8);
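/*
 * Worked example: OSPM performing S5 (soft off) writes SLP_TYP = 5
 * and SLP_EN to PM1A_CNT_ADDR:
 *
 *	value = (5 << 10) | PM1_SLP_EN;	// 0x1400 | 0x2000 = 0x3400
 *
 * which pm1_control_handler() above maps to
 * vm_suspend(VM_SUSPEND_POWEROFF).
 */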
/*
* ACPI SMI Command Register
*
* This write-only register is used to enable and disable ACPI.
*/
static int
smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(!in);
if (bytes != 1)
return (-1);
pthread_mutex_lock(&pm_lock);
switch (*eax) {
case BHYVE_ACPI_ENABLE:
pm1_control |= PM1_SCI_EN;
if (power_button == NULL) {
power_button = mevent_add(SIGTERM, EVF_SIGNAL,
power_button_handler, ctx);
old_power_handler = signal(SIGTERM, SIG_IGN);
}
break;
case BHYVE_ACPI_DISABLE:
pm1_control &= ~PM1_SCI_EN;
if (power_button != NULL) {
mevent_delete(power_button);
power_button = NULL;
signal(SIGTERM, old_power_handler);
}
break;
}
pthread_mutex_unlock(&pm_lock);
return (0);
}
INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler);
SYSRES_IO(SMI_CMD, 1);
void
sci_init(struct vmctx *ctx)
{
/*
* Mark ACPI's SCI as level trigger and bump its use count
* in the PIRQ router.
*/
pci_irq_use(SCI_INT);
vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER);
}

53
bhyve/post.c Normal file
View file

@ -0,0 +1,53 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include "inout.h"
#include "pci_lpc.h"
static int
post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
uint32_t *eax, void *arg)
{
assert(in == 1);
if (bytes != 1)
return (-1);
*eax = 0xff; /* return some garbage */
return (0);
}
INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
SYSRES_IO(0x84, 1);

129
bhyve/rtc.c Normal file
View file

@ -0,0 +1,129 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <time.h>
#include <assert.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "acpi.h"
#include "pci_lpc.h"
#include "rtc.h"
#define IO_RTC 0x70
#define RTC_LMEM_LSB 0x34
#define RTC_LMEM_MSB 0x35
#define RTC_HMEM_LSB 0x5b
#define RTC_HMEM_SB 0x5c
#define RTC_HMEM_MSB 0x5d
#define m_64KB (64*1024)
#define m_16MB (16*1024*1024)
#define m_4GB (4ULL*1024*1024*1024)
/*
* Returns the current RTC time as the number of seconds since
* 00:00:00 Jan 1, 1970 UTC, shifted by the host's UTC offset when
* use_localtime is set.
*/
static time_t
rtc_time(struct vmctx *ctx, int use_localtime)
{
struct tm tm;
time_t t;
time(&t);
if (use_localtime) {
localtime_r(&t, &tm);
t = timegm(&tm);
}
return (t);
}
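/*
 * Example: at 17:00 UTC on a host in UTC-5, localtime_r() yields a
 * broken-down time of 12:00; timegm() then re-reads that as 12:00
 * UTC, so the guest RTC tracks the host's wall-clock time.
 */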
void
rtc_init(struct vmctx *ctx, int use_localtime)
{
size_t himem;
size_t lomem;
int err;
/* XXX init diag/reset code/equipment/checksum ? */
/*
* Report guest memory size in nvram cells as required by UEFI.
* Little-endian encoding.
* 0x34/0x35 - 64KB chunks above 16MB, below 4GB
* 0x5b/0x5c/0x5d - 64KB chunks above 4GB
*/
lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB;
err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem);
assert(err == 0);
err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8);
assert(err == 0);
himem = vm_get_highmem_size(ctx) / m_64KB;
err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem);
assert(err == 0);
err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8);
assert(err == 0);
err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16);
assert(err == 0);
err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime));
assert(err == 0);
}
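/*
 * Worked example (assumed sizes): with 1 GB of guest lowmem and no
 * highmem,
 *
 *	lomem = (1 GB - 16 MB) / 64 KB = 16128 = 0x3f00
 *
 * so nvram cell 0x34 reads 0x00, cell 0x35 reads 0x3f, and cells
 * 0x5b/0x5c/0x5d all read 0.
 */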
static void
rtc_dsdt(void)
{
dsdt_line("");
dsdt_line("Device (RTC)");
dsdt_line("{");
dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))");
dsdt_line(" Name (_CRS, ResourceTemplate ()");
dsdt_line(" {");
dsdt_indent(2);
dsdt_fixed_ioport(IO_RTC, 2);
dsdt_fixed_irq(8);
dsdt_unindent(2);
dsdt_line(" })");
dsdt_line("}");
}
LPC_DSDT(rtc_dsdt);
/*
* Reserve the extended RTC I/O ports although they are not emulated at this
* time.
*/
SYSRES_IO(0x72, 6);

34
bhyve/rtc.h Normal file
View file

@ -0,0 +1,34 @@
/*
* Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _RTC_H_
#define _RTC_H_
void rtc_init(struct vmctx *ctx, int use_localtime);
#endif /* _RTC_H_ */

827
bhyve/smbiostbl.c Normal file
View file

@ -0,0 +1,827 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <assert.h>
#include <errno.h>
#include <md5.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <uuid.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include "bhyverun.h"
#include "smbiostbl.h"
#define MB (1024*1024)
#define GB (1024ULL*1024*1024)
#define SMBIOS_BASE 0xF1000
/* (BHYVE_ACPI_BASE - SMBIOS_BASE) */
#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000)
#define SMBIOS_TYPE_BIOS 0
#define SMBIOS_TYPE_SYSTEM 1
#define SMBIOS_TYPE_CHASSIS 3
#define SMBIOS_TYPE_PROCESSOR 4
#define SMBIOS_TYPE_MEMARRAY 16
#define SMBIOS_TYPE_MEMDEVICE 17
#define SMBIOS_TYPE_MEMARRAYMAP 19
#define SMBIOS_TYPE_BOOT 32
#define SMBIOS_TYPE_EOT 127
struct smbios_structure {
uint8_t type;
uint8_t length;
uint16_t handle;
} __packed;
typedef int (*initializer_func_t)(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_template_entry {
struct smbios_structure *entry;
const char **strings;
initializer_func_t initializer;
};
/*
* SMBIOS Structure Table Entry Point
*/
#define SMBIOS_ENTRY_EANCHOR "_SM_"
#define SMBIOS_ENTRY_EANCHORLEN 4
#define SMBIOS_ENTRY_IANCHOR "_DMI_"
#define SMBIOS_ENTRY_IANCHORLEN 5
struct smbios_entry_point {
char eanchor[4]; /* anchor tag */
uint8_t echecksum; /* checksum of entry point structure */
uint8_t eplen; /* length in bytes of entry point */
uint8_t major; /* major version of the SMBIOS spec */
uint8_t minor; /* minor version of the SMBIOS spec */
uint16_t maxssize; /* maximum size in bytes of a struct */
uint8_t revision; /* entry point structure revision */
uint8_t format[5]; /* entry point rev-specific data */
char ianchor[5]; /* intermediate anchor tag */
uint8_t ichecksum; /* intermediate checksum */
uint16_t stlen; /* len in bytes of structure table */
uint32_t staddr; /* physical addr of structure table */
uint16_t stnum; /* number of structure table entries */
uint8_t bcdrev; /* BCD value representing DMI ver */
} __packed;
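/*
 * Both checksum fields follow the usual SMBIOS rule: all bytes of the
 * covered region, including the checksum byte itself, must sum to
 * zero mod 256. A minimal sketch of the computation (the helper name
 * is illustrative; the actual table-building code computes these
 * values when the tables are emitted):
 *
 *	static uint8_t
 *	smbios_checksum_sketch(const uint8_t *p, size_t len)
 *	{
 *		uint8_t sum;
 *
 *		for (sum = 0; len > 0; len--)
 *			sum += *p++;
 *		return (-sum);	// makes the region sum to zero
 *	}
 */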
/*
* BIOS Information
*/
#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */
#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */
#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */
#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */
#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */
#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */
#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */
#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */
#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */
struct smbios_table_type0 {
struct smbios_structure header;
uint8_t vendor; /* vendor string */
uint8_t version; /* version string */
uint16_t segment; /* address segment location */
uint8_t rel_date; /* release date */
uint8_t size; /* rom size */
uint64_t cflags; /* characteristics */
uint8_t xc_bytes[2]; /* characteristics ext bytes */
uint8_t sb_major_rel; /* system bios version */
uint8_t sb_minor_rele;
uint8_t ecfw_major_rel; /* embedded ctrl fw version */
uint8_t ecfw_minor_rel;
} __packed;
/*
* System Information
*/
#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */
struct smbios_table_type1 {
struct smbios_structure header;
uint8_t manufacturer; /* manufacturer string */
uint8_t product; /* product name string */
uint8_t version; /* version string */
uint8_t serial; /* serial number string */
uint8_t uuid[16]; /* uuid byte array */
uint8_t wakeup; /* wake-up event */
uint8_t sku; /* sku number string */
uint8_t family; /* family name string */
} __packed;
/*
* System Enclosure or Chassis
*/
#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */
#define SMBIOS_CHST_SAFE 0x03 /* safe */
#define SMBIOS_CHSC_NONE 0x03 /* none */
struct smbios_table_type3 {
struct smbios_structure header;
uint8_t manufacturer; /* manufacturer string */
uint8_t type; /* type */
uint8_t version; /* version string */
uint8_t serial; /* serial number string */
uint8_t asset; /* asset tag string */
uint8_t bustate; /* boot-up state */
uint8_t psstate; /* power supply state */
uint8_t tstate; /* thermal state */
uint8_t security; /* security status */
uint8_t uheight; /* height in 'u's */
uint8_t cords; /* number of power cords */
uint8_t elems; /* number of element records */
uint8_t elemlen; /* length of records */
uint8_t sku; /* sku number string */
} __packed;
/*
* Processor Information
*/
#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */
#define SMBIOS_PRF_OTHER 0x01 /* other */
#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */
#define SMBIOS_PRS_ENABLED 0x1 /* enabled */
#define SMBIOS_PRU_NONE 0x06 /* none */
#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */
struct smbios_table_type4 {
struct smbios_structure header;
uint8_t socket; /* socket designation string */
uint8_t type; /* processor type */
uint8_t family; /* processor family */
uint8_t manufacturer; /* manufacturer string */
uint64_t cpuid; /* processor cpuid */
uint8_t version; /* version string */
uint8_t voltage; /* voltage */
uint16_t clkspeed; /* ext clock speed in mhz */
uint16_t maxspeed; /* maximum speed in mhz */
uint16_t curspeed; /* current speed in mhz */
uint8_t status; /* status */
uint8_t upgrade; /* upgrade */
uint16_t l1handle; /* l1 cache handle */
uint16_t l2handle; /* l2 cache handle */
uint16_t l3handle; /* l3 cache handle */
uint8_t serial; /* serial number string */
uint8_t asset; /* asset tag string */
uint8_t part; /* part number string */
uint8_t cores; /* cores per socket */
uint8_t ecores; /* enabled cores */
uint8_t threads; /* threads per socket */
uint16_t cflags; /* processor characteristics */
uint16_t family2; /* processor family 2 */
} __packed;
/*
* Physical Memory Array
*/
#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */
#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */
#define SMBIOS_MAE_NONE 0x03 /* none */
struct smbios_table_type16 {
struct smbios_structure header;
uint8_t location; /* physical device location */
uint8_t use; /* device functional purpose */
uint8_t ecc; /* err detect/correct method */
uint32_t size; /* max mem capacity in kb */
uint16_t errhand; /* handle of error (if any) */
uint16_t ndevs; /* num of slots or sockets */
uint64_t xsize; /* max mem capacity in bytes */
} __packed;
/*
* Memory Device
*/
#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */
#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */
#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */
struct smbios_table_type17 {
struct smbios_structure header;
uint16_t arrayhand; /* handle of physl mem array */
uint16_t errhand; /* handle of mem error data */
uint16_t twidth; /* total width in bits */
uint16_t dwidth; /* data width in bits */
	uint16_t	size;		/* size in kb or mb */
uint8_t form; /* form factor */
uint8_t set; /* set */
uint8_t dloc; /* device locator string */
uint8_t bloc; /* phys bank locator string */
uint8_t type; /* memory type */
uint16_t flags; /* memory characteristics */
uint16_t maxspeed; /* maximum speed in mhz */
uint8_t manufacturer; /* manufacturer string */
uint8_t serial; /* serial number string */
uint8_t asset; /* asset tag string */
uint8_t part; /* part number string */
uint8_t attributes; /* attributes */
	uint32_t	xsize;		/* extended size in mb */
uint16_t curspeed; /* current speed in mhz */
uint16_t minvoltage; /* minimum voltage */
uint16_t maxvoltage; /* maximum voltage */
uint16_t curvoltage; /* configured voltage */
} __packed;
/*
* Memory Array Mapped Address
*/
struct smbios_table_type19 {
struct smbios_structure header;
uint32_t saddr; /* start phys addr in kb */
uint32_t eaddr; /* end phys addr in kb */
uint16_t arrayhand; /* physical mem array handle */
uint8_t width; /* num of dev in row */
uint64_t xsaddr; /* start phys addr in bytes */
uint64_t xeaddr; /* end phys addr in bytes */
} __packed;
/*
* System Boot Information
*/
#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */
struct smbios_table_type32 {
struct smbios_structure header;
uint8_t reserved[6];
uint8_t status; /* boot status */
} __packed;
/*
* End-of-Table
*/
struct smbios_table_type127 {
struct smbios_structure header;
} __packed;
struct smbios_table_type0 smbios_type0_template = {
{ SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 },
1, /* bios vendor string */
2, /* bios version string */
0xF000, /* bios address segment location */
3, /* bios release date */
0x0, /* bios size (64k * (n + 1) is the size in bytes) */
SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW |
SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD,
{ SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM },
0x0, /* bios major release */
0x0, /* bios minor release */
0xff, /* embedded controller firmware major release */
0xff /* embedded controller firmware minor release */
};
const char *smbios_type0_strings[] = {
"BHYVE", /* vendor string */
"1.00", /* bios version string */
"03/14/2014", /* bios release date string */
NULL
};
struct smbios_table_type1 smbios_type1_template = {
{ SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 },
1, /* manufacturer string */
2, /* product string */
3, /* version string */
4, /* serial number string */
{ 0 },
SMBIOS_WAKEUP_SWITCH,
5, /* sku string */
6 /* family string */
};
static int smbios_type1_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
const char *smbios_type1_strings[] = {
" ", /* manufacturer string */
"BHYVE", /* product name string */
"1.0", /* version string */
"None", /* serial number string */
"None", /* sku string */
" ", /* family name string */
NULL
};
struct smbios_table_type3 smbios_type3_template = {
{ SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 },
1, /* manufacturer string */
SMBIOS_CHT_UNKNOWN,
2, /* version string */
3, /* serial number string */
4, /* asset tag string */
SMBIOS_CHST_SAFE,
SMBIOS_CHST_SAFE,
SMBIOS_CHST_SAFE,
SMBIOS_CHSC_NONE,
0, /* height in 'u's (0=enclosure height unspecified) */
0, /* number of power cords (0=number unspecified) */
0, /* number of contained element records */
0, /* length of records */
5 /* sku number string */
};
const char *smbios_type3_strings[] = {
" ", /* manufacturer string */
"1.0", /* version string */
"None", /* serial number string */
"None", /* asset tag string */
"None", /* sku number string */
NULL
};
struct smbios_table_type4 smbios_type4_template = {
{ SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 },
1, /* socket designation string */
SMBIOS_PRT_CENTRAL,
SMBIOS_PRF_OTHER,
2, /* manufacturer string */
0, /* cpuid */
3, /* version string */
0, /* voltage */
0, /* external clock frequency in mhz (0=unknown) */
0, /* maximum frequency in mhz (0=unknown) */
0, /* current frequency in mhz (0=unknown) */
SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED,
SMBIOS_PRU_NONE,
-1, /* l1 cache handle */
-1, /* l2 cache handle */
-1, /* l3 cache handle */
4, /* serial number string */
5, /* asset tag string */
6, /* part number string */
0, /* cores per socket (0=unknown) */
0, /* enabled cores per socket (0=unknown) */
0, /* threads per socket (0=unknown) */
SMBIOS_PFL_64B,
SMBIOS_PRF_OTHER
};
const char *smbios_type4_strings[] = {
" ", /* socket designation string */
" ", /* manufacturer string */
" ", /* version string */
"None", /* serial number string */
"None", /* asset tag string */
"None", /* part number string */
NULL
};
static int smbios_type4_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type16 smbios_type16_template = {
{ SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 },
SMBIOS_MAL_SYSMB,
SMBIOS_MAU_SYSTEM,
SMBIOS_MAE_NONE,
0x80000000, /* max mem capacity in kb (0x80000000=use extended) */
-1, /* handle of error (if any) */
0, /* number of slots or sockets (TBD) */
0 /* extended maximum memory capacity in bytes (TBD) */
};
static int smbios_type16_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type17 smbios_type17_template = {
{ SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 },
-1, /* handle of physical memory array */
-1, /* handle of memory error data */
64, /* total width in bits including ecc */
64, /* data width in bits */
	0x7fff,	/* size in kb or mb (0x7fff=use extended) */
SMBIOS_MDFF_UNKNOWN,
0, /* set (0x00=none, 0xff=unknown) */
1, /* device locator string */
2, /* physical bank locator string */
SMBIOS_MDT_UNKNOWN,
SMBIOS_MDF_UNKNOWN,
0, /* maximum memory speed in mhz (0=unknown) */
3, /* manufacturer string */
4, /* serial number string */
5, /* asset tag string */
6, /* part number string */
0, /* attributes (0=unknown rank information) */
0, /* extended size in mb (TBD) */
0, /* current speed in mhz (0=unknown) */
0, /* minimum voltage in mv (0=unknown) */
0, /* maximum voltage in mv (0=unknown) */
0 /* configured voltage in mv (0=unknown) */
};
const char *smbios_type17_strings[] = {
" ", /* device locator string */
" ", /* physical bank locator string */
" ", /* manufacturer string */
"None", /* serial number string */
"None", /* asset tag string */
"None", /* part number string */
NULL
};
static int smbios_type17_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type19 smbios_type19_template = {
{ SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 },
0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */
0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */
-1, /* physical memory array handle */
1, /* number of devices that form a row */
	0,		/* extended starting phys addr in bytes (TBD) */
	0		/* extended ending phys addr in bytes (TBD) */
};
static int smbios_type19_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
struct smbios_table_type32 smbios_type32_template = {
{ SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 },
{ 0, 0, 0, 0, 0, 0 },
SMBIOS_BOOT_NORMAL
};
struct smbios_table_type127 smbios_type127_template = {
{ SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 }
};
static int smbios_generic_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size);
static struct smbios_template_entry smbios_template[] = {
{ (struct smbios_structure *)&smbios_type0_template,
smbios_type0_strings,
smbios_generic_initializer },
{ (struct smbios_structure *)&smbios_type1_template,
smbios_type1_strings,
smbios_type1_initializer },
{ (struct smbios_structure *)&smbios_type3_template,
smbios_type3_strings,
smbios_generic_initializer },
{ (struct smbios_structure *)&smbios_type4_template,
smbios_type4_strings,
smbios_type4_initializer },
{ (struct smbios_structure *)&smbios_type16_template,
NULL,
smbios_type16_initializer },
{ (struct smbios_structure *)&smbios_type17_template,
smbios_type17_strings,
smbios_type17_initializer },
{ (struct smbios_structure *)&smbios_type19_template,
NULL,
smbios_type19_initializer },
{ (struct smbios_structure *)&smbios_type32_template,
NULL,
smbios_generic_initializer },
{ (struct smbios_structure *)&smbios_type127_template,
NULL,
smbios_generic_initializer },
	{ NULL, NULL, NULL }
};
static uint64_t guest_lomem, guest_himem;
static uint16_t type16_handle;
static int
smbios_generic_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_structure *entry;
memcpy(curaddr, template_entry, template_entry->length);
entry = (struct smbios_structure *)curaddr;
entry->handle = *n + 1;
curaddr += entry->length;
if (template_strings != NULL) {
int i;
for (i = 0; template_strings[i] != NULL; i++) {
const char *string;
int len;
string = template_strings[i];
len = strlen(string) + 1;
memcpy(curaddr, string, len);
curaddr += len;
}
*curaddr = '\0';
curaddr++;
} else {
/* Minimum string section is double nul */
*curaddr = '\0';
curaddr++;
*curaddr = '\0';
curaddr++;
}
(*n)++;
*endaddr = curaddr;
return (0);
}
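/*
 * Example of the resulting layout for the type 0 entry: the fixed
 * structure is followed by its string-set, one NUL-terminated string
 * per 1-based index, closed by an extra NUL:
 *
 *	[struct smbios_table_type0]["BHYVE\0" "1.00\0" "03/14/2014\0" "\0"]
 *
 * The string members of the structure (vendor, version, rel_date)
 * store the 1-based index of the corresponding string.
 */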
static int
smbios_type1_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type1 *type1;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type1 = (struct smbios_table_type1 *)curaddr;
if (guest_uuid_str != NULL) {
uuid_t uuid;
uint32_t status;
uuid_from_string(guest_uuid_str, &uuid, &status);
if (status != uuid_s_ok)
return (-1);
uuid_enc_le(&type1->uuid, &uuid);
} else {
MD5_CTX mdctx;
u_char digest[16];
char hostname[MAXHOSTNAMELEN];
/*
		 * Universally unique and yet reproducible is an
		 * oxymoron; however, reproducibility is desirable
		 * in this case.
*/
if (gethostname(hostname, sizeof(hostname)))
return (-1);
MD5Init(&mdctx);
MD5Update(&mdctx, vmname, strlen(vmname));
MD5Update(&mdctx, hostname, sizeof(hostname));
MD5Final(digest, &mdctx);
/*
* Set the variant and version number.
*/
digest[6] &= 0x0F;
digest[6] |= 0x30; /* version 3 */
digest[8] &= 0x3F;
digest[8] |= 0x80;
memcpy(&type1->uuid, digest, sizeof (digest));
}
return (0);
}
static int
smbios_type4_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
int i;
for (i = 0; i < guest_ncpus; i++) {
struct smbios_table_type4 *type4;
char *p;
int nstrings, len;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type4 = (struct smbios_table_type4 *)curaddr;
p = curaddr + sizeof (struct smbios_table_type4);
nstrings = 0;
while (p < *endaddr - 1) {
if (*p++ == '\0')
nstrings++;
}
len = sprintf(*endaddr - 1, "CPU #%d", i) + 1;
*endaddr += len - 1;
*(*endaddr) = '\0';
(*endaddr)++;
type4->socket = nstrings + 1;
curaddr = *endaddr;
}
return (0);
}
static int
smbios_type16_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type16 *type16;
type16_handle = *n;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type16 = (struct smbios_table_type16 *)curaddr;
type16->xsize = guest_lomem + guest_himem;
type16->ndevs = guest_himem > 0 ? 2 : 1;
return (0);
}
static int
smbios_type17_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type17 *type17;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type17 = (struct smbios_table_type17 *)curaddr;
type17->arrayhand = type16_handle;
type17->xsize = guest_lomem;
if (guest_himem > 0) {
curaddr = *endaddr;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type17 = (struct smbios_table_type17 *)curaddr;
type17->arrayhand = type16_handle;
type17->xsize = guest_himem;
}
return (0);
}
static int
smbios_type19_initializer(struct smbios_structure *template_entry,
const char **template_strings, char *curaddr, char **endaddr,
uint16_t *n, uint16_t *size)
{
struct smbios_table_type19 *type19;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type19 = (struct smbios_table_type19 *)curaddr;
type19->arrayhand = type16_handle;
type19->xsaddr = 0;
type19->xeaddr = guest_lomem;
if (guest_himem > 0) {
curaddr = *endaddr;
smbios_generic_initializer(template_entry, template_strings,
curaddr, endaddr, n, size);
type19 = (struct smbios_table_type19 *)curaddr;
type19->arrayhand = type16_handle;
type19->xsaddr = 4*GB;
type19->xeaddr = guest_himem;
}
return (0);
}
static void
smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr)
{
memset(smbios_ep, 0, sizeof(*smbios_ep));
memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR,
SMBIOS_ENTRY_EANCHORLEN);
smbios_ep->eplen = 0x1F;
assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen);
smbios_ep->major = 2;
smbios_ep->minor = 6;
smbios_ep->revision = 0;
memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR,
SMBIOS_ENTRY_IANCHORLEN);
smbios_ep->staddr = staddr;
	smbios_ep->bcdrev = 0x26;
}
static void
smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len,
uint16_t num, uint16_t maxssize)
{
uint8_t checksum;
int i;
smbios_ep->maxssize = maxssize;
smbios_ep->stlen = len;
smbios_ep->stnum = num;
checksum = 0;
for (i = 0x10; i < 0x1f; i++) {
checksum -= ((uint8_t *)smbios_ep)[i];
}
smbios_ep->ichecksum = checksum;
checksum = 0;
for (i = 0; i < 0x1f; i++) {
checksum -= ((uint8_t *)smbios_ep)[i];
}
smbios_ep->echecksum = checksum;
}
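/*
 * Both checksums are computed so that the bytes they cover sum to
 * zero modulo 256.  For illustration, a consumer might verify the
 * 0x1f-byte entry point with a hypothetical helper like:
 *
 *	static int
 *	smbios_ep_verify(const uint8_t *ep)
 *	{
 *		uint8_t sum = 0;
 *		int i;
 *
 *		for (i = 0; i < 0x1f; i++)
 *			sum += ep[i];
 *		return (sum == 0);
 *	}
 */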
int
smbios_build(struct vmctx *ctx)
{
struct smbios_entry_point *smbios_ep;
uint16_t n;
uint16_t maxssize;
char *curaddr, *startaddr, *ststartaddr;
int i;
int err;
guest_lomem = vm_get_lowmem_size(ctx);
guest_himem = vm_get_highmem_size(ctx);
startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH);
if (startaddr == NULL) {
fprintf(stderr, "smbios table requires mapped mem\n");
return (ENOMEM);
}
curaddr = startaddr;
smbios_ep = (struct smbios_entry_point *)curaddr;
smbios_ep_initializer(smbios_ep, SMBIOS_BASE +
sizeof(struct smbios_entry_point));
curaddr += sizeof(struct smbios_entry_point);
ststartaddr = curaddr;
n = 0;
maxssize = 0;
for (i = 0; smbios_template[i].entry != NULL; i++) {
struct smbios_structure *entry;
const char **strings;
initializer_func_t initializer;
char *endaddr;
uint16_t size;
entry = smbios_template[i].entry;
strings = smbios_template[i].strings;
initializer = smbios_template[i].initializer;
err = (*initializer)(entry, strings, curaddr, &endaddr,
&n, &size);
if (err != 0)
return (err);
if (size > maxssize)
maxssize = size;
curaddr = endaddr;
}
assert(curaddr - startaddr < SMBIOS_MAX_LENGTH);
smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize);
return (0);
}
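/*
 * Typical use (sketch): called once during VM bring-up, after guest
 * memory has been mapped, e.g.
 *
 *	error = smbios_build(ctx);
 *	assert(error == 0);
 */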

36
bhyve/smbiostbl.h Normal file
View file

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SMBIOSTBL_H_
#define _SMBIOSTBL_H_
struct vmctx;
int smbios_build(struct vmctx *ctx);
#endif /* _SMBIOSTBL_H_ */

104
bhyve/spinup_ap.c Normal file
View file

@ -0,0 +1,104 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "bhyverun.h"
#include "spinup_ap.h"
static void
spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
{
int vector, error;
uint16_t cs;
uint64_t desc_base;
uint32_t desc_limit, desc_access;
vector = *rip >> PAGE_SHIFT;
*rip = 0;
/*
* Update the %cs and %rip of the guest so that it starts
	 * executing real mode code at 'vector << 12'.
*/
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
assert(error == 0);
error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
&desc_limit, &desc_access);
assert(error == 0);
desc_base = vector << PAGE_SHIFT;
error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
assert(error == 0);
cs = (vector << PAGE_SHIFT) >> 4;
error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
assert(error == 0);
}
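/*
 * Worked example: a startup IPI with *rip = 0x9000 yields vector 9,
 * so the AP begins with %cs base 0x9000, %cs selector 0x0900 and
 * %rip 0, i.e. real mode execution starts at 0900:0000.
 */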
int
spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
{
int error;
assert(newcpu != 0);
assert(newcpu < guest_ncpus);
error = vcpu_reset(ctx, newcpu);
assert(error == 0);
fbsdrun_set_capabilities(ctx, newcpu);
/*
* Enable the 'unrestricted guest' mode for 'newcpu'.
*
* Set up the processor state in power-on 16-bit mode, with the CS:IP
* init'd to the specified low-mem 4K page.
*/
error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
assert(error == 0);
spinup_ap_realmode(ctx, newcpu, &rip);
fbsdrun_addcpu(ctx, vcpu, newcpu, rip);
return (newcpu);
}

34
bhyve/spinup_ap.h Normal file
View file

@ -0,0 +1,34 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SPINUP_AP_H_
#define _SPINUP_AP_H_
int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
#endif

939
bhyve/task_switch.c Normal file
View file

@ -0,0 +1,939 @@
/*-
* Copyright (c) 2014 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>
#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <vmmapi.h>
#include "bhyverun.h"
/*
* Using 'struct i386tss' is tempting but causes myriad sign extension
* issues because all of its fields are defined as signed integers.
*/
struct tss32 {
uint16_t tss_link;
uint16_t rsvd1;
uint32_t tss_esp0;
uint16_t tss_ss0;
uint16_t rsvd2;
uint32_t tss_esp1;
uint16_t tss_ss1;
uint16_t rsvd3;
uint32_t tss_esp2;
uint16_t tss_ss2;
uint16_t rsvd4;
uint32_t tss_cr3;
uint32_t tss_eip;
uint32_t tss_eflags;
uint32_t tss_eax;
uint32_t tss_ecx;
uint32_t tss_edx;
uint32_t tss_ebx;
uint32_t tss_esp;
uint32_t tss_ebp;
uint32_t tss_esi;
uint32_t tss_edi;
uint16_t tss_es;
uint16_t rsvd5;
uint16_t tss_cs;
uint16_t rsvd6;
uint16_t tss_ss;
uint16_t rsvd7;
uint16_t tss_ds;
uint16_t rsvd8;
uint16_t tss_fs;
uint16_t rsvd9;
uint16_t tss_gs;
uint16_t rsvd10;
uint16_t tss_ldt;
uint16_t rsvd11;
uint16_t tss_trap;
uint16_t tss_iomap;
};
CTASSERT(sizeof(struct tss32) == 104);
#define SEL_START(sel) (((sel) & ~0x7))
#define SEL_LIMIT(sel) (((sel) | 0x7))
#define TSS_BUSY(type) (((type) & 0x2) != 0)
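/*
 * Example: for selector 0x001b (table index 3, RPL 3), SEL_START()
 * yields 0x18 and SEL_LIMIT() yields 0x1f, the offsets of the first
 * and last byte of its 8-byte descriptor within the table.
 */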
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
uint64_t val;
int error;
error = vm_get_register(ctx, vcpu, reg, &val);
assert(error == 0);
return (val);
}
static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
int error;
error = vm_set_register(ctx, vcpu, reg, val);
assert(error == 0);
}
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
struct seg_desc seg_desc;
seg_desc.base = (u_int)USD_GETBASE(usd);
if (usd->sd_gran)
seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
else
seg_desc.limit = (u_int)USD_GETLIMIT(usd);
seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
seg_desc.access |= usd->sd_xx << 12;
seg_desc.access |= usd->sd_def32 << 14;
seg_desc.access |= usd->sd_gran << 15;
return (seg_desc);
}
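/*
 * Example: a flat segment with sd_gran set and a raw 20-bit limit of
 * 0xfffff expands to seg_desc.limit = (0xfffff << 12) | 0xfff, i.e.
 * 0xffffffff, covering the full 4GB address space.
 */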
/*
* Inject an exception with an error code that is a segment selector.
* The format of the error code is described in section 6.13, "Error Code",
* Intel SDM volume 3.
*
* Bit 0 (EXT) denotes whether the exception occurred during delivery
* of an external event like an interrupt.
*
* Bit 1 (IDT) indicates whether the selector points to a gate descriptor
* in the IDT.
*
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
*/
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
/*
* Bit 2 from the selector is retained as-is in the error code.
*
* Bit 1 can be safely cleared because none of the selectors
* encountered during task switch emulation refer to a task
* gate in the IDT.
*
* Bit 0 is set depending on the value of 'ext'.
*/
sel &= ~0x3;
if (ext)
sel |= 0x1;
vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
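/*
 * Example: a faulting GDT selector 0x0028 encountered while delivering
 * an external interrupt (ext = 1) yields error code 0x0029: RPL bits
 * cleared, IDT bit clear, TI bit preserved, EXT bit set.
 */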
/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
* and non-zero otherwise.
*/
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
uint64_t base;
uint32_t limit, access;
int error, reg;
reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
assert(error == 0);
if (reg == VM_REG_GUEST_LDTR) {
if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
return (-1);
}
if (limit < SEL_LIMIT(sel))
return (-1);
else
return (0);
}
/*
* Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
* by the selector 'sel'.
*
* Returns 0 on success.
* Returns 1 if an exception was injected into the guest.
* Returns -1 otherwise.
*/
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint16_t sel, struct user_segment_descriptor *desc, bool doread,
int *faultptr)
{
struct iovec iov[2];
uint64_t base;
uint32_t limit, access;
int error, reg;
reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
assert(error == 0);
assert(limit >= SEL_LIMIT(sel));
error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
faultptr);
if (error || *faultptr)
return (error);
if (doread)
vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
else
vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
return (0);
}
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr));
}
static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr));
}
/*
* Read the TSS descriptor referenced by 'sel' into 'desc'.
*
* Returns 0 on success.
* Returns 1 if an exception was injected into the guest.
* Returns -1 otherwise.
*/
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
struct vm_guest_paging sup_paging;
int error;
assert(!ISLDT(sel));
assert(IDXSEL(sel) != 0);
/* Fetch the new TSS descriptor */
if (desc_table_limit_check(ctx, vcpu, sel)) {
if (ts->reason == TSR_IRET)
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
else
sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
return (1);
}
sup_paging = ts->paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr);
return (error);
}
static bool
code_desc(int sd_type)
{
/* code descriptor */
return ((sd_type & 0x18) == 0x18);
}
static bool
stack_desc(int sd_type)
{
/* writable data descriptor */
return ((sd_type & 0x1A) == 0x12);
}
static bool
data_desc(int sd_type)
{
/* data descriptor or a readable code descriptor */
return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}
static bool
ldt_desc(int sd_type)
{
return (sd_type == SDT_SYSLDT);
}
/*
* Validate the descriptor 'seg_desc' associated with 'segment'.
*/
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
int segment, struct seg_desc *seg_desc, int *faultptr)
{
struct vm_guest_paging sup_paging;
struct user_segment_descriptor usd;
int error, idtvec;
int cpl, dpl, rpl;
uint16_t sel, cs;
bool ldtseg, codeseg, stackseg, dataseg, conforming;
ldtseg = codeseg = stackseg = dataseg = false;
switch (segment) {
case VM_REG_GUEST_LDTR:
ldtseg = true;
break;
case VM_REG_GUEST_CS:
codeseg = true;
break;
case VM_REG_GUEST_SS:
stackseg = true;
break;
case VM_REG_GUEST_DS:
case VM_REG_GUEST_ES:
case VM_REG_GUEST_FS:
case VM_REG_GUEST_GS:
dataseg = true;
break;
default:
assert(0);
}
/* Get the segment selector */
sel = GETREG(ctx, vcpu, segment);
/* LDT selector must point into the GDT */
if (ldtseg && ISLDT(sel)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
/* Descriptor table limit check */
if (desc_table_limit_check(ctx, vcpu, sel)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
/* NULL selector */
if (IDXSEL(sel) == 0) {
/* Code and stack segment selectors cannot be NULL */
if (codeseg || stackseg) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
seg_desc->base = 0;
seg_desc->limit = 0;
seg_desc->access = 0x10000; /* unusable */
return (0);
}
/* Read the descriptor from the GDT/LDT */
sup_paging = ts->paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr);
if (error || *faultptr)
return (error);
/* Verify that the descriptor type is compatible with the segment */
if ((ldtseg && !ldt_desc(usd.sd_type)) ||
(codeseg && !code_desc(usd.sd_type)) ||
(dataseg && !data_desc(usd.sd_type)) ||
(stackseg && !stack_desc(usd.sd_type))) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
/* Segment must be marked present */
if (!usd.sd_p) {
if (ldtseg)
idtvec = IDT_TS;
else if (stackseg)
idtvec = IDT_SS;
else
idtvec = IDT_NP;
sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
return (1);
}
cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
cpl = cs & SEL_RPL_MASK;
rpl = sel & SEL_RPL_MASK;
dpl = usd.sd_dpl;
if (stackseg && (rpl != cpl || dpl != cpl)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
if (codeseg) {
conforming = (usd.sd_type & 0x4) ? true : false;
if ((conforming && (cpl < dpl)) ||
(!conforming && (cpl != dpl))) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
}
if (dataseg) {
/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
*/
if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
conforming = true;
else
conforming = false;
if (!conforming && (rpl > dpl || cpl > dpl)) {
sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
return (1);
}
}
*seg_desc = usd_to_seg_desc(&usd);
return (0);
}
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
uint32_t eip, struct tss32 *tss, struct iovec *iov)
{
/* General purpose registers */
tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
/* Segment selectors */
tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
/* eflags and eip */
tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
if (task_switch->reason == TSR_IRET)
tss->tss_eflags &= ~PSL_NT;
tss->tss_eip = eip;
/* Copy updated old TSS into guest memory */
vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}
static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
int error;
error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
assert(error == 0);
}
/*
* Update the vcpu registers to reflect the state of the new task.
*/
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
struct seg_desc seg_desc, seg_desc2;
uint64_t *pdpte, maxphyaddr, reserved;
uint32_t eflags;
int error, i;
bool nested;
nested = false;
if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
tss->tss_link = ot_sel;
nested = true;
}
eflags = tss->tss_eflags;
if (nested)
eflags |= PSL_NT;
/* LDTR */
SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
	/* PDBR */
if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
if (ts->paging.paging_mode == PAGING_MODE_PAE) {
/*
* XXX Assuming 36-bit MAXPHYADDR.
*/
maxphyaddr = (1UL << 36) - 1;
pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
for (i = 0; i < 4; i++) {
/* Check reserved bits if the PDPTE is valid */
if (!(pdpte[i] & 0x1))
continue;
/*
* Bits 2:1, 8:5 and bits above the processor's
* maximum physical address are reserved.
*/
reserved = ~maxphyaddr | 0x1E6;
if (pdpte[i] & reserved) {
vm_inject_gp(ctx, vcpu);
return (1);
}
}
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
}
SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
ts->paging.cr3 = tss->tss_cr3;
}
/* eflags and eip */
SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
/* General purpose registers */
SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
/* Segment selectors */
SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
/*
* If this is a nested task then write out the new TSS to update
* the previous link field.
*/
if (nested)
vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
/* Validate segment descriptors */
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
/*
* Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
*
* The SS and CS attribute checks on VM-entry are inter-dependent so
* we need to make sure that both segments are valid before updating
* either of them. This ensures that the VMCS state can pass the
* VM-entry checks so the guest can handle any exception injected
* during task switch emulation.
*/
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
faultptr);
if (error || *faultptr)
return (error);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
return (0);
}
/*
* Push an error code on the stack of the new task. This is needed if the
* task switch was triggered by a hardware exception that causes an error
* code to be saved (e.g. #PF).
*/
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
int task_type, uint32_t errcode, int *faultptr)
{
struct iovec iov[2];
struct seg_desc seg_desc;
int stacksize, bytes, error;
uint64_t gla, cr0, rflags;
uint32_t esp;
uint16_t stacksel;
*faultptr = 0;
cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
&seg_desc.limit, &seg_desc.access);
assert(error == 0);
/*
* Section "Error Code" in the Intel SDM vol 3: the error code is
* pushed on the stack as a doubleword or word (depending on the
* default interrupt, trap or task gate size).
*/
if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
bytes = 4;
else
bytes = 2;
/*
* PUSH instruction from Intel SDM vol 2: the 'B' flag in the
* stack-segment descriptor determines the size of the stack
* pointer outside of 64-bit mode.
*/
if (SEG_DESC_DEF32(seg_desc.access))
stacksize = 4;
else
stacksize = 2;
esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
esp -= bytes;
if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
&seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
*faultptr = 1;
return (0);
}
if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
vm_inject_ac(ctx, vcpu, 1);
*faultptr = 1;
return (0);
}
error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
iov, nitems(iov), faultptr);
if (error || *faultptr)
return (error);
vm_copyout(ctx, vcpu, &errcode, iov, bytes);
SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
return (0);
}
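/*
 * Example: for a 32-bit TSS (task_type SDT_SYS386BSY) the error code
 * is pushed as a 4-byte doubleword, so %esp drops by 4; with a stack
 * segment whose B flag is set the push address is computed with a
 * 32-bit stack pointer.
 */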
/*
* Evaluate return value from helper functions and potentially return to
* the VM run loop.
*/
#define CHKERR(error,fault) \
do { \
assert((error == 0) || (error == EFAULT)); \
if (error) \
return (VMEXIT_ABORT); \
else if (fault) \
return (VMEXIT_CONTINUE); \
} while (0)
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
struct seg_desc nt;
struct tss32 oldtss, newtss;
struct vm_task_switch *task_switch;
struct vm_guest_paging *paging, sup_paging;
struct user_segment_descriptor nt_desc, ot_desc;
struct iovec nt_iov[2], ot_iov[2];
uint64_t cr0, ot_base;
uint32_t eip, ot_lim, access;
int error, ext, fault, minlimit, nt_type, ot_type, vcpu;
enum task_switch_reason reason;
uint16_t nt_sel, ot_sel;
task_switch = &vmexit->u.task_switch;
nt_sel = task_switch->tsssel;
ext = vmexit->u.task_switch.ext;
reason = vmexit->u.task_switch.reason;
paging = &vmexit->u.task_switch.paging;
vcpu = *pvcpu;
assert(paging->cpu_mode == CPU_MODE_PROTECTED);
/*
* Calculate the instruction pointer to store in the old TSS.
*/
eip = vmexit->rip + vmexit->inst_length;
/*
* Section 4.6, "Access Rights" in Intel SDM Vol 3.
* The following page table accesses are implicitly supervisor mode:
* - accesses to GDT or LDT to load segment descriptors
* - accesses to the task state segment during task switch
*/
sup_paging = *paging;
sup_paging.cpl = 0; /* implicit supervisor mode */
/* Fetch the new TSS descriptor */
error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc,
&fault);
CHKERR(error, fault);
nt = usd_to_seg_desc(&nt_desc);
/* Verify the type of the new TSS */
nt_type = SEG_DESC_TYPE(nt.access);
if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
goto done;
}
/* TSS descriptor must have present bit set */
if (!SEG_DESC_PRESENT(nt.access)) {
sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
goto done;
}
/*
* TSS must have a minimum length of 104 bytes for a 32-bit TSS and
* 44 bytes for a 16-bit TSS.
*/
if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
minlimit = 104 - 1;
else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
minlimit = 44 - 1;
else
minlimit = 0;
assert(minlimit > 0);
if (nt.limit < minlimit) {
sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
goto done;
}
/* TSS must be busy if task switch is due to IRET */
if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
goto done;
}
/*
* TSS must be available (not busy) if task switch reason is
* CALL, JMP, exception or interrupt.
*/
if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
goto done;
}
/* Fetch the new TSS */
error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
CHKERR(error, fault);
vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
/* Get the old TSS selector from the guest's task register */
ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
/*
* This might happen if a task switch was attempted without
* ever loading the task register with LTR. In this case the
* TR would contain the values from power-on:
* (sel = 0, base = 0, limit = 0xffff).
*/
sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
goto done;
}
/* Get the old TSS base and limit from the guest's task register */
error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
&access);
assert(error == 0);
assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
ot_type = SEG_DESC_TYPE(access);
assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
/* Fetch the old TSS descriptor */
error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc,
&fault);
CHKERR(error, fault);
/* Get the old TSS */
error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
CHKERR(error, fault);
vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * is due to an IRET or JMP instruction.
*/
if (reason == TSR_IRET || reason == TSR_JMP) {
ot_desc.sd_type &= ~0x2;
error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
&ot_desc, &fault);
CHKERR(error, fault);
}
if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
return (VMEXIT_ABORT);
}
/* Save processor state in old TSS */
tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
/*
* If the task switch was triggered for any reason other than IRET
* then set the busy bit in the new TSS descriptor.
*/
if (reason != TSR_IRET) {
nt_desc.sd_type |= 0x2;
error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
&nt_desc, &fault);
CHKERR(error, fault);
}
/* Update task register to point at the new TSS */
SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
/* Update the hidden descriptor state of the task register */
nt = usd_to_seg_desc(&nt_desc);
update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
/* Set CR0.TS */
cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
/*
* We are now committed to the task switch. Any exceptions encountered
* after this point will be handled in the context of the new task and
* the saved instruction pointer will belong to the new task.
*/
error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
assert(error == 0);
/* Load processor state from new TSS */
error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
&fault);
CHKERR(error, fault);
/*
* Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
* caused an error code to be generated, this error code is copied
* to the stack of the new task.
*/
if (task_switch->errcode_valid) {
assert(task_switch->ext);
assert(task_switch->reason == TSR_IDT_GATE);
error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
task_switch->errcode, &fault);
CHKERR(error, fault);
}
/*
* Treatment of virtual-NMI blocking if NMI is delivered through
* a task gate.
*
* Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
* If the virtual NMIs VM-execution control is 1, VM entry injects
* an NMI, and delivery of the NMI causes a task switch that causes
* a VM exit, virtual-NMI blocking is in effect before the VM exit
* commences.
*
* Thus, virtual-NMI blocking is in effect at the time of the task
* switch VM exit.
*/
/*
* Treatment of virtual-NMI unblocking on IRET from NMI handler task.
*
* Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If the "virtual NMIs" control is 1, IRET removes any virtual-NMI
	 * blocking. This unblocking occurs even if IRET causes a fault.
*
* Thus, virtual-NMI blocking is cleared at the time of the task switch
* VM exit.
*/
/*
* If the task switch was triggered by an event delivered through
* the IDT then extinguish the pending event from the vcpu's
* exitintinfo.
*/
if (task_switch->reason == TSR_IDT_GATE) {
error = vm_set_intinfo(ctx, vcpu, 0);
assert(error == 0);
}
/*
* XXX should inject debug exception if 'T' bit is 1
*/
done:
return (VMEXIT_CONTINUE);
}

657
bhyve/uart_emul.c Normal file
View file

@ -0,0 +1,657 @@
/*-
* Copyright (c) 2012 NetApp, Inc.
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <dev/ic/ns16550.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <fcntl.h>
#include <termios.h>
#include <unistd.h>
#include <stdbool.h>
#include <string.h>
#include <pthread.h>
#include "mevent.h"
#include "uart_emul.h"
#define COM1_BASE 0x3F8
#define COM1_IRQ 4
#define COM2_BASE 0x2F8
#define COM2_IRQ 3
#define DEFAULT_RCLK 1843200
#define DEFAULT_BAUD 9600
#define FCR_RX_MASK 0xC0
#define MCR_OUT1 0x04
#define MCR_OUT2 0x08
#define MSR_DELTA_MASK 0x0f
#ifndef REG_SCR
#define REG_SCR com_scr
#endif
#define FIFOSZ 16
static bool uart_stdio; /* stdio in use for i/o */
static struct termios tio_stdio_orig;
static struct {
int baseaddr;
int irq;
bool inuse;
} uart_lres[] = {
{ COM1_BASE, COM1_IRQ, false},
{ COM2_BASE, COM2_IRQ, false},
};
#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0]))
struct fifo {
uint8_t buf[FIFOSZ];
int rindex; /* index to read from */
int windex; /* index to write to */
int num; /* number of characters in the fifo */
int size; /* size of the fifo */
};
struct ttyfd {
bool opened;
int fd; /* tty device file descriptor */
struct termios tio_orig, tio_new; /* I/O Terminals */
};
struct uart_softc {
pthread_mutex_t mtx; /* protects all softc elements */
uint8_t data; /* Data register (R/W) */
uint8_t ier; /* Interrupt enable register (R/W) */
uint8_t lcr; /* Line control register (R/W) */
uint8_t mcr; /* Modem control register (R/W) */
uint8_t lsr; /* Line status register (R/W) */
uint8_t msr; /* Modem status register (R/W) */
uint8_t fcr; /* FIFO control register (W) */
uint8_t scr; /* Scratch register (R/W) */
uint8_t dll; /* Baudrate divisor latch LSB */
uint8_t dlh; /* Baudrate divisor latch MSB */
struct fifo rxfifo;
struct mevent *mev;
struct ttyfd tty;
bool thre_int_pending; /* THRE interrupt pending */
void *arg;
uart_intr_func_t intr_assert;
uart_intr_func_t intr_deassert;
};
static void uart_drain(int fd, enum ev_type ev, void *arg);
static void
ttyclose(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig);
}
static void
ttyopen(struct ttyfd *tf)
{
tcgetattr(tf->fd, &tf->tio_orig);
tf->tio_new = tf->tio_orig;
cfmakeraw(&tf->tio_new);
tf->tio_new.c_cflag |= CLOCAL;
tcsetattr(tf->fd, TCSANOW, &tf->tio_new);
if (tf->fd == STDIN_FILENO) {
tio_stdio_orig = tf->tio_orig;
atexit(ttyclose);
}
}
static int
ttyread(struct ttyfd *tf)
{
unsigned char rb;
if (read(tf->fd, &rb, 1) == 1)
return (rb);
else
return (-1);
}
static void
ttywrite(struct ttyfd *tf, unsigned char wb)
{
(void)write(tf->fd, &wb, 1);
}
static void
rxfifo_reset(struct uart_softc *sc, int size)
{
char flushbuf[32];
struct fifo *fifo;
ssize_t nread;
int error;
fifo = &sc->rxfifo;
bzero(fifo, sizeof(struct fifo));
fifo->size = size;
if (sc->tty.opened) {
/*
* Flush any unread input from the tty buffer.
*/
while (1) {
nread = read(sc->tty.fd, flushbuf, sizeof(flushbuf));
if (nread != sizeof(flushbuf))
break;
}
/*
* Enable mevent to trigger when new characters are available
* on the tty fd.
*/
error = mevent_enable(sc->mev);
assert(error == 0);
}
}
static int
rxfifo_available(struct uart_softc *sc)
{
struct fifo *fifo;
fifo = &sc->rxfifo;
return (fifo->num < fifo->size);
}
static int
rxfifo_putchar(struct uart_softc *sc, uint8_t ch)
{
struct fifo *fifo;
int error;
fifo = &sc->rxfifo;
if (fifo->num < fifo->size) {
fifo->buf[fifo->windex] = ch;
fifo->windex = (fifo->windex + 1) % fifo->size;
fifo->num++;
if (!rxfifo_available(sc)) {
if (sc->tty.opened) {
/*
* Disable mevent callback if the FIFO is full.
*/
error = mevent_disable(sc->mev);
assert(error == 0);
}
}
return (0);
} else
return (-1);
}
static int
rxfifo_getchar(struct uart_softc *sc)
{
struct fifo *fifo;
int c, error, wasfull;
wasfull = 0;
fifo = &sc->rxfifo;
if (fifo->num > 0) {
if (!rxfifo_available(sc))
wasfull = 1;
c = fifo->buf[fifo->rindex];
fifo->rindex = (fifo->rindex + 1) % fifo->size;
fifo->num--;
if (wasfull) {
if (sc->tty.opened) {
error = mevent_enable(sc->mev);
assert(error == 0);
}
}
return (c);
} else
return (-1);
}
static int
rxfifo_numchars(struct uart_softc *sc)
{
struct fifo *fifo = &sc->rxfifo;
return (fifo->num);
}
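/*
 * The fifo is a plain ring buffer: 'windex' chases 'rindex' modulo
 * 'size' and 'num' tracks occupancy, so no sentinel slot is needed.
 * Example: two rxfifo_putchar() calls on an empty 16-entry fifo leave
 * rindex = 0, windex = 2, num = 2; two rxfifo_getchar() calls then
 * drain it, returning the characters in order.
 */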
static void
uart_opentty(struct uart_softc *sc)
{
ttyopen(&sc->tty);
sc->mev = mevent_add(sc->tty.fd, EVF_READ, uart_drain, sc);
assert(sc->mev != NULL);
}
/*
* The IIR returns a prioritized interrupt reason:
* - receive data available
* - transmit holding register empty
* - modem status change
*
* Return an interrupt reason if one is available.
*/
static int
uart_intr_reason(struct uart_softc *sc)
{
if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
return (IIR_RLS);
else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0)
return (IIR_RXTOUT);
else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
return (IIR_TXRDY);
else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
return (IIR_MLSC);
else
return (IIR_NOPEND);
}
static void
uart_reset(struct uart_softc *sc)
{
uint16_t divisor;
divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
sc->dll = divisor;
	sc->dlh = divisor >> 8;
rxfifo_reset(sc, 1); /* no fifo until enabled by software */
}
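/*
 * Example: with the default 1.8432 MHz reference clock at 9600 baud,
 * divisor = 1843200 / 9600 / 16 = 12, so dll = 12 and dlh = 0.
 */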
/*
* Toggle the COM port's intr pin depending on whether or not we have an
* interrupt condition to report to the processor.
*/
static void
uart_toggle_intr(struct uart_softc *sc)
{
uint8_t intr_reason;
intr_reason = uart_intr_reason(sc);
if (intr_reason == IIR_NOPEND)
(*sc->intr_deassert)(sc->arg);
else
(*sc->intr_assert)(sc->arg);
}
static void
uart_drain(int fd, enum ev_type ev, void *arg)
{
struct uart_softc *sc;
int ch;
sc = arg;
assert(fd == sc->tty.fd);
assert(ev == EVF_READ);
/*
	 * This routine is called in the context of the mevent thread.
	 * The softc lock is taken here to protect against concurrent
	 * access from a vCPU i/o exit.
*/
pthread_mutex_lock(&sc->mtx);
if ((sc->mcr & MCR_LOOPBACK) != 0) {
(void) ttyread(&sc->tty);
} else {
while (rxfifo_available(sc) &&
((ch = ttyread(&sc->tty)) != -1)) {
rxfifo_putchar(sc, ch);
}
uart_toggle_intr(sc);
}
pthread_mutex_unlock(&sc->mtx);
}
void
uart_write(struct uart_softc *sc, int offset, uint8_t value)
{
int fifosz;
uint8_t msr;
pthread_mutex_lock(&sc->mtx);
/*
* Take care of the special case DLAB accesses first
*/
if ((sc->lcr & LCR_DLAB) != 0) {
if (offset == REG_DLL) {
sc->dll = value;
goto done;
}
if (offset == REG_DLH) {
sc->dlh = value;
goto done;
}
}
switch (offset) {
case REG_DATA:
if (sc->mcr & MCR_LOOPBACK) {
if (rxfifo_putchar(sc, value) != 0)
sc->lsr |= LSR_OE;
} else if (sc->tty.opened) {
ttywrite(&sc->tty, value);
} /* else drop on floor */
sc->thre_int_pending = true;
break;
case REG_IER:
/*
		 * Apply mask so that bits 4-7 are 0; only the four
		 * low-order interrupt enable bits (0-3) can be set.
*/
sc->ier = value & 0x0F;
break;
case REG_FCR:
/*
		 * When moving from FIFO to 16450 mode and vice versa,
* the FIFO contents are reset.
*/
if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) {
fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1;
rxfifo_reset(sc, fifosz);
}
/*
* The FCR_ENABLE bit must be '1' for the programming
* of other FCR bits to be effective.
*/
if ((value & FCR_ENABLE) == 0) {
sc->fcr = 0;
} else {
if ((value & FCR_RCV_RST) != 0)
rxfifo_reset(sc, FIFOSZ);
sc->fcr = value &
(FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
}
break;
case REG_LCR:
sc->lcr = value;
break;
case REG_MCR:
/* Apply mask so that bits 5-7 are 0 */
sc->mcr = value & 0x1F;
msr = 0;
if (sc->mcr & MCR_LOOPBACK) {
/*
* In the loopback mode certain bits from the
* MCR are reflected back into MSR
*/
if (sc->mcr & MCR_RTS)
msr |= MSR_CTS;
if (sc->mcr & MCR_DTR)
msr |= MSR_DSR;
if (sc->mcr & MCR_OUT1)
msr |= MSR_RI;
if (sc->mcr & MCR_OUT2)
msr |= MSR_DCD;
}
/*
* Detect if there has been any change between the
* previous and the new value of MSR. If there is
* then assert the appropriate MSR delta bit.
*/
if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS))
sc->msr |= MSR_DCTS;
if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR))
sc->msr |= MSR_DDSR;
if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD))
sc->msr |= MSR_DDCD;
if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
sc->msr |= MSR_TERI;
/*
* Update the value of MSR while retaining the delta
* bits.
*/
sc->msr &= MSR_DELTA_MASK;
sc->msr |= msr;
break;
case REG_LSR:
/*
* Line status register is not meant to be written to
* during normal operation.
*/
break;
case REG_MSR:
/*
* As far as I can tell MSR is a read-only register.
*/
break;
case REG_SCR:
sc->scr = value;
break;
default:
break;
}
done:
uart_toggle_intr(sc);
pthread_mutex_unlock(&sc->mtx);
}
uint8_t
uart_read(struct uart_softc *sc, int offset)
{
uint8_t iir, intr_reason, reg;
pthread_mutex_lock(&sc->mtx);
/*
* Take care of the special case DLAB accesses first
*/
if ((sc->lcr & LCR_DLAB) != 0) {
if (offset == REG_DLL) {
reg = sc->dll;
goto done;
}
if (offset == REG_DLH) {
reg = sc->dlh;
goto done;
}
}
switch (offset) {
case REG_DATA:
reg = rxfifo_getchar(sc);
break;
case REG_IER:
reg = sc->ier;
break;
case REG_IIR:
iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0;
intr_reason = uart_intr_reason(sc);
/*
* Deal with side effects of reading the IIR register
*/
if (intr_reason == IIR_TXRDY)
sc->thre_int_pending = false;
iir |= intr_reason;
reg = iir;
break;
case REG_LCR:
reg = sc->lcr;
break;
case REG_MCR:
reg = sc->mcr;
break;
case REG_LSR:
/* Transmitter is always ready for more data */
sc->lsr |= LSR_TEMT | LSR_THRE;
/* Check for new receive data */
if (rxfifo_numchars(sc) > 0)
sc->lsr |= LSR_RXRDY;
else
sc->lsr &= ~LSR_RXRDY;
reg = sc->lsr;
/* The LSR_OE bit is cleared on LSR read */
sc->lsr &= ~LSR_OE;
break;
case REG_MSR:
/*
* MSR delta bits are cleared on read
*/
reg = sc->msr;
sc->msr &= ~MSR_DELTA_MASK;
break;
case REG_SCR:
reg = sc->scr;
break;
default:
reg = 0xFF;
break;
}
done:
uart_toggle_intr(sc);
pthread_mutex_unlock(&sc->mtx);
return (reg);
}
int
uart_legacy_alloc(int which, int *baseaddr, int *irq)
{
if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse)
return (-1);
uart_lres[which].inuse = true;
*baseaddr = uart_lres[which].baseaddr;
*irq = uart_lres[which].irq;
return (0);
}
struct uart_softc *
uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert,
void *arg)
{
struct uart_softc *sc;
sc = calloc(1, sizeof(struct uart_softc));
sc->arg = arg;
sc->intr_assert = intr_assert;
sc->intr_deassert = intr_deassert;
pthread_mutex_init(&sc->mtx, NULL);
uart_reset(sc);
return (sc);
}
static int
uart_tty_backend(struct uart_softc *sc, const char *opts)
{
int fd;
int retval;
retval = -1;
fd = open(opts, O_RDWR | O_NONBLOCK);
if (fd >= 0 && isatty(fd)) {
sc->tty.fd = fd;
sc->tty.opened = true;
retval = 0;
} else if (fd >= 0) {
close(fd);	/* opened, but not a tty: don't leak the descriptor */
}
return (retval);
}
int
uart_set_backend(struct uart_softc *sc, const char *opts)
{
int retval;
retval = -1;
if (opts == NULL)
return (0);
if (strcmp("stdio", opts) == 0) {
if (!uart_stdio) {
sc->tty.fd = STDIN_FILENO;
sc->tty.opened = true;
uart_stdio = true;
retval = 0;
}
} else if (uart_tty_backend(sc, opts) == 0) {
retval = 0;
}
/* Make the backend file descriptor non-blocking */
if (retval == 0)
retval = fcntl(sc->tty.fd, F_SETFL, O_NONBLOCK);
if (retval == 0)
uart_opentty(sc);
return (retval);
}
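/*
 * Usage sketch, illustrative only: how a device model might wire the
 * UART to its interrupt glue and an i/o port range. The com_intr_*()
 * callbacks, the 0x3f8 base and the simplified handler signature are
 * assumptions made for the example, not part of this file.
 */
#if 0
static struct uart_softc *com1;

static void
com_intr_assert(void *arg)
{
	/* raise the IRQ line via the pci/ioapic glue */
}

static void
com_intr_deassert(void *arg)
{
	/* lower the IRQ line */
}

static int
com1_io_handler(int in, int port, uint8_t *val)
{
	if (in)
		*val = uart_read(com1, port - 0x3f8);
	else
		uart_write(com1, port - 0x3f8, *val);
	return (0);
}

static void
com1_setup(void)
{
	com1 = uart_init(com_intr_assert, com_intr_deassert, NULL);
	if (uart_set_backend(com1, "stdio") != 0)
		fprintf(stderr, "could not open uart backend\n");
}
#endif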

45
bhyve/uart_emul.h Normal file
View file

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _UART_EMUL_H_
#define _UART_EMUL_H_
#define UART_IO_BAR_SIZE 8
struct uart_softc;
typedef void (*uart_intr_func_t)(void *arg);
struct uart_softc *uart_init(uart_intr_func_t intr_assert,
uart_intr_func_t intr_deassert, void *arg);
int uart_legacy_alloc(int unit, int *ioaddr, int *irq);
uint8_t uart_read(struct uart_softc *sc, int offset);
void uart_write(struct uart_softc *sc, int offset, uint8_t value);
int uart_set_backend(struct uart_softc *sc, const char *opt);
#endif

777
bhyve/virtio.c Normal file
View file

@ -0,0 +1,777 @@
/*-
* Copyright (c) 2013 Chris Torek <torek @ torek net>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>
#include <pthread_np.h>
#include "bhyverun.h"
#include "pci_emul.h"
#include "virtio.h"
/*
* Functions for dealing with generalized "virtual devices" as
* defined by <https://www.google.com/#output=search&q=virtio+spec>
*/
/*
* In case we decide to relax the "virtio softc comes at the
* front of virtio-based device softc" constraint, let's use
* this to convert.
*/
#define DEV_SOFTC(vs) ((void *)(vs))
/*
* Link a virtio_softc to its constants, the device softc, and
* the PCI emulation.
*/
void
vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
void *dev_softc, struct pci_devinst *pi,
struct vqueue_info *queues)
{
int i;
/* vs and dev_softc addresses must match */
assert((void *)vs == dev_softc);
vs->vs_vc = vc;
vs->vs_pi = pi;
pi->pi_arg = vs;
vs->vs_queues = queues;
for (i = 0; i < vc->vc_nvq; i++) {
queues[i].vq_vs = vs;
queues[i].vq_num = i;
}
}
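/*
 * Illustrative only: a hypothetical single-queue device obeying the
 * "virtio softc comes at the front" constraint asserted above. All
 * myrnd_* names are invented for the example.
 */
#if 0
struct myrnd_softc {
	struct virtio_softc vrsc;	/* must be first */
	struct vqueue_info vrsc_vq;
	pthread_mutex_t vrsc_mtx;
};

static void
myrnd_reset(void *vsc)
{
	/* device-private reset; the generic layer clears the queues */
}

static void
myrnd_notify(void *vsc, struct vqueue_info *vq)
{
	/* drain vq with vq_getchain()/vq_relchain()/vq_endchains() */
}

static struct virtio_consts myrnd_vi_consts = {
	"myrnd",	/* vc_name, used in diagnostics */
	1,		/* vc_nvq: a single virtqueue */
	0,		/* vc_cfgsize: no device-private config regs */
	myrnd_reset,	/* vc_reset */
	myrnd_notify,	/* vc_qnotify */
	NULL,		/* vc_cfgread */
	NULL,		/* vc_cfgwrite */
	NULL,		/* vc_apply_features */
	0,		/* vc_hv_caps */
};

/*
 * A pci attach routine would then do, in order:
 *
 *	sc->vrsc_vq.vq_qsize = 64;
 *	vi_softc_linkup(&sc->vrsc, &myrnd_vi_consts, sc, pi, &sc->vrsc_vq);
 *	sc->vrsc.vs_mtx = &sc->vrsc_mtx;
 *	vi_intr_init(&sc->vrsc, 1, use_msix);
 *	vi_set_io_bar(&sc->vrsc, 0);
 */
#endif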
/*
* Reset device (device-wide). This erases all queues, i.e.,
* all the queues become invalid (though we don't wipe out the
* internal pointers, we just clear the VQ_ALLOC flag).
*
* It resets negotiated features to "none".
*
* If MSI-X is enabled, this also resets all the vectors to NO_VECTOR.
*/
void
vi_reset_dev(struct virtio_softc *vs)
{
struct vqueue_info *vq;
int i, nvq;
if (vs->vs_mtx)
assert(pthread_mutex_isowned_np(vs->vs_mtx));
nvq = vs->vs_vc->vc_nvq;
for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
vq->vq_flags = 0;
vq->vq_last_avail = 0;
vq->vq_save_used = 0;
vq->vq_pfn = 0;
vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
}
vs->vs_negotiated_caps = 0;
vs->vs_curq = 0;
/* vs->vs_status = 0; -- redundant */
if (vs->vs_isr)
pci_lintr_deassert(vs->vs_pi);
vs->vs_isr = 0;
vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
}
/*
* Set I/O BAR (usually 0) to map PCI config registers.
*/
void
vi_set_io_bar(struct virtio_softc *vs, int barnum)
{
size_t size;
/*
* ??? should we use CFG0 if MSI-X is disabled?
* Existing code did not...
*/
size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize;
pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size);
}
/*
* Initialize MSI-X vector capabilities if we're to use MSI-X,
* or MSI capabilities if not.
*
* We assume we want one MSI-X vector per queue, here, plus one
* for the config vec.
*/
int
vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
{
int nvec;
if (use_msix) {
vs->vs_flags |= VIRTIO_USE_MSIX;
VS_LOCK(vs);
vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
VS_UNLOCK(vs);
nvec = vs->vs_vc->vc_nvq + 1;
if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
return (1);
} else
vs->vs_flags &= ~VIRTIO_USE_MSIX;
/* Only 1 MSI vector for bhyve */
pci_emul_add_msicap(vs->vs_pi, 1);
/* Legacy interrupts are mandatory for virtio devices */
pci_lintr_request(vs->vs_pi);
return (0);
}
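/*
 * For example, a two-queue device (vc_nvq == 2) asks for nvec = 3
 * MSI-X vectors: one per queue plus one for configuration changes.
 */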
/*
* Initialize the currently-selected virtio queue (vs->vs_curq).
* The guest just gave us a page frame number, from which we can
* calculate the addresses of the queue.
*/
void
vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
{
struct vqueue_info *vq;
uint64_t phys;
size_t size;
char *base;
vq = &vs->vs_queues[vs->vs_curq];
vq->vq_pfn = pfn;
phys = (uint64_t)pfn << VRING_PFN;
size = vring_size(vq->vq_qsize);
base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size);
/* First page(s) are descriptors... */
vq->vq_desc = (struct virtio_desc *)base;
base += vq->vq_qsize * sizeof(struct virtio_desc);
/* ... immediately followed by "avail" ring (entirely uint16_t's) */
vq->vq_avail = (struct vring_avail *)base;
base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
/* Then it's rounded up to the next page... */
base = (char *)roundup2((uintptr_t)base, VRING_ALIGN);
/* ... and the last page(s) are the used ring. */
vq->vq_used = (struct vring_used *)base;
/* Mark queue as allocated, and start at 0 when we use it. */
vq->vq_flags = VQ_ALLOC;
vq->vq_last_avail = 0;
vq->vq_save_used = 0;
}
/*
* Helper inline for vq_getchain(): record the i'th "real"
* descriptor.
*/
static inline void
_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
struct iovec *iov, int n_iov, uint16_t *flags)
{
if (i >= n_iov)
return;
iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len);
iov[i].iov_len = vd->vd_len;
if (flags != NULL)
flags[i] = vd->vd_flags;
}
#define VQ_MAX_DESCRIPTORS 512 /* see below */
/*
* Examine the chain of descriptors starting at the "next one" to
* make sure that they describe a sensible request. If so, return
* the number of "real" descriptors that would be needed/used in
* acting on this request. This may be smaller than the number of
* available descriptors, e.g., if there are two available but
* they are two separate requests, this just returns 1. Or, it
* may be larger: if there are indirect descriptors involved,
* there may only be one descriptor available but it may be an
* indirect pointing to eight more. We return 8 in this case,
* i.e., we do not count the indirect descriptors, only the "real"
* ones.
*
* Basically, this vets the vd_flags and vd_next field of each
* descriptor and tells you how many are involved. Since some may
* be indirect, this also needs the vmctx (in the pci_devinst
* at vs->vs_pi) so that it can find indirect descriptors.
*
* As we process each descriptor, we copy and adjust it (guest to
* host address wise, also using the vmctx) into the given iov[]
* array (of the given size). If the array overflows, we stop
* placing values into the array but keep processing descriptors,
* up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
* So you, the caller, must not assume that iov[] is as big as the
* return value (you can process the same thing twice to allocate
* a larger iov array if needed, or supply a zero length to find
* out how much space is needed).
*
* If you want to verify the WRITE flag on each descriptor, pass a
* non-NULL "flags" pointer to an array of "uint16_t" of the same size
* as n_iov and we'll copy each vd_flags field after unwinding any
* indirects.
*
* If some descriptor(s) are invalid, this prints a diagnostic message
* and returns -1. If no descriptors are ready now it simply returns 0.
*
* You are assumed to have done a vq_ring_ready() if needed (note
* that vq_has_descs() does one).
*/
int
vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
struct iovec *iov, int n_iov, uint16_t *flags)
{
int i;
u_int ndesc, n_indir;
u_int idx, next;
volatile struct virtio_desc *vdir, *vindir, *vp;
struct vmctx *ctx;
struct virtio_softc *vs;
const char *name;
vs = vq->vq_vs;
name = vs->vs_vc->vc_name;
/*
* Note: it's the responsibility of the guest not to
* update vq->vq_avail->va_idx until all of the descriptors
* the guest has written are valid (including all their
* vd_next fields and vd_flags).
*
* Compute (va_idx - last_avail) in integers mod 2**16. This is
* the number of descriptors the guest has made available
* since the last time we updated vq->vq_last_avail.
*
* We just need to do the subtraction as an unsigned int,
* then trim off excess bits.
*/
idx = vq->vq_last_avail;
ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
if (ndesc == 0)
return (0);
if (ndesc > vq->vq_qsize) {
/* XXX need better way to diagnose issues */
fprintf(stderr,
"%s: ndesc (%u) out of range, driver confused?\r\n",
name, (u_int)ndesc);
return (-1);
}
/*
* Now count/parse "involved" descriptors starting from
* the head of the chain.
*
* To prevent loops, we could be more complicated and
* check whether we're re-visiting a previously visited
* index, but we just abort if the count gets excessive.
*/
ctx = vs->vs_pi->pi_vmctx;
*pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)];
vq->vq_last_avail++;
for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
if (next >= vq->vq_qsize) {
fprintf(stderr,
"%s: descriptor index %u out of range, "
"driver confused?\r\n",
name, next);
return (-1);
}
vdir = &vq->vq_desc[next];
if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
_vq_record(i, vdir, ctx, iov, n_iov, flags);
i++;
} else if ((vs->vs_vc->vc_hv_caps &
VIRTIO_RING_F_INDIRECT_DESC) == 0) {
fprintf(stderr,
"%s: descriptor has forbidden INDIRECT flag, "
"driver confused?\r\n",
name);
return (-1);
} else {
n_indir = vdir->vd_len / 16;
if ((vdir->vd_len & 0xf) || n_indir == 0) {
fprintf(stderr,
"%s: invalid indir len 0x%x, "
"driver confused?\r\n",
name, (u_int)vdir->vd_len);
return (-1);
}
vindir = paddr_guest2host(ctx,
vdir->vd_addr, vdir->vd_len);
/*
* Indirects start at the 0th, then follow
* their own embedded "next"s until those run
* out. Each one's indirect flag must be off
* (we don't really have to check, could just
* ignore errors...).
*/
next = 0;
for (;;) {
vp = &vindir[next];
if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
fprintf(stderr,
"%s: indirect desc has INDIR flag,"
" driver confused?\r\n",
name);
return (-1);
}
_vq_record(i, vp, ctx, iov, n_iov, flags);
if (++i > VQ_MAX_DESCRIPTORS)
goto loopy;
if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
break;
next = vp->vd_next;
if (next >= n_indir) {
fprintf(stderr,
"%s: invalid next %u > %u, "
"driver confused?\r\n",
name, (u_int)next, n_indir);
return (-1);
}
}
}
if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
return (i);
}
loopy:
fprintf(stderr,
"%s: descriptor loop? count > %d - driver confused?\r\n",
name, i);
return (-1);
}
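/*
 * Illustrative caller, not part of this file: the shape of a typical
 * queue-notify handler built on the functions above. The iov depth
 * of 8 is an arbitrary choice for the example, and the actual i/o
 * step is elided.
 */
#if 0
static void
example_qnotify(void *vsc, struct vqueue_info *vq)
{
	struct iovec iov[8];
	uint16_t flags[8], idx;
	int n;

	while (vq_has_descs(vq)) {
		n = vq_getchain(vq, &idx, iov, 8, flags);
		if (n <= 0)
			break;	/* nothing ready, or a confused driver */
		/* ... perform the i/o described by iov[0..n-1] ... */
		vq_relchain(vq, idx, 0);	/* 0 bytes written back in this sketch */
	}
	vq_endchains(vq, 1);	/* 1: we consumed everything available */
}
#endif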
/*
* Return the currently-first request chain back to the available queue.
*
* (This chain is the one you handled when you called vq_getchain()
* and used its positive return value.)
*/
void
vq_retchain(struct vqueue_info *vq)
{
vq->vq_last_avail--;
}
/*
* Return specified request chain to the guest, setting its I/O length
* to the provided value.
*
* (This chain is the one you handled when you called vq_getchain()
* and used its positive return value.)
*/
void
vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen)
{
uint16_t uidx, mask;
volatile struct vring_used *vuh;
volatile struct virtio_used *vue;
/*
* Notes:
* - mask is N-1 where N is a power of 2 so computes x % N
* - vuh points to the "used" data shared with guest
* - vue points to the "used" ring entry we want to update
* - head is the same value we computed in vq_getchain().
*
* (I apologize for the two fields named vu_idx; the
* virtio spec calls the one that vue points to, "id"...)
*/
mask = vq->vq_qsize - 1;
vuh = vq->vq_used;
uidx = vuh->vu_idx;
vue = &vuh->vu_ring[uidx++ & mask];
vue->vu_idx = idx;
vue->vu_tlen = iolen;
vuh->vu_idx = uidx;
}
/*
* Driver has finished processing "available" chains and calling
* vq_relchain on each one. If the driver used all the available
* chains, used_all_avail should be set.
*
* If the "used" index moved we may need to inform the guest, i.e.,
* deliver an interrupt. Even if the used index did NOT move we
* may need to deliver an interrupt, if the avail ring is empty and
* we are supposed to interrupt on empty.
*
* Note that used_all_avail is provided by the caller because it's
* a snapshot of the ring state when the caller decided to finish
* interrupt processing -- it's possible that descriptors became
* available after that point. (It's typically a constant 1/True.)
*/
void
vq_endchains(struct vqueue_info *vq, int used_all_avail)
{
struct virtio_softc *vs;
uint16_t event_idx, new_idx, old_idx;
int intr;
/*
* Interrupt generation: if we're using EVENT_IDX,
* interrupt if we've crossed the event threshold.
* Otherwise interrupt is generated if we added "used" entries,
* but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
*
* In any case, though, if NOTIFY_ON_EMPTY is set and the
* entire avail was processed, we need to interrupt always.
*/
vs = vq->vq_vs;
old_idx = vq->vq_save_used;
vq->vq_save_used = new_idx = vq->vq_used->vu_idx;
if (used_all_avail &&
(vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
intr = 1;
else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
event_idx = VQ_USED_EVENT_IDX(vq);
/*
* This calculation is per docs and the kernel
* (see src/sys/dev/virtio/virtio_ring.h).
*/
intr = (uint16_t)(new_idx - event_idx - 1) <
(uint16_t)(new_idx - old_idx);
} else {
intr = new_idx != old_idx &&
!(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT);
}
if (intr)
vq_interrupt(vs, vq);
}
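/*
 * Worked example of the EVENT_IDX test above (hypothetical values):
 * with old_idx = 5 and new_idx = 8, a guest-supplied event_idx of 6
 * gives (uint16_t)(8 - 6 - 1) = 1 < (uint16_t)(8 - 5) = 3, so we
 * interrupt; an event_idx of 9 gives 65534 < 3, which is false, so
 * we stay quiet until the guest's threshold is actually crossed.
 */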
/* Note: these are in sorted order to make for a fast search */
static struct config_reg {
uint16_t cr_offset; /* register offset */
uint8_t cr_size; /* size (bytes) */
uint8_t cr_ro; /* true => reg is read only */
const char *cr_name; /* name of reg */
} config_regs[] = {
{ VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" },
{ VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" },
{ VTCFG_R_PFN, 4, 0, "PFN" },
{ VTCFG_R_QNUM, 2, 1, "QNUM" },
{ VTCFG_R_QSEL, 2, 0, "QSEL" },
{ VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" },
{ VTCFG_R_STATUS, 1, 0, "STATUS" },
{ VTCFG_R_ISR, 1, 0, "ISR" },
{ VTCFG_R_CFGVEC, 2, 0, "CFGVEC" },
{ VTCFG_R_QVEC, 2, 0, "QVEC" },
};
static inline struct config_reg *
vi_find_cr(int offset)
{
u_int hi, lo, mid;
struct config_reg *cr;
lo = 0;
hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
while (hi >= lo) {
mid = (hi + lo) >> 1;
cr = &config_regs[mid];
if (cr->cr_offset == offset)
return (cr);
if (cr->cr_offset < offset)
lo = mid + 1;
else
hi = mid - 1;
}
return (NULL);
}
/*
* Handle pci config space reads.
* If it's to the MSI-X info, do that.
* If it's part of the virtio standard stuff, do that.
* Otherwise dispatch to the actual driver.
*/
uint64_t
vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size)
{
struct virtio_softc *vs = pi->pi_arg;
struct virtio_consts *vc;
struct config_reg *cr;
uint64_t virtio_config_size, max;
const char *name;
uint32_t newoff;
uint32_t value;
int error;
if (vs->vs_flags & VIRTIO_USE_MSIX) {
if (baridx == pci_msix_table_bar(pi) ||
baridx == pci_msix_pba_bar(pi)) {
return (pci_emul_msix_tread(pi, offset, size));
}
}
/* XXX probably should do something better than just assert() */
assert(baridx == 0);
if (vs->vs_mtx)
pthread_mutex_lock(vs->vs_mtx);
vc = vs->vs_vc;
name = vc->vc_name;
value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;
if (size != 1 && size != 2 && size != 4)
goto bad;
if (pci_msix_enabled(pi))
virtio_config_size = VTCFG_R_CFG1;
else
virtio_config_size = VTCFG_R_CFG0;
if (offset >= virtio_config_size) {
/*
* Subtract off the standard size (including MSI-X
* registers if enabled) and dispatch to underlying driver.
* If that fails, fall into general code.
*/
newoff = offset - virtio_config_size;
max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
if (newoff + size > max)
goto bad;
error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
if (!error)
goto done;
}
bad:
cr = vi_find_cr(offset);
if (cr == NULL || cr->cr_size != size) {
if (cr != NULL) {
/* offset must be OK, so size must be bad */
fprintf(stderr,
"%s: read from %s: bad size %d\r\n",
name, cr->cr_name, size);
} else {
fprintf(stderr,
"%s: read from bad offset/size %jd/%d\r\n",
name, (uintmax_t)offset, size);
}
goto done;
}
switch (offset) {
case VTCFG_R_HOSTCAP:
value = vc->vc_hv_caps;
break;
case VTCFG_R_GUESTCAP:
value = vs->vs_negotiated_caps;
break;
case VTCFG_R_PFN:
if (vs->vs_curq < vc->vc_nvq)
value = vs->vs_queues[vs->vs_curq].vq_pfn;
break;
case VTCFG_R_QNUM:
value = vs->vs_curq < vc->vc_nvq ?
vs->vs_queues[vs->vs_curq].vq_qsize : 0;
break;
case VTCFG_R_QSEL:
value = vs->vs_curq;
break;
case VTCFG_R_QNOTIFY:
value = 0; /* XXX */
break;
case VTCFG_R_STATUS:
value = vs->vs_status;
break;
case VTCFG_R_ISR:
value = vs->vs_isr;
vs->vs_isr = 0; /* a read clears this flag */
if (value)
pci_lintr_deassert(pi);
break;
case VTCFG_R_CFGVEC:
value = vs->vs_msix_cfg_idx;
break;
case VTCFG_R_QVEC:
value = vs->vs_curq < vc->vc_nvq ?
vs->vs_queues[vs->vs_curq].vq_msix_idx :
VIRTIO_MSI_NO_VECTOR;
break;
}
done:
if (vs->vs_mtx)
pthread_mutex_unlock(vs->vs_mtx);
return (value);
}
/*
* Handle pci config space writes.
* If it's to the MSI-X info, do that.
* If it's part of the virtio standard stuff, do that.
* Otherwise dispatch to the actual driver.
*/
void
vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value)
{
struct virtio_softc *vs = pi->pi_arg;
struct vqueue_info *vq;
struct virtio_consts *vc;
struct config_reg *cr;
uint64_t virtio_config_size, max;
const char *name;
uint32_t newoff;
int error;
if (vs->vs_flags & VIRTIO_USE_MSIX) {
if (baridx == pci_msix_table_bar(pi) ||
baridx == pci_msix_pba_bar(pi)) {
pci_emul_msix_twrite(pi, offset, size, value);
return;
}
}
/* XXX probably should do something better than just assert() */
assert(baridx == 0);
if (vs->vs_mtx)
pthread_mutex_lock(vs->vs_mtx);
vc = vs->vs_vc;
name = vc->vc_name;
if (size != 1 && size != 2 && size != 4)
goto bad;
if (pci_msix_enabled(pi))
virtio_config_size = VTCFG_R_CFG1;
else
virtio_config_size = VTCFG_R_CFG0;
if (offset >= virtio_config_size) {
/*
* Subtract off the standard size (including MSI-X
* registers if enabled) and dispatch to underlying driver.
*/
newoff = offset - virtio_config_size;
max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
if (newoff + size > max)
goto bad;
error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
if (!error)
goto done;
}
bad:
cr = vi_find_cr(offset);
if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
if (cr != NULL) {
/* offset must be OK, wrong size and/or reg is R/O */
if (cr->cr_size != size)
fprintf(stderr,
"%s: write to %s: bad size %d\r\n",
name, cr->cr_name, size);
if (cr->cr_ro)
fprintf(stderr,
"%s: write to read-only reg %s\r\n",
name, cr->cr_name);
} else {
fprintf(stderr,
"%s: write to bad offset/size %jd/%d\r\n",
name, (uintmax_t)offset, size);
}
goto done;
}
switch (offset) {
case VTCFG_R_GUESTCAP:
vs->vs_negotiated_caps = value & vc->vc_hv_caps;
if (vc->vc_apply_features)
(*vc->vc_apply_features)(DEV_SOFTC(vs),
vs->vs_negotiated_caps);
break;
case VTCFG_R_PFN:
if (vs->vs_curq >= vc->vc_nvq)
goto bad_qindex;
vi_vq_init(vs, value);
break;
case VTCFG_R_QSEL:
/*
* Note that the guest is allowed to select an
* invalid queue; we just need to return a QNUM
* of 0 while the bad queue is selected.
*/
vs->vs_curq = value;
break;
case VTCFG_R_QNOTIFY:
if (value >= vc->vc_nvq) {
fprintf(stderr, "%s: queue %d notify out of range\r\n",
name, (int)value);
goto done;
}
vq = &vs->vs_queues[value];
if (vq->vq_notify)
(*vq->vq_notify)(DEV_SOFTC(vs), vq);
else if (vc->vc_qnotify)
(*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
else
fprintf(stderr,
"%s: qnotify queue %d: missing vq/vc notify\r\n",
name, (int)value);
break;
case VTCFG_R_STATUS:
vs->vs_status = value;
if (value == 0)
(*vc->vc_reset)(DEV_SOFTC(vs));
break;
case VTCFG_R_CFGVEC:
vs->vs_msix_cfg_idx = value;
break;
case VTCFG_R_QVEC:
if (vs->vs_curq >= vc->vc_nvq)
goto bad_qindex;
vq = &vs->vs_queues[vs->vs_curq];
vq->vq_msix_idx = value;
break;
}
goto done;
bad_qindex:
fprintf(stderr,
"%s: write config reg %s: curq %d >= max %d\r\n",
name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
done:
if (vs->vs_mtx)
pthread_mutex_unlock(vs->vs_mtx);
}

464
bhyve/virtio.h Normal file
View file

@ -0,0 +1,464 @@
/*-
* Copyright (c) 2013 Chris Torek <torek @ torek net>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VIRTIO_H_
#define _VIRTIO_H_
/*
* These are derived from several virtio specifications.
*
* Some useful links:
* https://github.com/rustyrussell/virtio-spec
* http://people.redhat.com/pbonzini/virtio-spec.pdf
*/
/*
* A virtual device has zero or more "virtual queues" (virtqueue).
* Each virtqueue uses at least two 4096-byte pages, laid out thus:
*
* +-----------------------------------------------+
* | "desc": <N> descriptors, 16 bytes each |
* | ----------------------------------------- |
* | "avail": 2 uint16; <N> uint16; 1 uint16 |
* | ----------------------------------------- |
* | pad to 4k boundary |
* +-----------------------------------------------+
* | "used": 2 x uint16; <N> elems; 1 uint16 |
* | ----------------------------------------- |
* | pad to 4k boundary |
* +-----------------------------------------------+
*
* The number <N> that appears here is always a power of two and is
* limited to no more than 32768 (as it must fit in a 16-bit field).
* If <N> is sufficiently large, the above will occupy more than
* two pages. In any case, all pages must be physically contiguous
* within the guest's physical address space.
*
* The <N> 16-byte "desc" descriptors consist of a 64-bit guest
* physical address <addr>, a 32-bit length <len>, a 16-bit
* <flags>, and a 16-bit <next> field (all in guest byte order).
*
* There are three flags that may be set :
* NEXT descriptor is chained, so use its "next" field
* WRITE descriptor is for host to write into guest RAM
* (else host is to read from guest RAM)
* INDIRECT descriptor address field is (guest physical)
* address of a linear array of descriptors
*
* Unless INDIRECT is set, <len> is the number of bytes that may
* be read/written from guest physical address <addr>. If
* INDIRECT is set, WRITE is ignored and <len> provides the length
* of the indirect descriptors (and <len> must be a multiple of
* 16). Note that NEXT may still be set in the main descriptor
* pointing to the indirect, and should be set in each indirect
* descriptor that uses the next descriptor (these should generally
* be numbered sequentially). However, INDIRECT must not be set
* in the indirect descriptors. Upon reaching an indirect descriptor
* without a NEXT bit, control returns to the direct descriptors.
*
* Except inside an indirect, each <next> value must be in the
* range [0 .. N) (i.e., the half-open interval). (Inside an
* indirect, each <next> must be in the range [0 .. <len>/16).)
*
* The "avail" data structures reside in the same pages as the
* "desc" structures since both together are used by the device to
* pass information to the hypervisor's virtual driver. These
* begin with a 16-bit <flags> field and 16-bit index <idx>, then
* have <N> 16-bit <ring> values, followed by one final 16-bit
* field <used_event>. The <N> <ring> entries are simply indices
* indices into the descriptor ring (and thus must meet the same
* constraints as each <next> value). However, <idx> is counted
* up from 0 (initially) and simply wraps around after 65535; it
* is taken mod <N> to find the next available entry.
*
* The "used" ring occupies a separate page or pages, and contains
* values written from the virtual driver back to the guest OS.
* This begins with a 16-bit <flags> and 16-bit <idx>, then there
* are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
* The <N> "vring_used" elements consist of a 32-bit <id> and a
* 32-bit <len> (vu_tlen below). The <id> is simply the index of
* the head of a descriptor chain the guest made available
* earlier, and the <len> is the number of bytes actually written,
* e.g., in the case of a network driver that provided a large
* receive buffer but received only a small amount of data.
*
* The two event fields, <used_event> and <avail_event>, in the
* avail and used rings (respectively -- note the reversal!), are
* always provided, but are used only if the virtual device
* negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
* negotiation. Similarly, both rings provide a flag --
* VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
* their <flags> field, indicating that the guest does not need an
* interrupt, or that the hypervisor driver does not need a
* notify, when descriptors are added to the corresponding ring.
* (These are provided only for interrupt optimization and need
* not be implemented.)
*/
#define VRING_ALIGN 4096
#define VRING_DESC_F_NEXT (1 << 0)
#define VRING_DESC_F_WRITE (1 << 1)
#define VRING_DESC_F_INDIRECT (1 << 2)
struct virtio_desc { /* AKA vring_desc */
uint64_t vd_addr; /* guest physical address */
uint32_t vd_len; /* length of scatter/gather seg */
uint16_t vd_flags; /* VRING_F_DESC_* */
uint16_t vd_next; /* next desc if F_NEXT */
} __packed;
struct virtio_used { /* AKA vring_used_elem */
uint32_t vu_idx; /* head of used descriptor chain */
uint32_t vu_tlen; /* length written-to */
} __packed;
#define VRING_AVAIL_F_NO_INTERRUPT 1
struct vring_avail {
uint16_t va_flags; /* VRING_AVAIL_F_* */
uint16_t va_idx; /* counts to 65535, then cycles */
uint16_t va_ring[]; /* size N, reported in QNUM value */
/* uint16_t va_used_event; -- after N ring entries */
} __packed;
#define VRING_USED_F_NO_NOTIFY 1
struct vring_used {
uint16_t vu_flags; /* VRING_USED_F_* */
uint16_t vu_idx; /* counts to 65535, then cycles */
struct virtio_used vu_ring[]; /* size N */
/* uint16_t vu_avail_event; -- after N ring entries */
} __packed;
/*
* The address of any given virtual queue is determined by a single
* Page Frame Number register. The guest writes the PFN into the
* PCI config space. However, a device that has two or more
* virtqueues can have a different PFN, and size, for each queue.
* The number of queues is determinable via the PCI config space
* VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means
* queue #0, 1 means queue#1, etc. Once a queue is selected, the
* remaining PFN and QNUM registers refer to that queue.
*
* QNUM is a read-only register containing a nonzero power of two
* that indicates the (hypervisor's) queue size. Or, if reading it
* produces zero, the hypervisor does not have a corresponding
* queue. (The number of possible queues depends on the virtual
* device. The block device has just one; the network device
* provides either two -- 0 = receive, 1 = transmit -- or three,
* with 2 = control.)
*
* PFN is a read/write register giving the physical page address of
* the virtqueue in guest memory (the guest must allocate enough space
* based on the hypervisor's provided QNUM).
*
* QNOTIFY is effectively write-only: when the guest writes a queue
* number to the register, the hypervisor should scan the specified
* virtqueue. (Reading QNOTIFY currently always gets 0).
*/
/*
* PFN register shift amount
*/
#define VRING_PFN 12
/*
* Virtio device types
*
* XXX Should really be merged with <dev/virtio/virtio.h> defines
*/
#define VIRTIO_TYPE_NET 1
#define VIRTIO_TYPE_BLOCK 2
#define VIRTIO_TYPE_CONSOLE 3
#define VIRTIO_TYPE_ENTROPY 4
#define VIRTIO_TYPE_BALLOON 5
#define VIRTIO_TYPE_IOMEMORY 6
#define VIRTIO_TYPE_RPMSG 7
#define VIRTIO_TYPE_SCSI 8
#define VIRTIO_TYPE_9P 9
/* experimental IDs start at 65535 and work down */
/*
* PCI vendor/device IDs
*/
#define VIRTIO_VENDOR 0x1AF4
#define VIRTIO_DEV_NET 0x1000
#define VIRTIO_DEV_BLOCK 0x1001
#define VIRTIO_DEV_RANDOM 0x1002
/*
* PCI config space constants.
*
* If MSI-X is enabled, the ISR register is generally not used,
* and the configuration vector and queue vector appear at offsets
* 20 and 22 with the remaining configuration registers at 24.
* If MSI-X is not enabled, those two registers disappear and
* the remaining configuration registers start at offset 20.
*/
#define VTCFG_R_HOSTCAP 0
#define VTCFG_R_GUESTCAP 4
#define VTCFG_R_PFN 8
#define VTCFG_R_QNUM 12
#define VTCFG_R_QSEL 14
#define VTCFG_R_QNOTIFY 16
#define VTCFG_R_STATUS 18
#define VTCFG_R_ISR 19
#define VTCFG_R_CFGVEC 20
#define VTCFG_R_QVEC 22
#define VTCFG_R_CFG0 20 /* No MSI-X */
#define VTCFG_R_CFG1 24 /* With MSI-X */
#define VTCFG_R_MSIX 20
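/*
 * Illustrative guest-side setup sequence for the registers described
 * above (not part of this header): select the queue, read its size,
 * then hand over the ring's page frame number. The outw()/outl()/inw()
 * port helpers and the iobase argument are assumptions made for the
 * example.
 */
#if 0
static void
guest_vq_setup(uint16_t iobase, uint16_t qnum, uint64_t ring_paddr)
{
	uint16_t qsize;

	outw(iobase + VTCFG_R_QSEL, qnum);	/* select queue <qnum> */
	qsize = inw(iobase + VTCFG_R_QNUM);	/* 0 => no such queue */
	if (qsize == 0)
		return;
	/* allocate vring_size(qsize) bytes at ring_paddr, then: */
	outl(iobase + VTCFG_R_PFN, ring_paddr >> VRING_PFN);
}
#endif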
/*
* Bits in VTCFG_R_STATUS. Guests need not actually set any of these,
* but a guest writing 0 to this register means "please reset".
*/
#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */
#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */
#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */
#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */
/*
* Bits in VTCFG_R_ISR. These apply only if not using MSI-X.
*
* (We don't [yet?] ever use CONF_CHANGED.)
*/
#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */
#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */
#define VIRTIO_MSI_NO_VECTOR 0xFFFF
/*
* Feature flags.
* Note: bits 0 through 23 are reserved to each device type.
*/
#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
/* From section 2.3, "Virtqueue Configuration", of the virtio specification */
static inline size_t
vring_size(u_int qsz)
{
size_t size;
/* constant 3 below = va_flags, va_idx, va_used_event */
size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz);
size = roundup2(size, VRING_ALIGN);
/* constant 3 below = vu_flags, vu_idx, vu_avail_event */
size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
size = roundup2(size, VRING_ALIGN);
return (size);
}
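/*
 * Worked example: for qsz = 256 the descriptor table is 16 * 256 =
 * 4096 bytes and the avail structures add 2 * (3 + 256) = 518 bytes,
 * which rounds up to 8192; the used ring then adds 6 + 8 * 256 =
 * 2054 bytes, so vring_size(256) = roundup2(8192 + 2054, 4096) =
 * 12288, i.e. three pages.
 */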
struct vmctx;
struct pci_devinst;
struct vqueue_info;
/*
* A virtual device, with some number (possibly 0) of virtual
* queues and some size (possibly 0) of configuration-space
* registers private to the device. The virtio_softc should come
* at the front of each "derived class", so that a pointer to the
* virtio_softc is also a pointer to the more specific, derived-
* from-virtio driver's softc.
*
* Note: inside each hypervisor virtio driver, changes to these
* data structures must be locked against other threads, if any.
* Except for PCI config space register read/write, we assume each
* driver does the required locking, but we need a pointer to the
* lock (if there is one) for PCI config space read/write ops.
*
* When the guest reads or writes the device's config space, the
* generic layer checks for operations on the special registers
* described above. If the offset of the register(s) being read
* or written is past the CFG area (CFG0 or CFG1), the request is
* passed on to the virtual device, after subtracting off the
* generic-layer size. (So, drivers can just use the offset as
* an offset into "struct config", for instance.)
*
* (The virtio layer also makes sure that the read or write is to/
* from a "good" config offset, hence vc_cfgsize, and on BAR #0.
* However, the driver must verify the read or write size and offset
* and that no one is writing a readonly register.)
*
* The BROKED flag ("this thing done gone and broked") is for future
* use.
*/
#define VIRTIO_USE_MSIX 0x01
#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */
#define VIRTIO_BROKED 0x08 /* ??? */
struct virtio_softc {
struct virtio_consts *vs_vc; /* constants (see below) */
int vs_flags; /* VIRTIO_* flags from above */
pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */
struct pci_devinst *vs_pi; /* PCI device instance */
uint32_t vs_negotiated_caps; /* negotiated capabilities */
struct vqueue_info *vs_queues; /* one per vc_nvq */
int vs_curq; /* current queue */
uint8_t vs_status; /* value from last status write */
uint8_t vs_isr; /* ISR flags, if not MSI-X */
uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */
};
#define VS_LOCK(vs) \
do { \
if (vs->vs_mtx) \
pthread_mutex_lock(vs->vs_mtx); \
} while (0)
#define VS_UNLOCK(vs) \
do { \
if (vs->vs_mtx) \
pthread_mutex_unlock(vs->vs_mtx); \
} while (0)
struct virtio_consts {
const char *vc_name; /* name of driver (for diagnostics) */
int vc_nvq; /* number of virtual queues */
size_t vc_cfgsize; /* size of dev-specific config regs */
void (*vc_reset)(void *); /* called on virtual device reset */
void (*vc_qnotify)(void *, struct vqueue_info *);
/* called on QNOTIFY if no VQ notify */
int (*vc_cfgread)(void *, int, int, uint32_t *);
/* called to read config regs */
int (*vc_cfgwrite)(void *, int, int, uint32_t);
/* called to write config regs */
void (*vc_apply_features)(void *, uint64_t);
/* called to apply negotiated features */
uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
};
/*
* Data structure allocated (statically) per virtual queue.
*
* Drivers may change vq_qsize after a reset. When the guest OS
* requests a device reset, the hypervisor first calls
* vs->vs_vc->vc_reset(); then the data structure below is
* reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
*
* The remaining fields should only be fussed-with by the generic
* code.
*
* Note: the addresses of vq_desc, vq_avail, and vq_used are all
* computable from each other, but it's a lot simpler if we just
* keep a pointer to each one. The event indices are similarly
* (but more easily) computable, and this time we'll compute them:
* they're just XX_ring[N].
*/
#define VQ_ALLOC 0x01 /* set once we have a pfn */
#define VQ_BROKED 0x02 /* ??? */
struct vqueue_info {
uint16_t vq_qsize; /* size of this queue (a power of 2) */
void (*vq_notify)(void *, struct vqueue_info *);
/* called instead of vc_notify, if not NULL */
struct virtio_softc *vq_vs; /* backpointer to softc */
uint16_t vq_num; /* we're the num'th queue in the softc */
uint16_t vq_flags; /* flags (see above) */
uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */
uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */
uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */
volatile struct virtio_desc *vq_desc; /* descriptor array */
volatile struct vring_avail *vq_avail; /* the "avail" ring */
volatile struct vring_used *vq_used; /* the "used" ring */
};
/* as noted above, these are sort of backwards, name-wise */
#define VQ_AVAIL_EVENT_IDX(vq) \
(*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize])
#define VQ_USED_EVENT_IDX(vq) \
((vq)->vq_avail->va_ring[(vq)->vq_qsize])
/*
* Is this ring ready for I/O?
*/
static inline int
vq_ring_ready(struct vqueue_info *vq)
{
return (vq->vq_flags & VQ_ALLOC);
}
/*
* Are there "available" descriptors? (This does not count
* how many, just returns True if there are some.)
*/
static inline int
vq_has_descs(struct vqueue_info *vq)
{
return (vq_ring_ready(vq) && vq->vq_last_avail !=
vq->vq_avail->va_idx);
}
/*
* Deliver an interrupt to guest on the given virtual queue
* (if possible, or a generic MSI interrupt if not using MSI-X).
*/
static inline void
vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
{
if (pci_msix_enabled(vs->vs_pi))
pci_generate_msix(vs->vs_pi, vq->vq_msix_idx);
else {
VS_LOCK(vs);
vs->vs_isr |= VTCFG_ISR_QUEUES;
pci_generate_msi(vs->vs_pi, 0);
pci_lintr_assert(vs->vs_pi);
VS_UNLOCK(vs);
}
}
struct iovec;
void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
void *dev_softc, struct pci_devinst *pi,
struct vqueue_info *queues);
int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
void vi_reset_dev(struct virtio_softc *);
void vi_set_io_bar(struct virtio_softc *, int);
int vq_getchain(struct vqueue_info *vq, uint16_t *pidx,
struct iovec *iov, int n_iov, uint16_t *flags);
void vq_retchain(struct vqueue_info *vq);
void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
void vq_endchains(struct vqueue_info *vq, int used_all_avail);
uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size);
void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
int baridx, uint64_t offset, int size, uint64_t value);
#endif /* _VIRTIO_H_ */

230
bhyve/xmsr.c Normal file
View file

@ -0,0 +1,230 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/cpufunc.h>
#include <machine/vmm.h>
#include <machine/specialreg.h>
#include <vmmapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "xmsr.h"
static int cpu_vendor_intel, cpu_vendor_amd;
int
emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val)
{
if (cpu_vendor_intel) {
switch (num) {
case 0xd04: /* Sandy Bridge uncore PMCs */
case 0xc24:
return (0);
case MSR_BIOS_UPDT_TRIG:
return (0);
case MSR_BIOS_SIGN:
return (0);
default:
break;
}
} else if (cpu_vendor_amd) {
switch (num) {
case MSR_HWCR:
/*
* Ignore writes to hardware configuration MSR.
*/
return (0);
case MSR_NB_CFG1:
case MSR_IC_CFG:
return (0); /* Ignore writes */
case MSR_PERFEVSEL0:
case MSR_PERFEVSEL1:
case MSR_PERFEVSEL2:
case MSR_PERFEVSEL3:
/* Ignore writes to the PerfEvtSel MSRs */
return (0);
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
case MSR_K7_PERFCTR3:
/* Ignore writes to the PerfCtr MSRs */
return (0);
case MSR_P_STATE_CONTROL:
/* Ignore write to change the P-state */
return (0);
default:
break;
}
}
return (-1);
}
int
emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val)
{
int error = 0;
if (cpu_vendor_intel) {
switch (num) {
case MSR_BIOS_SIGN:
case MSR_IA32_PLATFORM_ID:
case MSR_PKG_ENERGY_STATUS:
case MSR_PP0_ENERGY_STATUS:
case MSR_PP1_ENERGY_STATUS:
case MSR_DRAM_ENERGY_STATUS:
*val = 0;
break;
case MSR_RAPL_POWER_UNIT:
/*
* Use the default value documented in section
* "RAPL Interfaces" in Intel SDM vol3.
*/
*val = 0x000a1003;
break;
default:
error = -1;
break;
}
} else if (cpu_vendor_amd) {
switch (num) {
case MSR_BIOS_SIGN:
*val = 0;
break;
case MSR_HWCR:
/*
* Bios and Kernel Developer's Guides for AMD Families
* 12H, 14H, 15H and 16H.
*/
*val = 0x01000010; /* Reset value */
*val |= 1 << 9; /* MONITOR/MWAIT disable */
break;
case MSR_NB_CFG1:
case MSR_IC_CFG:
/*
* The reset value is processor family dependent so
* just return 0.
*/
*val = 0;
break;
case MSR_PERFEVSEL0:
case MSR_PERFEVSEL1:
case MSR_PERFEVSEL2:
case MSR_PERFEVSEL3:
/*
* PerfEvtSel MSRs are not properly virtualized so just
* return zero.
*/
*val = 0;
break;
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
case MSR_K7_PERFCTR3:
/*
* PerfCtr MSRs are not properly virtualized so just
* return zero.
*/
*val = 0;
break;
case MSR_SMM_ADDR:
case MSR_SMM_MASK:
/*
* Return the reset value defined in the AMD Bios and
* Kernel Developer's Guide.
*/
*val = 0;
break;
case MSR_P_STATE_LIMIT:
case MSR_P_STATE_CONTROL:
case MSR_P_STATE_STATUS:
case MSR_P_STATE_CONFIG(0): /* P0 configuration */
*val = 0;
break;
/*
* OpenBSD guests test bit 0 of this MSR to detect if the
* workaround for erratum 721 is already applied.
* http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf
*/
case 0xC0011029:
*val = 1;
break;
default:
error = -1;
break;
}
} else {
error = -1;
}
return (error);
}
int
init_msr(void)
{
int error;
u_int regs[4];
char cpu_vendor[13];
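/*
 * CPUID leaf 0 returns the vendor string in EBX, EDX, ECX order,
 * while do_cpuid() fills regs[] as EAX, EBX, ECX, EDX; hence the
 * 1, 3, 2 indexing below.
 */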
do_cpuid(0, regs);
((u_int *)&cpu_vendor)[0] = regs[1];
((u_int *)&cpu_vendor)[1] = regs[3];
((u_int *)&cpu_vendor)[2] = regs[2];
cpu_vendor[12] = '\0';
error = 0;
if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
cpu_vendor_amd = 1;
} else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
cpu_vendor_intel = 1;
} else {
fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor);
error = -1;
}
return (error);
}
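/*
 * Illustrative caller, not part of this file: roughly how a vCPU
 * rdmsr exit would be serviced with emulate_rdmsr(), assuming
 * init_msr() was called once at startup. The exit plumbing and the
 * inject_gp() helper are assumptions made for the example.
 */
#if 0
static void
handle_rdmsr_exit(struct vmctx *ctx, int vcpu, uint32_t msr)
{
	uint64_t val;

	if (emulate_rdmsr(ctx, vcpu, msr, &val) == 0) {
		/* hand val back to the guest in EDX:EAX */
	} else {
		inject_gp(ctx, vcpu);	/* unknown MSR: #GP the guest */
	}
}
#endif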

36
bhyve/xmsr.h Normal file
View file

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _XMSR_H_
#define _XMSR_H_
int init_msr(void);
int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val);
#endif

16
bhyvectl/Makefile Normal file
View file

@ -0,0 +1,16 @@
#
# $FreeBSD$
#
PROG= bhyvectl
SRCS= bhyvectl.c
MAN=
LIBADD= vmmapi
WARNS?= 3
CFLAGS+= -I${.CURDIR}/../../sys/amd64/vmm
.include <bsd.prog.mk>

2142
bhyvectl/bhyvectl.c Normal file

File diff suppressed because it is too large Load diff

13
bhyveload/Makefile Normal file
View file

@ -0,0 +1,13 @@
# $FreeBSD$
PROG= bhyveload
SRCS= bhyveload.c
MAN= bhyveload.8
LIBADD= vmmapi
WARNS?= 3
CFLAGS+=-I${.CURDIR}/../../sys/boot/userboot
.include <bsd.prog.mk>

157
bhyveload/bhyveload.8 Normal file
View file

@ -0,0 +1,157 @@
.\"
.\" Copyright (c) 2012 NetApp Inc
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
.\"
.Dd January 7, 2012
.Dt BHYVELOAD 8
.Os
.Sh NAME
.Nm bhyveload
.Nd load a
.Fx
guest inside a bhyve virtual machine
.Sh SYNOPSIS
.Nm
.Op Fl c Ar cons-dev
.Op Fl d Ar disk-path
.Op Fl e Ar name=value
.Op Fl h Ar host-path
.Op Fl m Ar mem-size
.Ar vmname
.Sh DESCRIPTION
.Nm
is used to load a
.Fx
guest inside a
.Xr bhyve 4
virtual machine.
.Pp
.Nm
is based on
.Xr loader 8
and will present an interface identical to the
.Fx
loader on the user's terminal.
.Pp
The virtual machine is identified as
.Ar vmname
and will be created if it does not already exist.
.Sh OPTIONS
The following options are available:
.Bl -tag -width indent
.It Fl c Ar cons-dev
.Ar cons-dev
is a
.Xr tty 4
device to use for
.Nm
terminal I/O.
.Pp
The text string "stdio" is also accepted and selects the use of
unbuffered standard I/O. This is the default value.
.It Fl d Ar disk-path
The
.Ar disk-path
is the pathname of the guest's boot disk image.
.It Fl e Ar name=value
Set the FreeBSD loader environment variable
.Ar name
to
.Ar value .
.Pp
The option may be used more than once to set more than one environment
variable.
.It Fl h Ar host-path
The
.Ar host-path
is the directory at the top of the guest's boot filesystem.
.It Fl m Ar mem-size Xo
.Sm off
.Op Cm K | k | M | m | G | g | T | t
.Xc
.Sm on
.Ar mem-size
is the amount of memory allocated to the guest.
.Pp
The
.Ar mem-size
argument may be suffixed with one of
.Cm K ,
.Cm M ,
.Cm G
or
.Cm T
(either upper or lower case) to indicate a multiple of
Kilobytes, Megabytes, Gigabytes or Terabytes
respectively.
.Pp
The default value of
.Ar mem-size
is 256M.
.El
.Sh EXAMPLES
To create a virtual machine named
.Ar freebsd-vm
that boots off the ISO image
.Pa /freebsd/release.iso
and has 1GB memory allocated to it:
.Pp
.Dl "bhyveload -m 1G -d /freebsd/release.iso freebsd-vm"
.Pp
To create a virtual machine named
.Ar test-vm
with 256MB of memory allocated, the guest root filesystem under the host
directory
.Pa /usr/images/test
and terminal I/O sent to the
.Xr nmdm 4
device
.Pa /dev/nmdm1B :
.Pp
.Dl "bhyveload -m 256MB -h /usr/images/test -c /dev/nmdm1B test-vm"
.Sh SEE ALSO
.Xr bhyve 4 ,
.Xr nmdm 4 ,
.Xr vmm 4 ,
.Xr bhyve 8 ,
.Xr loader 8
.Sh HISTORY
.Nm
first appeared in
.Fx 10.0 ,
and was developed at NetApp Inc.
.Sh AUTHORS
.Nm
was developed by
.An -nosplit
.An Neel Natu Aq Mt neel@FreeBSD.org
at NetApp Inc with a lot of help from
.An Doug Rabson Aq Mt dfr@FreeBSD.org .
.Sh BUGS
.Nm
can only load
.Fx
as a guest.

746
bhyveload/bhyveload.c Normal file
View file

@ -0,0 +1,746 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*-
* Copyright (c) 2011 Google, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/disk.h>
#include <sys/queue.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <dirent.h>
#include <dlfcn.h>
#include <errno.h>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <libgen.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <termios.h>
#include <unistd.h>
#include <vmmapi.h>
#include "userboot.h"
#define MB (1024 * 1024UL)
#define GB (1024 * 1024 * 1024UL)
#define BSP 0
#define NDISKS 32
static char *host_base;
static struct termios term, oldterm;
static int disk_fd[NDISKS];
static int ndisks;
static int consin_fd, consout_fd;
static char *vmname, *progname;
static struct vmctx *ctx;
static uint64_t gdtbase, cr3, rsp;
static void cb_exit(void *arg, int v);
/*
* Console i/o callbacks
*/
static void
cb_putc(void *arg, int ch)
{
char c = ch;
(void) write(consout_fd, &c, 1);
}
static int
cb_getc(void *arg)
{
char c;
if (read(consin_fd, &c, 1) == 1)
return (c);
return (-1);
}
static int
cb_poll(void *arg)
{
int n;
if (ioctl(consin_fd, FIONREAD, &n) >= 0)
return (n > 0);
return (0);
}
/*
* Host filesystem i/o callbacks
*/
struct cb_file {
int cf_isdir;
size_t cf_size;
struct stat cf_stat;
union {
int fd;
DIR *dir;
} cf_u;
};
static int
cb_open(void *arg, const char *filename, void **hp)
{
struct cb_file *cf;
char path[PATH_MAX];
if (!host_base)
return (ENOENT);
strlcpy(path, host_base, PATH_MAX);
if (path[strlen(path) - 1] == '/')
path[strlen(path) - 1] = 0;
strlcat(path, filename, PATH_MAX);
cf = malloc(sizeof(struct cb_file));
if (stat(path, &cf->cf_stat) < 0) {
free(cf);
return (errno);
}
cf->cf_size = cf->cf_stat.st_size;
if (S_ISDIR(cf->cf_stat.st_mode)) {
cf->cf_isdir = 1;
cf->cf_u.dir = opendir(path);
if (!cf->cf_u.dir)
goto out;
*hp = cf;
return (0);
}
if (S_ISREG(cf->cf_stat.st_mode)) {
cf->cf_isdir = 0;
cf->cf_u.fd = open(path, O_RDONLY);
if (cf->cf_u.fd < 0)
goto out;
*hp = cf;
return (0);
}
out:
free(cf);
return (EINVAL);
}
static int
cb_close(void *arg, void *h)
{
struct cb_file *cf = h;
if (cf->cf_isdir)
closedir(cf->cf_u.dir);
else
close(cf->cf_u.fd);
free(cf);
return (0);
}
static int
cb_isdir(void *arg, void *h)
{
struct cb_file *cf = h;
return (cf->cf_isdir);
}
static int
cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid)
{
struct cb_file *cf = h;
ssize_t sz;
if (cf->cf_isdir)
return (EINVAL);
sz = read(cf->cf_u.fd, buf, size);
if (sz < 0)
return (EINVAL);
*resid = size - sz;
return (0);
}
static int
cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
size_t *namelen_return, char *name)
{
struct cb_file *cf = h;
struct dirent *dp;
if (!cf->cf_isdir)
return (EINVAL);
dp = readdir(cf->cf_u.dir);
if (!dp)
return (ENOENT);
/*
* Note: d_namlen is in the range 0..255 and therefore less
* than PATH_MAX so we don't need to test before copying.
*/
*fileno_return = dp->d_fileno;
*type_return = dp->d_type;
*namelen_return = dp->d_namlen;
memcpy(name, dp->d_name, dp->d_namlen);
name[dp->d_namlen] = 0;
return (0);
}
static int
cb_seek(void *arg, void *h, uint64_t offset, int whence)
{
struct cb_file *cf = h;
if (cf->cf_isdir)
return (EINVAL);
if (lseek(cf->cf_u.fd, offset, whence) < 0)
return (errno);
return (0);
}
static int
cb_stat(void *arg, void *h, int *mode, int *uid, int *gid, uint64_t *size)
{
struct cb_file *cf = h;
*mode = cf->cf_stat.st_mode;
*uid = cf->cf_stat.st_uid;
*gid = cf->cf_stat.st_gid;
*size = cf->cf_stat.st_size;
return (0);
}
/*
* Disk image i/o callbacks
*/
static int
cb_diskread(void *arg, int unit, uint64_t from, void *to, size_t size,
size_t *resid)
{
ssize_t n;
if (unit < 0 || unit >= ndisks)
return (EIO);
n = pread(disk_fd[unit], to, size, from);
if (n < 0)
return (errno);
*resid = size - n;
return (0);
}
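/*
 * Note that the ioctl handler below reports a fixed 512-byte sector
 * size rather than probing the backing store, and answers
 * DIOCGMEDIASIZE with the file size obtained via fstat().
 */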
static int
cb_diskioctl(void *arg, int unit, u_long cmd, void *data)
{
struct stat sb;
if (unit < 0 || unit >= ndisks)
return (EBADF);
switch (cmd) {
case DIOCGSECTORSIZE:
*(u_int *)data = 512;
break;
case DIOCGMEDIASIZE:
if (fstat(disk_fd[unit], &sb) == 0)
*(off_t *)data = sb.st_size;
else
return (ENOTTY);
break;
default:
return (ENOTTY);
}
return (0);
}
/*
* Guest virtual machine i/o callbacks
*/
static int
cb_copyin(void *arg, const void *from, uint64_t to, size_t size)
{
char *ptr;
to &= 0x7fffffff;
ptr = vm_map_gpa(ctx, to, size);
if (ptr == NULL)
return (EFAULT);
memcpy(ptr, from, size);
return (0);
}
static int
cb_copyout(void *arg, uint64_t from, void *to, size_t size)
{
char *ptr;
from &= 0x7fffffff;
ptr = vm_map_gpa(ctx, from, size);
if (ptr == NULL)
return (EFAULT);
memcpy(to, ptr, size);
return (0);
}
static void
cb_setreg(void *arg, int r, uint64_t v)
{
int error;
enum vm_reg_name vmreg;
vmreg = VM_REG_LAST;
switch (r) {
case 4:
vmreg = VM_REG_GUEST_RSP;
rsp = v;
break;
default:
break;
}
if (vmreg == VM_REG_LAST) {
printf("test_setreg(%d): not implemented\n", r);
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
error = vm_set_register(ctx, BSP, vmreg, v);
if (error) {
perror("vm_set_register");
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
}
static void
cb_setmsr(void *arg, int r, uint64_t v)
{
int error;
enum vm_reg_name vmreg;
vmreg = VM_REG_LAST;
switch (r) {
case MSR_EFER:
vmreg = VM_REG_GUEST_EFER;
break;
default:
break;
}
if (vmreg == VM_REG_LAST) {
printf("test_setmsr(%d): not implemented\n", r);
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
error = vm_set_register(ctx, BSP, vmreg, v);
if (error) {
perror("vm_set_msr");
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
}
static void
cb_setcr(void *arg, int r, uint64_t v)
{
int error;
enum vm_reg_name vmreg;
vmreg = VM_REG_LAST;
switch (r) {
case 0:
vmreg = VM_REG_GUEST_CR0;
break;
case 3:
vmreg = VM_REG_GUEST_CR3;
cr3 = v;
break;
case 4:
vmreg = VM_REG_GUEST_CR4;
break;
default:
break;
}
if (vmreg == VM_REG_LAST) {
printf("test_setcr(%d): not implemented\n", r);
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
error = vm_set_register(ctx, BSP, vmreg, v);
if (error) {
perror("vm_set_cr");
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
}
static void
cb_setgdt(void *arg, uint64_t base, size_t size)
{
int error;
error = vm_set_desc(ctx, BSP, VM_REG_GUEST_GDTR, base, size - 1, 0);
if (error != 0) {
perror("vm_set_desc(gdt)");
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
gdtbase = base;
}
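/*
 * In the exec callback below, a cr3 of zero means the loader never
 * installed a page-table base via the setcr callback, which is taken
 * as the cue to start the guest in 32-bit flat mode instead of long
 * mode.
 */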
static void
cb_exec(void *arg, uint64_t rip)
{
int error;
if (cr3 == 0)
error = vm_setup_freebsd_registers_i386(ctx, BSP, rip, gdtbase,
rsp);
else
error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase,
rsp);
if (error) {
perror("vm_setup_freebsd_registers");
cb_exit(NULL, USERBOOT_EXIT_QUIT);
}
cb_exit(NULL, 0);
}
/*
* Misc
*/
static void
cb_delay(void *arg, int usec)
{
usleep(usec);
}
static void
cb_exit(void *arg, int v)
{
tcsetattr(consout_fd, TCSAFLUSH, &oldterm);
exit(v);
}
static void
cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
{
*ret_lowmem = vm_get_lowmem_size(ctx);
*ret_highmem = vm_get_highmem_size(ctx);
}
struct env {
const char *str; /* name=value */
SLIST_ENTRY(env) next;
};
static SLIST_HEAD(envhead, env) envhead;
static void
addenv(const char *str)
{
struct env *env;
env = malloc(sizeof(struct env));
env->str = str;
SLIST_INSERT_HEAD(&envhead, env, next);
}
static const char *
cb_getenv(void *arg, int num)
{
int i;
struct env *env;
i = 0;
SLIST_FOREACH(env, &envhead, next) {
if (i == num)
return (env->str);
i++;
}
return (NULL);
}
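/*
 * The loader retrieves these by invoking the getenv callback with
 * num = 0, 1, 2, ... until NULL is returned; entries added on the
 * command line with -e appear alongside built-in settings such as
 * "smbios.bios.vendor=BHYVE" added in main().
 */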
static struct loader_callbacks cb = {
.getc = cb_getc,
.putc = cb_putc,
.poll = cb_poll,
.open = cb_open,
.close = cb_close,
.isdir = cb_isdir,
.read = cb_read,
.readdir = cb_readdir,
.seek = cb_seek,
.stat = cb_stat,
.diskread = cb_diskread,
.diskioctl = cb_diskioctl,
.copyin = cb_copyin,
.copyout = cb_copyout,
.setreg = cb_setreg,
.setmsr = cb_setmsr,
.setcr = cb_setcr,
.setgdt = cb_setgdt,
.exec = cb_exec,
.delay = cb_delay,
.exit = cb_exit,
.getmem = cb_getmem,
.getenv = cb_getenv,
};
static int
altcons_open(char *path)
{
struct stat sb;
int err;
int fd;
/*
* Allow stdio to be passed in so that the same string
* can be used for the bhyveload console and bhyve com-port
* parameters
*/
if (!strcmp(path, "stdio"))
return (0);
err = stat(path, &sb);
if (err == 0) {
if (!S_ISCHR(sb.st_mode))
err = ENOTSUP;
else {
fd = open(path, O_RDWR | O_NONBLOCK);
if (fd < 0)
err = errno;
else
consin_fd = consout_fd = fd;
}
}
return (err);
}
static int
disk_open(char *path)
{
int err, fd;
if (ndisks >= NDISKS)
return (ERANGE);
err = 0;
fd = open(path, O_RDONLY);
if (fd >= 0) {
disk_fd[ndisks] = fd;
ndisks++;
} else
err = errno;
return (err);
}
static void
usage(void)
{
fprintf(stderr,
"usage: %s [-c <console-device>] [-d <disk-path>] [-e <name=value>]\n"
" %*s [-h <host-path>] [-m mem-size] <vmname>\n",
progname,
(int)strlen(progname), "");
exit(1);
}
int
main(int argc, char** argv)
{
void *h;
void (*func)(struct loader_callbacks *, void *, int, int);
uint64_t mem_size;
int opt, error, need_reinit;
progname = basename(argv[0]);
mem_size = 256 * MB;
consin_fd = STDIN_FILENO;
consout_fd = STDOUT_FILENO;
while ((opt = getopt(argc, argv, "c:d:e:h:m:")) != -1) {
switch (opt) {
case 'c':
error = altcons_open(optarg);
if (error != 0)
errx(EX_USAGE, "Could not open '%s'", optarg);
break;
case 'd':
error = disk_open(optarg);
if (error != 0)
errx(EX_USAGE, "Could not open '%s'", optarg);
break;
case 'e':
addenv(optarg);
break;
case 'h':
host_base = optarg;
break;
case 'm':
error = vm_parse_memsize(optarg, &mem_size);
if (error != 0)
errx(EX_USAGE, "Invalid memsize '%s'", optarg);
break;
case '?':
usage();
}
}
argc -= optind;
argv += optind;
if (argc != 1)
usage();
vmname = argv[0];
need_reinit = 0;
error = vm_create(vmname);
if (error) {
if (errno != EEXIST) {
perror("vm_create");
exit(1);
}
need_reinit = 1;
}
ctx = vm_open(vmname);
if (ctx == NULL) {
perror("vm_open");
exit(1);
}
if (need_reinit) {
error = vm_reinit(ctx);
if (error) {
perror("vm_reinit");
exit(1);
}
}
error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL);
if (error) {
perror("vm_setup_memory");
exit(1);
}
tcgetattr(consout_fd, &term);
oldterm = term;
cfmakeraw(&term);
term.c_cflag |= CLOCAL;
tcsetattr(consout_fd, TCSAFLUSH, &term);
h = dlopen("/boot/userboot.so", RTLD_LOCAL);
if (!h) {
printf("%s\n", dlerror());
return (1);
}
func = dlsym(h, "loader_main");
if (!func) {
printf("%s\n", dlerror());
return (1);
}
addenv("smbios.bios.vendor=BHYVE");
addenv("boot_serial=1");
func(&cb, NULL, USERBOOT_VERSION_3, ndisks);
}

13
libvmmapi/Makefile Normal file

@ -0,0 +1,13 @@
# $FreeBSD$
LIB= vmmapi
SRCS= vmmapi.c vmmapi_freebsd.c
INCS= vmmapi.h
WARNS?= 2
LIBADD= util
CFLAGS+= -I${.CURDIR}
.include <bsd.lib.mk>

1201
libvmmapi/vmmapi.c Normal file

File diff suppressed because it is too large

173
libvmmapi/vmmapi.h Normal file

@ -0,0 +1,173 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMMAPI_H_
#define _VMMAPI_H_
#include <sys/param.h>
#include <sys/cpuset.h>
/*
* API version for out-of-tree consumers like grub-bhyve for making compile
* time decisions.
*/
#define VMMAPI_VERSION 0101 /* 2 digit major followed by 2 digit minor */
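/*
 * A hypothetical compile-time check (illustrative only; version 0102
 * does not exist as of this header):
 *
 * #if VMMAPI_VERSION >= 0102
 * error = some_newer_vmmapi_call(ctx);
 * #endif
 */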
struct iovec;
struct vmctx;
enum x2apic_state;
/*
* Different styles of mapping the memory assigned to a VM into the address
* space of the controlling process.
*/
enum vm_mmap_style {
VM_MMAP_NONE, /* no mapping */
VM_MMAP_ALL, /* fully and statically mapped */
VM_MMAP_SPARSE, /* mappings created on-demand */
};
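/*
 * For example, bhyveload maps the guest fully and statically into its
 * own address space with:
 *
 * vm_setup_memory(ctx, mem_size, VM_MMAP_ALL);
 */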
#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_parse_memsize(const char *optarg, size_t *memsize);
int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
int *wired);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *fault);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
void vm_set_memflags(struct vmctx *ctx, int flags);
size_t vm_get_lowmem_size(struct vmctx *ctx);
size_t vm_get_highmem_size(struct vmctx *ctx);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access);
int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access);
int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
struct seg_desc *seg_desc);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
int vm_reinit(struct vmctx *ctx);
int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
int errcode_valid, uint32_t errcode, int restart_instruction);
int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
int vm_ioapic_assert_irq(struct vmctx *ctx, int irq);
int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq);
int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq);
int vm_ioapic_pincount(struct vmctx *ctx, int *pincount);
int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
enum vm_intr_trigger trigger);
int vm_inject_nmi(struct vmctx *ctx, int vcpu);
int vm_capability_name2type(const char *capname);
const char *vm_capability_type2name(int type);
int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval);
int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int val);
int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot,
int func, uint64_t addr, uint64_t msg, int numvec);
int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
int func, int idx, uint64_t addr, uint64_t msg,
uint32_t vector_control);
int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);
/*
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
*/
uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries);
const char *vm_get_stat_desc(struct vmctx *ctx, int index);
int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
/*
* Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
* The 'iovcnt' should be big enough to accommodate all GPA segments.
*
* retval fault Interpretation
* 0 0 Success
* 0 1 An exception was injected into the guest
* EFAULT N/A Error
*/
int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
int *fault);
void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
void *host_dst, size_t len);
void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
struct iovec *guest_iov, size_t len);
void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
int iovcnt);
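/*
 * A minimal usage sketch (hypothetical caller; 'paging', 'gla', 'len'
 * and 'buf' are assumed), following the retval/fault table above:
 *
 * struct iovec iov[8];
 * int error, fault;
 *
 * error = vm_copy_setup(ctx, vcpu, &paging, gla, len, PROT_READ,
 *     iov, 8, &fault);
 * if (error == 0 && fault == 0) {
 *         vm_copyin(ctx, vcpu, iov, buf, len);
 *         vm_copy_teardown(ctx, vcpu, iov, 8);
 * }
 */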
/* RTC */
int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value);
int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval);
int vm_rtc_settime(struct vmctx *ctx, time_t secs);
int vm_rtc_gettime(struct vmctx *ctx, time_t *secs);
/* Reset vcpu register state */
int vcpu_reset(struct vmctx *ctx, int vcpu);
int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus);
int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus);
int vm_activate_cpu(struct vmctx *ctx, int vcpu);
/*
* FreeBSD specific APIs
*/
int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
uint64_t rip, uint64_t cr3, uint64_t gdtbase,
uint64_t rsp);
int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
uint32_t eip, uint32_t gdtbase,
uint32_t esp);
void vm_setup_freebsd_gdt(uint64_t *gdtr);
#endif /* _VMMAPI_H_ */

345
libvmmapi/vmmapi_freebsd.c Normal file

@ -0,0 +1,345 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/vmm.h>
#include <errno.h>
#include <string.h>
#include "vmmapi.h"
#define I386_TSS_SIZE 104
#define DESC_PRESENT 0x00000080
#define DESC_LONGMODE 0x00002000
#define DESC_DEF32 0x00004000
#define DESC_GRAN 0x00008000
#define DESC_UNUSABLE 0x00010000
#define GUEST_NULL_SEL 0
#define GUEST_CODE_SEL 1
#define GUEST_DATA_SEL 2
#define GUEST_TSS_SEL 3
#define GUEST_GDTR_LIMIT64 (3 * 8 - 1)
static struct segment_descriptor i386_gdt[] = {
{}, /* NULL */
{ .sd_lolimit = 0xffff, .sd_type = SDT_MEMER, /* CODE */
.sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 },
{ .sd_lolimit = 0xffff, .sd_type = SDT_MEMRW, /* DATA */
.sd_p = 1, .sd_hilimit = 0xf, .sd_def32 = 1, .sd_gran = 1 },
{ .sd_lolimit = I386_TSS_SIZE - 1, /* TSS */
.sd_type = SDT_SYS386TSS, .sd_p = 1 }
};
/*
* Setup the 'vcpu' register set such that it will begin execution at
* 'eip' in flat mode.
*/
int
vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu, uint32_t eip,
uint32_t gdtbase, uint32_t esp)
{
uint64_t cr0, rflags, desc_base;
uint32_t desc_access, desc_limit, tssbase;
uint16_t gsel;
struct segment_descriptor *gdt;
int error, tmp;
/* A 32-bit guest requires unrestricted mode. */
error = vm_get_capability(vmctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp);
if (error)
goto done;
error = vm_set_capability(vmctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
if (error)
goto done;
cr0 = CR0_PE | CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, 0)) != 0)
goto done;
/*
* Forcing EFER to 0 causes bhyve to clear the "IA-32e guest
* mode" entry control.
*/
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, 0)))
goto done;
gdt = vm_map_gpa(vmctx, gdtbase, 0x1000);
if (gdt == NULL)
return (EFAULT);
memcpy(gdt, i386_gdt, sizeof(i386_gdt));
desc_base = gdtbase;
desc_limit = sizeof(i386_gdt) - 1;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, 0);
if (error != 0)
goto done;
/* Place the TSS one page above the GDT. */
tssbase = gdtbase + 0x1000;
gdt[3].sd_lobase = tssbase;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
desc_base = 0;
desc_limit = 0xffffffff;
desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMERA;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
desc_access = DESC_GRAN | DESC_DEF32 | DESC_PRESENT | SDT_MEMRWA;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
desc_base = tssbase;
desc_limit = I386_TSS_SIZE - 1;
desc_access = DESC_PRESENT | SDT_SYS386BSY;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
DESC_UNUSABLE);
if (error)
goto done;
gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
goto done;
gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
goto done;
gsel = GSEL(GUEST_TSS_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, gsel)) != 0)
goto done;
/* LDTR is pointing to the null selector */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* entry point */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, eip)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, esp)) != 0)
goto done;
error = 0;
done:
return (error);
}
void
vm_setup_freebsd_gdt(uint64_t *gdtr)
{
gdtr[GUEST_NULL_SEL] = 0;
gdtr[GUEST_CODE_SEL] = 0x0020980000000000;
gdtr[GUEST_DATA_SEL] = 0x0000900000000000;
}
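/*
 * Decoding the constants above: bits 47:40 of a descriptor hold the
 * access byte, so 0x98 marks a present, DPL 0 code segment and 0x90 a
 * present data segment; the 0x0020 in the next word sets the "L" bit
 * (bit 53) that makes the code segment 64-bit.
 */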
/*
* Setup the 'vcpu' register set such that it will begin execution at
* 'rip' in long mode.
*/
int
vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
uint64_t rip, uint64_t cr3, uint64_t gdtbase,
uint64_t rsp)
{
int error;
uint64_t cr0, cr4, efer, rflags, desc_base;
uint32_t desc_access, desc_limit;
uint16_t gsel;
cr0 = CR0_PE | CR0_PG | CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
cr4 = CR4_PAE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
goto done;
efer = EFER_LME | EFER_LMA;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer)))
goto done;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
desc_base = 0;
desc_limit = 0;
desc_access = 0x0000209B;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
desc_access = 0x00000093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
/*
* XXX TR is pointing to null selector even though we set the
* TSS segment to be usable with a base address and limit of 0.
*/
desc_access = 0x0000008b;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, 0, 0,
DESC_UNUSABLE);
if (error)
goto done;
gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
goto done;
gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
goto done;
/* XXX TR is pointing to the null selector */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, 0)) != 0)
goto done;
/* LDTR is pointing to the null selector */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* entry point */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
goto done;
/* page table base */
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
goto done;
desc_base = gdtbase;
desc_limit = GUEST_GDTR_LIMIT64;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, 0);
if (error != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, rsp)) != 0)
goto done;
error = 0;
done:
return (error);
}

648
vmm.h Normal file

@ -0,0 +1,648 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_H_
#define _VMM_H_
#include <x86/segments.h>
enum vm_suspend_how {
VM_SUSPEND_NONE,
VM_SUSPEND_RESET,
VM_SUSPEND_POWEROFF,
VM_SUSPEND_HALT,
VM_SUSPEND_TRIPLEFAULT,
VM_SUSPEND_LAST
};
/*
* Identifiers for architecturally defined registers.
*/
enum vm_reg_name {
VM_REG_GUEST_RAX,
VM_REG_GUEST_RBX,
VM_REG_GUEST_RCX,
VM_REG_GUEST_RDX,
VM_REG_GUEST_RSI,
VM_REG_GUEST_RDI,
VM_REG_GUEST_RBP,
VM_REG_GUEST_R8,
VM_REG_GUEST_R9,
VM_REG_GUEST_R10,
VM_REG_GUEST_R11,
VM_REG_GUEST_R12,
VM_REG_GUEST_R13,
VM_REG_GUEST_R14,
VM_REG_GUEST_R15,
VM_REG_GUEST_CR0,
VM_REG_GUEST_CR3,
VM_REG_GUEST_CR4,
VM_REG_GUEST_DR7,
VM_REG_GUEST_RSP,
VM_REG_GUEST_RIP,
VM_REG_GUEST_RFLAGS,
VM_REG_GUEST_ES,
VM_REG_GUEST_CS,
VM_REG_GUEST_SS,
VM_REG_GUEST_DS,
VM_REG_GUEST_FS,
VM_REG_GUEST_GS,
VM_REG_GUEST_LDTR,
VM_REG_GUEST_TR,
VM_REG_GUEST_IDTR,
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
VM_REG_GUEST_CR2,
VM_REG_GUEST_PDPTE0,
VM_REG_GUEST_PDPTE1,
VM_REG_GUEST_PDPTE2,
VM_REG_GUEST_PDPTE3,
VM_REG_GUEST_INTR_SHADOW,
VM_REG_LAST
};
enum x2apic_state {
X2APIC_DISABLED,
X2APIC_ENABLED,
X2APIC_STATE_LAST
};
#define VM_INTINFO_VECTOR(info) ((info) & 0xff)
#define VM_INTINFO_DEL_ERRCODE 0x800
#define VM_INTINFO_RSVD 0x7ffff000
#define VM_INTINFO_VALID 0x80000000
#define VM_INTINFO_TYPE 0x700
#define VM_INTINFO_HWINTR (0 << 8)
#define VM_INTINFO_NMI (2 << 8)
#define VM_INTINFO_HWEXCEPTION (3 << 8)
#define VM_INTINFO_SWINTR (4 << 8)
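/*
 * For example, a #GP (vector 13) delivered as a hardware exception
 * with an error code would be encoded as
 *
 * 13 | VM_INTINFO_HWEXCEPTION | VM_INTINFO_DEL_ERRCODE | VM_INTINFO_VALID
 *
 * with the error code itself carried in the upper 32 bits of the
 * 64-bit intinfo value (see the APM EXITINTINFO layout referenced
 * below).
 */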
#ifdef _KERNEL
#define VM_MAX_NAMELEN 32
struct vm;
struct vm_exception;
struct vm_memory_segment;
struct seg_desc;
struct vm_exit;
struct vm_run;
struct vhpet;
struct vioapic;
struct vlapic;
struct vmspace;
struct vm_object;
struct vm_guest_paging;
struct pmap;
typedef int (*vmm_init_func_t)(int ipinum);
typedef int (*vmm_cleanup_func_t)(void);
typedef void (*vmm_resume_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
struct pmap *pmap, void *rendezvous_cookie,
void *suspend_cookie);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
uint64_t val);
typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
vmm_cleanup_func_t cleanup;
vmm_resume_func_t resume;
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
vmi_set_desc_t vmsetdesc;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
vmi_vmspace_free vmspace_free;
vmi_vlapic_init vlapic_init;
vmi_vlapic_cleanup vlapic_cleanup;
};
extern struct vmm_ops vmm_ops_intel;
extern struct vmm_ops vmm_ops_amd;
int vm_create(const char *name, struct vm **retvm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
void **cookie);
void vm_gpa_release(void *cookie);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
struct vm_memory_segment *seg);
int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
vm_offset_t *offset, struct vm_object **object);
boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc);
int vm_run(struct vm *vm, struct vm_run *vmrun);
int vm_suspend(struct vm *vm, enum vm_suspend_how how);
int vm_inject_nmi(struct vm *vm, int vcpu);
int vm_nmi_pending(struct vm *vm, int vcpuid);
void vm_nmi_clear(struct vm *vm, int vcpuid);
int vm_inject_extint(struct vm *vm, int vcpu);
int vm_extint_pending(struct vm *vm, int vcpuid);
void vm_extint_clear(struct vm *vm, int vcpuid);
struct vlapic *vm_lapic(struct vm *vm, int cpu);
struct vioapic *vm_ioapic(struct vm *vm);
struct vhpet *vm_hpet(struct vm *vm);
int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
int vm_apicid2vcpuid(struct vm *vm, int apicid);
int vm_activate_cpu(struct vm *vm, int vcpu);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
#ifdef _SYS__CPUSET_H_
/*
* Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
* The rendezvous 'func(arg)' is not allowed to do anything that will
* cause the thread to be put to sleep.
*
* If the rendezvous is being initiated from a vcpu context then the
* 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
*
* The caller cannot hold any locks when initiating the rendezvous.
*
* The implementation of this API may cause vcpus other than those specified
* by 'dest' to be stalled. The caller should not rely on any vcpus making
* forward progress when the rendezvous is in progress.
*/
typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
vm_rendezvous_func_t func, void *arg);
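/*
 * Illustrative sketch (hypothetical handler): rendezvous every active
 * vcpu and have each one bump a counter.  The handler must not sleep,
 * so only a simple atomic is used; 'vcpuid' is -1 because the
 * initiator is not a vcpu thread:
 *
 * static void
 * count_one(struct vm *vm, int vcpuid, void *arg)
 * {
 *         atomic_add_int((u_int *)arg, 1);
 * }
 *
 * u_int count = 0;
 * vm_smp_rendezvous(vm, -1, vm_active_cpus(vm), count_one, &count);
 */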
cpuset_t vm_active_cpus(struct vm *vm);
cpuset_t vm_suspended_cpus(struct vm *vm);
#endif /* _SYS__CPUSET_H_ */
static __inline int
vcpu_rendezvous_pending(void *rendezvous_cookie)
{
return (*(uintptr_t *)rendezvous_cookie != 0);
}
static __inline int
vcpu_suspended(void *suspend_cookie)
{
return (*(int *)suspend_cookie);
}
/*
* Return 1 if device indicated by bus/slot/func is supposed to be a
* pci passthrough device.
*
* Return 0 otherwise.
*/
int vmm_is_pptdev(int bus, int slot, int func);
void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
VCPU_FROZEN,
VCPU_RUNNING,
VCPU_SLEEPING,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
bool from_idle);
enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
static int __inline
vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
{
return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
}
#ifdef _SYS_PROC_H_
static int __inline
vcpu_should_yield(struct vm *vm, int vcpu)
{
if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED))
return (1);
else if (curthread->td_owepreempt)
return (1);
else
return (0);
}
#endif
void *vcpu_stats(struct vm *vm, int vcpu);
void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
struct vatpic *vm_atpic(struct vm *vm);
struct vatpit *vm_atpit(struct vm *vm);
struct vpmtmr *vm_pmtmr(struct vm *vm);
struct vrtc *vm_rtc(struct vm *vm);
/*
* Inject exception 'vector' into the guest vcpu. This function returns 0 on
* success and non-zero on failure.
*
* Wrapper functions like 'vm_inject_gp()' should be preferred to calling
* this function directly because they enforce the trap-like or fault-like
* behavior of an exception.
*
* This function should only be called in the context of the thread that is
* executing this vcpu.
*/
int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid,
uint32_t errcode, int restart_instruction);
/*
* This function is called after a VM-exit that occurred during exception or
* interrupt delivery through the IDT. The format of 'intinfo' is described
* in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
*
* If a VM-exit handler completes the event delivery successfully then it
* should call vm_exit_intinfo() to extinguish the pending event. For example,
* if the task switch emulation is triggered via a task gate then it should
* call this function with 'intinfo=0' to indicate that the external event
* is not pending anymore.
*
* Return value is 0 on success and non-zero on failure.
*/
int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
/*
* This function is called before every VM-entry to retrieve a pending
* event that should be injected into the guest. This function combines
* nested events into a double or triple fault.
*
* Returns 0 if there are no events that need to be injected into the guest
* and non-zero otherwise.
*/
int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
enum vm_reg_name vm_segment_name(int seg_encoding);
struct vm_copyinfo {
uint64_t gpa;
size_t len;
void *hva;
void *cookie;
};
/*
* Set up 'copyinfo[]' to copy to/from guest linear address space starting
* at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
* a copyin or PROT_WRITE for a copyout.
*
* retval is_fault Interpretation
* 0 0 Success
* 0 1 An exception was injected into the guest
* EFAULT N/A Unrecoverable error
*
* The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
* the return value is 0. The 'copyinfo[]' resources should be freed by calling
* 'vm_copy_teardown()' after the copy is done.
*/
int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
int num_copyinfo, int *is_fault);
void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
int num_copyinfo);
void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
void *kaddr, size_t len);
void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
struct vm_copyinfo *copyinfo, size_t len);
int vcpu_trace_exceptions(struct vm *vm, int vcpuid);
#endif /* _KERNEL */
#define VM_MAXCPU 16 /* maximum virtual cpus */
/*
* Identifiers for optional vmm capabilities
*/
enum vm_cap_type {
VM_CAP_HALT_EXIT,
VM_CAP_MTRAP_EXIT,
VM_CAP_PAUSE_EXIT,
VM_CAP_UNRESTRICTED_GUEST,
VM_CAP_ENABLE_INVPCID,
VM_CAP_MAX
};
enum vm_intr_trigger {
EDGE_TRIGGER,
LEVEL_TRIGGER
};
/*
* The 'access' field has the format specified in Table 21-2 of the Intel
* Architecture Manual vol 3b.
*
* XXX The contents of the 'access' field are architecturally defined except
* bit 16 - Segment Unusable.
*/
struct seg_desc {
uint64_t base;
uint32_t limit;
uint32_t access;
};
#define SEG_DESC_TYPE(access) ((access) & 0x001f)
#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)
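/*
 * Worked example: the i386 setup code in vmmapi_freebsd.c builds CS
 * with access 0xc09b (DESC_GRAN | DESC_DEF32 | DESC_PRESENT |
 * SDT_MEMERA), which decodes here as SEG_DESC_TYPE() == 0x1b
 * (execute/read, accessed), SEG_DESC_DPL() == 0, and the present,
 * 32-bit default and granularity bits all set.
 */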
enum vm_cpu_mode {
CPU_MODE_REAL,
CPU_MODE_PROTECTED,
CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
};
enum vm_paging_mode {
PAGING_MODE_FLAT,
PAGING_MODE_32,
PAGING_MODE_PAE,
PAGING_MODE_64,
};
struct vm_guest_paging {
uint64_t cr3;
int cpl;
enum vm_cpu_mode cpu_mode;
enum vm_paging_mode paging_mode;
};
/*
* The data structures 'vie' and 'vie_op' are meant to be opaque to the
* consumers of instruction decoding. The only reason why their contents
* need to be exposed is because they are part of the 'vm_exit' structure.
*/
struct vie_op {
uint8_t op_byte; /* actual opcode byte */
uint8_t op_type; /* type of operation (e.g. MOV) */
uint16_t op_flags;
};
#define VIE_INST_SIZE 15
struct vie {
uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
uint8_t num_valid; /* size of the instruction */
uint8_t num_processed;
uint8_t addrsize:4, opsize:4; /* address and operand sizes */
uint8_t rex_w:1, /* REX prefix */
rex_r:1,
rex_x:1,
rex_b:1,
rex_present:1,
repz_present:1, /* REP/REPE/REPZ prefix */
repnz_present:1, /* REPNE/REPNZ prefix */
opsize_override:1, /* Operand size override */
addrsize_override:1, /* Address size override */
segment_override:1; /* Segment override */
uint8_t mod:2, /* ModRM byte */
reg:4,
rm:4;
uint8_t ss:2, /* SIB byte */
index:4,
base:4;
uint8_t disp_bytes;
uint8_t imm_bytes;
uint8_t scale;
int base_register; /* VM_REG_GUEST_xyz */
int index_register; /* VM_REG_GUEST_xyz */
int segment_register; /* VM_REG_GUEST_xyz */
int64_t displacement; /* optional addr displacement */
int64_t immediate; /* optional immediate operand */
uint8_t decoded; /* set to 1 if successfully decoded */
struct vie_op op; /* opcode description */
};
enum vm_exitcode {
VM_EXITCODE_INOUT,
VM_EXITCODE_VMX,
VM_EXITCODE_BOGUS,
VM_EXITCODE_RDMSR,
VM_EXITCODE_WRMSR,
VM_EXITCODE_HLT,
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */
VM_EXITCODE_RENDEZVOUS,
VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_SUSPENDED,
VM_EXITCODE_INOUT_STR,
VM_EXITCODE_TASK_SWITCH,
VM_EXITCODE_MONITOR,
VM_EXITCODE_MWAIT,
VM_EXITCODE_SVM,
VM_EXITCODE_MAX
};
struct vm_inout {
uint16_t bytes:3; /* 1 or 2 or 4 */
uint16_t in:1;
uint16_t string:1;
uint16_t rep:1;
uint16_t port;
uint32_t eax; /* valid for out */
};
struct vm_inout_str {
struct vm_inout inout; /* must be the first element */
struct vm_guest_paging paging;
uint64_t rflags;
uint64_t cr0;
uint64_t index;
uint64_t count; /* rep=1 (%rcx), rep=0 (1) */
int addrsize;
enum vm_reg_name seg_name;
struct seg_desc seg_desc;
};
enum task_switch_reason {
TSR_CALL,
TSR_IRET,
TSR_JMP,
TSR_IDT_GATE, /* task gate in IDT */
};
struct vm_task_switch {
uint16_t tsssel; /* new TSS selector */
int ext; /* task switch due to external event */
uint32_t errcode;
int errcode_valid; /* push 'errcode' on the new stack */
enum task_switch_reason reason;
struct vm_guest_paging paging;
};
struct vm_exit {
enum vm_exitcode exitcode;
int inst_length; /* 0 means unknown */
uint64_t rip;
union {
struct vm_inout inout;
struct vm_inout_str inout_str;
struct {
uint64_t gpa;
int fault_type;
} paging;
struct {
uint64_t gpa;
uint64_t gla;
uint64_t cs_base;
int cs_d; /* CS.D */
struct vm_guest_paging paging;
struct vie vie;
} inst_emul;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.
*/
struct {
int status; /* vmx inst status */
/*
* 'exit_reason' and 'exit_qualification' are valid
* only if 'status' is zero.
*/
uint32_t exit_reason;
uint64_t exit_qualification;
/*
* 'inst_error' and 'inst_type' are valid
* only if 'status' is non-zero.
*/
int inst_type;
int inst_error;
} vmx;
/*
* SVM specific payload.
*/
struct {
uint64_t exitcode;
uint64_t exitinfo1;
uint64_t exitinfo2;
} svm;
struct {
uint32_t code; /* ecx value */
uint64_t wval;
} msr;
struct {
int vcpu;
uint64_t rip;
} spinup_ap;
struct {
uint64_t rflags;
} hlt;
struct {
int vector;
} ioapic_eoi;
struct {
enum vm_suspend_how how;
} suspended;
struct vm_task_switch task_switch;
} u;
};
/* APIs to inject faults into the guest */
void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
int errcode);
static __inline void
vm_inject_ud(void *vm, int vcpuid)
{
vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
}
static __inline void
vm_inject_gp(void *vm, int vcpuid)
{
vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
}
static __inline void
vm_inject_ac(void *vm, int vcpuid, int errcode)
{
vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
}
static __inline void
vm_inject_ss(void *vm, int vcpuid, int errcode)
{
vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
}
void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
int vm_restart_instruction(void *vm, int vcpuid);
#endif /* _VMM_H_ */

133
vmm/amd/amdv.c Normal file

@ -0,0 +1,133 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <machine/vmm.h>
#include "io/iommu.h"
static int
amd_iommu_init(void)
{
printf("amd_iommu_init: not implemented\n");
return (ENXIO);
}
static void
amd_iommu_cleanup(void)
{
printf("amd_iommu_cleanup: not implemented\n");
}
static void
amd_iommu_enable(void)
{
printf("amd_iommu_enable: not implemented\n");
}
static void
amd_iommu_disable(void)
{
printf("amd_iommu_disable: not implemented\n");
}
static void *
amd_iommu_create_domain(vm_paddr_t maxaddr)
{
printf("amd_iommu_create_domain: not implemented\n");
return (NULL);
}
static void
amd_iommu_destroy_domain(void *domain)
{
printf("amd_iommu_destroy_domain: not implemented\n");
}
static uint64_t
amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
uint64_t len)
{
printf("amd_iommu_create_mapping: not implemented\n");
return (0);
}
static uint64_t
amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
{
printf("amd_iommu_remove_mapping: not implemented\n");
return (0);
}
static void
amd_iommu_add_device(void *domain, uint16_t rid)
{
printf("amd_iommu_add_device: not implemented\n");
}
static void
amd_iommu_remove_device(void *domain, uint16_t rid)
{
printf("amd_iommu_remove_device: not implemented\n");
}
static void
amd_iommu_invalidate_tlb(void *domain)
{
printf("amd_iommu_invalidate_tlb: not implemented\n");
}
struct iommu_ops iommu_ops_amd = {
amd_iommu_init,
amd_iommu_cleanup,
amd_iommu_enable,
amd_iommu_disable,
amd_iommu_create_domain,
amd_iommu_destroy_domain,
amd_iommu_create_mapping,
amd_iommu_remove_mapping,
amd_iommu_add_device,
amd_iommu_remove_device,
amd_iommu_invalidate_tlb,
};

87
vmm/amd/npt.c Normal file

@ -0,0 +1,87 @@
/*-
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <machine/pmap.h>
#include "npt.h"
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL);
static int npt_flags;
SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD,
&npt_flags, 0, NULL);
#define NPT_IPIMASK 0xFF
/*
* AMD nested page table init.
*/
int
svm_npt_init(int ipinum)
{
int enable_superpage = 1;
npt_flags = ipinum & NPT_IPIMASK;
TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage);
if (enable_superpage)
npt_flags |= PMAP_PDE_SUPERPAGE;
return (0);
}
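/*
 * Superpage promotion can be disabled before the vmm module is loaded
 * by setting the tunable in /boot/loader.conf:
 *
 * hw.vmm.npt.enable_superpage="0"
 */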
static int
npt_pinit(pmap_t pmap)
{
return (pmap_pinit_type(pmap, PT_RVI, npt_flags));
}
struct vmspace *
svm_npt_alloc(vm_offset_t min, vm_offset_t max)
{
return (vmspace_alloc(min, max, npt_pinit));
}
void
svm_npt_free(struct vmspace *vmspace)
{
vmspace_free(vmspace);
}

36
vmm/amd/npt.h Normal file

@ -0,0 +1,36 @@
/*-
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SVM_NPT_H_
#define _SVM_NPT_H_
int svm_npt_init(int ipinum);
struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max);
void svm_npt_free(struct vmspace *vmspace);
#endif /* _SVM_NPT_H_ */

2259
vmm/amd/svm.c Normal file

File diff suppressed because it is too large

54
vmm/amd/svm.h Normal file

@ -0,0 +1,54 @@
/*-
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SVM_H_
#define _SVM_H_
/*
* Guest register state that is saved outside the VMCB.
*/
struct svm_regctx {
register_t sctx_rbp;
register_t sctx_rbx;
register_t sctx_rcx;
register_t sctx_rdx;
register_t sctx_rdi;
register_t sctx_rsi;
register_t sctx_r8;
register_t sctx_r9;
register_t sctx_r10;
register_t sctx_r11;
register_t sctx_r12;
register_t sctx_r13;
register_t sctx_r14;
register_t sctx_r15;
};
void svm_launch(uint64_t pa, struct svm_regctx *);
#endif /* _SVM_H_ */

48
vmm/amd/svm_genassym.c Normal file

@ -0,0 +1,48 @@
/*-
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/assym.h>
#include "svm.h"
ASSYM(SCTX_RBX, offsetof(struct svm_regctx, sctx_rbx));
ASSYM(SCTX_RCX, offsetof(struct svm_regctx, sctx_rcx));
ASSYM(SCTX_RBP, offsetof(struct svm_regctx, sctx_rbp));
ASSYM(SCTX_RDX, offsetof(struct svm_regctx, sctx_rdx));
ASSYM(SCTX_RDI, offsetof(struct svm_regctx, sctx_rdi));
ASSYM(SCTX_RSI, offsetof(struct svm_regctx, sctx_rsi));
ASSYM(SCTX_R8, offsetof(struct svm_regctx, sctx_r8));
ASSYM(SCTX_R9, offsetof(struct svm_regctx, sctx_r9));
ASSYM(SCTX_R10, offsetof(struct svm_regctx, sctx_r10));
ASSYM(SCTX_R11, offsetof(struct svm_regctx, sctx_r11));
ASSYM(SCTX_R12, offsetof(struct svm_regctx, sctx_r12));
ASSYM(SCTX_R13, offsetof(struct svm_regctx, sctx_r13));
ASSYM(SCTX_R14, offsetof(struct svm_regctx, sctx_r14));
ASSYM(SCTX_R15, offsetof(struct svm_regctx, sctx_r15));

165
vmm/amd/svm_msr.c Normal file

@ -0,0 +1,165 @@
/*-
* Copyright (c) 2014, Neel Natu (neel@freebsd.org)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include "svm.h"
#include "vmcb.h"
#include "svm_softc.h"
#include "svm_msr.h"
#ifndef MSR_AMDK8_IPM
#define MSR_AMDK8_IPM 0xc0010055
#endif
enum {
IDX_MSR_LSTAR,
IDX_MSR_CSTAR,
IDX_MSR_STAR,
IDX_MSR_SF_MASK,
HOST_MSR_NUM /* must be the last enumeration */
};
static uint64_t host_msrs[HOST_MSR_NUM];
void
svm_msr_init(void)
{
/*
* It is safe to cache the values of the following MSRs because they
* don't change based on curcpu, curproc or curthread.
*/
host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
}
void
svm_msr_guest_init(struct svm_softc *sc, int vcpu)
{
/*
* All the MSRs accessible to the guest are either saved/restored by
* hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored
* by VMSAVE/VMLOAD (e.g., MSR_GSBASE).
*
* There are no guest MSRs that are saved/restored "by hand" so nothing
* more to do here.
*/
return;
}
void
svm_msr_guest_enter(struct svm_softc *sc, int vcpu)
{
/*
* Save host MSRs (if any) and restore guest MSRs (if any).
*/
}
void
svm_msr_guest_exit(struct svm_softc *sc, int vcpu)
{
/*
* Save guest MSRs (if any) and restore host MSRs.
*/
wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
/* MSR_KGSBASE will be restored on the way back to userspace */
}
int
svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result,
bool *retu)
{
int error = 0;
switch (num) {
case MSR_MCG_CAP:
case MSR_MCG_STATUS:
*result = 0;
break;
case MSR_MTRRcap:
case MSR_MTRRdefType:
case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
case MSR_MTRR64kBase:
case MSR_SYSCFG:
*result = 0;
break;
case MSR_AMDK8_IPM:
*result = 0;
break;
default:
error = EINVAL;
break;
}
return (error);
}
int
svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu)
{
int error = 0;
switch (num) {
case MSR_MCG_CAP:
case MSR_MCG_STATUS:
break; /* ignore writes */
case MSR_MTRRcap:
vm_inject_gp(sc->vm, vcpu);
break;
case MSR_MTRRdefType:
case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
case MSR_MTRR64kBase:
case MSR_SYSCFG:
break; /* Ignore writes */
case MSR_AMDK8_IPM:
/*
* Ignore writes to the "Interrupt Pending Message" MSR.
*/
break;
default:
error = EINVAL;
break;
}
return (error);
}

44
vmm/amd/svm_msr.h Normal file
View file

@ -0,0 +1,44 @@
/*-
* Copyright (c) 2014 Neel Natu (neel@freebsd.org)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SVM_MSR_H_
#define _SVM_MSR_H_
struct svm_softc;
void svm_msr_init(void);
void svm_msr_guest_init(struct svm_softc *sc, int vcpu);
void svm_msr_guest_enter(struct svm_softc *sc, int vcpu);
void svm_msr_guest_exit(struct svm_softc *sc, int vcpu);
int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
bool *retu);
int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result,
bool *retu);
#endif /* _SVM_MSR_H_ */

114
vmm/amd/svm_softc.h Normal file
View file

@ -0,0 +1,114 @@
/*-
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SVM_SOFTC_H_
#define _SVM_SOFTC_H_
#define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE)
#define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE)
struct asid {
uint64_t gen; /* range is [1, ~0UL] */
uint32_t num; /* range is [1, nasid - 1] */
};
/*
* XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space
* due to VMCB alignment requirements.
*/
struct svm_vcpu {
struct vmcb vmcb; /* hardware saved vcpu context */
struct svm_regctx swctx; /* software saved vcpu context */
uint64_t vmcb_pa; /* VMCB physical address */
uint64_t nextrip; /* next instruction to be executed by guest */
int lastcpu; /* host cpu that the vcpu last ran on */
uint32_t dirty; /* state cache bits that must be cleared */
long eptgen; /* pmap->pm_eptgen when the vcpu last ran */
struct asid asid;
} __aligned(PAGE_SIZE);
/*
* SVM softc, one per virtual machine.
*/
struct svm_softc {
uint8_t iopm_bitmap[SVM_IO_BITMAP_SIZE]; /* shared by all vcpus */
uint8_t msr_bitmap[SVM_MSR_BITMAP_SIZE]; /* shared by all vcpus */
uint8_t apic_page[VM_MAXCPU][PAGE_SIZE];
struct svm_vcpu vcpu[VM_MAXCPU];
vm_offset_t nptp; /* nested page table */
struct vm *vm;
} __aligned(PAGE_SIZE);
CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0);
static __inline struct svm_vcpu *
svm_get_vcpu(struct svm_softc *sc, int vcpu)
{
return (&(sc->vcpu[vcpu]));
}
static __inline struct vmcb *
svm_get_vmcb(struct svm_softc *sc, int vcpu)
{
return (&(sc->vcpu[vcpu].vmcb));
}
static __inline struct vmcb_state *
svm_get_vmcb_state(struct svm_softc *sc, int vcpu)
{
return (&(sc->vcpu[vcpu].vmcb.state));
}
static __inline struct vmcb_ctrl *
svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu)
{
return (&(sc->vcpu[vcpu].vmcb.ctrl));
}
static __inline struct svm_regctx *
svm_get_guest_regctx(struct svm_softc *sc, int vcpu)
{
return (&(sc->vcpu[vcpu].swctx));
}
static __inline void
svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits)
{
struct svm_vcpu *vcpustate;
vcpustate = svm_get_vcpu(sc, vcpu);
vcpustate->dirty |= dirtybits;
}
#endif /* _SVM_SOFTC_H_ */

121
vmm/amd/svm_support.S Normal file
View file

@ -0,0 +1,121 @@
/*-
* Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <machine/asmacros.h>
#include "svm_assym.h"
/*
* Be friendly to DTrace FBT's prologue/epilogue pattern matching.
*
* These macros also save and restore the host %rbp across VMRUN.
*/
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
#define VMLOAD .byte 0x0f, 0x01, 0xda
#define VMRUN .byte 0x0f, 0x01, 0xd8
#define VMSAVE .byte 0x0f, 0x01, 0xdb
/*
* svm_launch(uint64_t vmcb, struct svm_regctx *gctx)
* %rdi: physical address of VMCB
* %rsi: pointer to guest context
*/
ENTRY(svm_launch)
VENTER
/*
* Host register state saved across a VMRUN.
*
* All "callee saved registers" except:
* %rsp: because it is preserved by the processor across VMRUN.
* %rbp: because it is saved/restored by the function prologue/epilogue.
*/
push %rbx
push %r12
push %r13
push %r14
push %r15
/* Save the physical address of the VMCB in %rax */
movq %rdi, %rax
push %rsi /* push guest context pointer on the stack */
/*
* Restore guest state.
*/
movq SCTX_R8(%rsi), %r8
movq SCTX_R9(%rsi), %r9
movq SCTX_R10(%rsi), %r10
movq SCTX_R11(%rsi), %r11
movq SCTX_R12(%rsi), %r12
movq SCTX_R13(%rsi), %r13
movq SCTX_R14(%rsi), %r14
movq SCTX_R15(%rsi), %r15
movq SCTX_RBP(%rsi), %rbp
movq SCTX_RBX(%rsi), %rbx
movq SCTX_RCX(%rsi), %rcx
movq SCTX_RDX(%rsi), %rdx
movq SCTX_RDI(%rsi), %rdi
movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */
VMLOAD
VMRUN
VMSAVE
pop %rax /* pop guest context pointer from the stack */
/*
* Save guest state.
*/
movq %r8, SCTX_R8(%rax)
movq %r9, SCTX_R9(%rax)
movq %r10, SCTX_R10(%rax)
movq %r11, SCTX_R11(%rax)
movq %r12, SCTX_R12(%rax)
movq %r13, SCTX_R13(%rax)
movq %r14, SCTX_R14(%rax)
movq %r15, SCTX_R15(%rax)
movq %rbp, SCTX_RBP(%rax)
movq %rbx, SCTX_RBX(%rax)
movq %rcx, SCTX_RCX(%rax)
movq %rdx, SCTX_RDX(%rax)
movq %rdi, SCTX_RDI(%rax)
movq %rsi, SCTX_RSI(%rax)
/* Restore host state */
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
VLEAVE
ret
END(svm_launch)

442
vmm/amd/vmcb.c Normal file
View file

@ -0,0 +1,442 @@
/*-
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmcb.h"
#include "svm.h"
#include "svm_softc.h"
/*
* The VMCB aka Virtual Machine Control Block is a 4KB aligned page
* in memory that describes the virtual machine.
*
* The VMCB contains:
* - instructions or events in the guest to intercept
* - control bits that modify execution environment of the guest
* - guest processor state (e.g. general purpose registers)
*/
/*
* Return VMCB segment area.
*/
static struct vmcb_segment *
vmcb_segptr(struct vmcb *vmcb, int type)
{
struct vmcb_state *state;
struct vmcb_segment *seg;
state = &vmcb->state;
switch (type) {
case VM_REG_GUEST_CS:
seg = &state->cs;
break;
case VM_REG_GUEST_DS:
seg = &state->ds;
break;
case VM_REG_GUEST_ES:
seg = &state->es;
break;
case VM_REG_GUEST_FS:
seg = &state->fs;
break;
case VM_REG_GUEST_GS:
seg = &state->gs;
break;
case VM_REG_GUEST_SS:
seg = &state->ss;
break;
case VM_REG_GUEST_GDTR:
seg = &state->gdt;
break;
case VM_REG_GUEST_IDTR:
seg = &state->idt;
break;
case VM_REG_GUEST_LDTR:
seg = &state->ldt;
break;
case VM_REG_GUEST_TR:
seg = &state->tr;
break;
default:
seg = NULL;
break;
}
return (seg);
}
static int
vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident,
uint64_t *val)
{
struct vmcb *vmcb;
int off, bytes;
char *ptr;
vmcb = svm_get_vmcb(softc, vcpu);
off = VMCB_ACCESS_OFFSET(ident);
bytes = VMCB_ACCESS_BYTES(ident);
if ((off + bytes) >= sizeof (struct vmcb))
return (EINVAL);
ptr = (char *)vmcb;
if (!write)
*val = 0;
switch (bytes) {
case 8:
case 4:
case 2:
if (write)
memcpy(ptr + off, val, bytes);
else
memcpy(val, ptr + off, bytes);
break;
default:
VCPU_CTR1(softc->vm, vcpu,
"Invalid size %d for VMCB access", bytes);
return (EINVAL);
}
/* Invalidate all VMCB state cached by h/w. */
if (write)
svm_set_dirty(softc, vcpu, 0xffffffff);
return (0);
}
/*
* Read from segment selector, control and general purpose register of VMCB.
*/
int
vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval)
{
struct vmcb *vmcb;
struct vmcb_state *state;
struct vmcb_segment *seg;
int err;
vmcb = svm_get_vmcb(sc, vcpu);
state = &vmcb->state;
err = 0;
if (VMCB_ACCESS_OK(ident))
return (vmcb_access(sc, vcpu, 0, ident, retval));
switch (ident) {
case VM_REG_GUEST_CR0:
*retval = state->cr0;
break;
case VM_REG_GUEST_CR2:
*retval = state->cr2;
break;
case VM_REG_GUEST_CR3:
*retval = state->cr3;
break;
case VM_REG_GUEST_CR4:
*retval = state->cr4;
break;
case VM_REG_GUEST_DR7:
*retval = state->dr7;
break;
case VM_REG_GUEST_EFER:
*retval = state->efer;
break;
case VM_REG_GUEST_RAX:
*retval = state->rax;
break;
case VM_REG_GUEST_RFLAGS:
*retval = state->rflags;
break;
case VM_REG_GUEST_RIP:
*retval = state->rip;
break;
case VM_REG_GUEST_RSP:
*retval = state->rsp;
break;
case VM_REG_GUEST_CS:
case VM_REG_GUEST_DS:
case VM_REG_GUEST_ES:
case VM_REG_GUEST_FS:
case VM_REG_GUEST_GS:
case VM_REG_GUEST_SS:
case VM_REG_GUEST_LDTR:
case VM_REG_GUEST_TR:
seg = vmcb_segptr(vmcb, ident);
KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
__func__, ident));
*retval = seg->selector;
break;
case VM_REG_GUEST_GDTR:
case VM_REG_GUEST_IDTR:
/* GDTR and IDTR don't have segment selectors */
err = EINVAL;
break;
default:
err = EINVAL;
break;
}
return (err);
}
/*
* Write to segment selector, control and general purpose register of VMCB.
*/
int
vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val)
{
struct vmcb *vmcb;
struct vmcb_state *state;
struct vmcb_segment *seg;
int err, dirtyseg;
vmcb = svm_get_vmcb(sc, vcpu);
state = &vmcb->state;
dirtyseg = 0;
err = 0;
if (VMCB_ACCESS_OK(ident))
return (vmcb_access(sc, vcpu, 1, ident, &val));
switch (ident) {
case VM_REG_GUEST_CR0:
state->cr0 = val;
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
break;
case VM_REG_GUEST_CR2:
state->cr2 = val;
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2);
break;
case VM_REG_GUEST_CR3:
state->cr3 = val;
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
break;
case VM_REG_GUEST_CR4:
state->cr4 = val;
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
break;
case VM_REG_GUEST_DR7:
state->dr7 = val;
break;
case VM_REG_GUEST_EFER:
/* EFER_SVM must always be set when the guest is executing */
state->efer = val | EFER_SVM;
svm_set_dirty(sc, vcpu, VMCB_CACHE_CR);
break;
case VM_REG_GUEST_RAX:
state->rax = val;
break;
case VM_REG_GUEST_RFLAGS:
state->rflags = val;
break;
case VM_REG_GUEST_RIP:
state->rip = val;
break;
case VM_REG_GUEST_RSP:
state->rsp = val;
break;
case VM_REG_GUEST_CS:
case VM_REG_GUEST_DS:
case VM_REG_GUEST_ES:
case VM_REG_GUEST_SS:
dirtyseg = 1; /* FALLTHROUGH */
case VM_REG_GUEST_FS:
case VM_REG_GUEST_GS:
case VM_REG_GUEST_LDTR:
case VM_REG_GUEST_TR:
seg = vmcb_segptr(vmcb, ident);
KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB",
__func__, ident));
seg->selector = val;
if (dirtyseg)
svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
break;
case VM_REG_GUEST_GDTR:
case VM_REG_GUEST_IDTR:
/* GDTR and IDTR don't have segment selectors */
err = EINVAL;
break;
default:
err = EINVAL;
break;
}
return (err);
}
int
vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2)
{
struct vmcb_segment *seg;
seg = vmcb_segptr(vmcb, ident);
if (seg != NULL) {
bcopy(seg, seg2, sizeof(struct vmcb_segment));
return (0);
} else {
return (EINVAL);
}
}
int
vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
struct vmcb *vmcb;
struct svm_softc *sc;
struct vmcb_segment *seg;
uint16_t attrib;
sc = arg;
vmcb = svm_get_vmcb(sc, vcpu);
seg = vmcb_segptr(vmcb, reg);
KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
__func__, reg));
seg->base = desc->base;
seg->limit = desc->limit;
if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) {
/*
* Map seg_desc access to VMCB attribute format.
*
* SVM uses the 'P' bit in the segment attributes to indicate a
* NULL segment so clear it if the segment is marked unusable.
*/
attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF);
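/*
 * Illustrative example (not from the original source): a typical
 * 64-bit code segment with access value 0xA09B maps to
 * ((0xA09B & 0xF000) >> 4) | (0xA09B & 0xFF) == 0xA00 | 0x9B == 0xA9B.
 */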
if (SEG_DESC_UNUSABLE(desc->access)) {
attrib &= ~0x80;
}
seg->attrib = attrib;
}
VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), "
"attrib (%#x)", reg, seg->base, seg->limit, seg->attrib);
switch (reg) {
case VM_REG_GUEST_CS:
case VM_REG_GUEST_DS:
case VM_REG_GUEST_ES:
case VM_REG_GUEST_SS:
svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
break;
case VM_REG_GUEST_GDTR:
case VM_REG_GUEST_IDTR:
svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
break;
default:
break;
}
return (0);
}
int
vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
struct vmcb *vmcb;
struct svm_softc *sc;
struct vmcb_segment *seg;
sc = arg;
vmcb = svm_get_vmcb(sc, vcpu);
seg = vmcb_segptr(vmcb, reg);
KASSERT(seg != NULL, ("%s: invalid segment descriptor %d",
__func__, reg));
desc->base = seg->base;
desc->limit = seg->limit;
desc->access = 0;
if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) {
/* Map seg_desc access to VMCB attribute format */
desc->access = ((seg->attrib & 0xF00) << 4) |
(seg->attrib & 0xFF);
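/*
 * Illustrative example (not from the original source): reversing the
 * mapping done in vmcb_setdesc(), a VMCB attrib of 0xA9B yields
 * ((0xA9B & 0xF00) << 4) | (0xA9B & 0xFF) == 0xA000 | 0x9B == 0xA09B.
 */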
/*
* VT-x uses bit 16 to indicate a segment that has been loaded
* with a NULL selector (aka unusable). The 'desc->access'
* field is interpreted in the VT-x format by the
* processor-independent code.
*
* SVM uses the 'P' bit to convey the same information so
* convert it into the VT-x format. For more details refer to
* section "Segment State in the VMCB" in APMv2.
*/
if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) {
if ((desc->access & 0x80) == 0)
desc->access |= 0x10000; /* Unusable segment */
}
}
return (0);
}

334
vmm/amd/vmcb.h Normal file
View file

@ -0,0 +1,334 @@
/*-
* Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMCB_H_
#define _VMCB_H_
struct svm_softc;
#define BIT(n) (1ULL << (n))
/*
* Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15
* Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B
*/
/* vmcb_ctrl->intercept[] array indices */
#define VMCB_CR_INTCPT 0
#define VMCB_DR_INTCPT 1
#define VMCB_EXC_INTCPT 2
#define VMCB_CTRL1_INTCPT 3
#define VMCB_CTRL2_INTCPT 4
/* intercept[VMCB_CTRL1_INTCPT] fields */
#define VMCB_INTCPT_INTR BIT(0)
#define VMCB_INTCPT_NMI BIT(1)
#define VMCB_INTCPT_SMI BIT(2)
#define VMCB_INTCPT_INIT BIT(3)
#define VMCB_INTCPT_VINTR BIT(4)
#define VMCB_INTCPT_CR0_WRITE BIT(5)
#define VMCB_INTCPT_IDTR_READ BIT(6)
#define VMCB_INTCPT_GDTR_READ BIT(7)
#define VMCB_INTCPT_LDTR_READ BIT(8)
#define VMCB_INTCPT_TR_READ BIT(9)
#define VMCB_INTCPT_IDTR_WRITE BIT(10)
#define VMCB_INTCPT_GDTR_WRITE BIT(11)
#define VMCB_INTCPT_LDTR_WRITE BIT(12)
#define VMCB_INTCPT_TR_WRITE BIT(13)
#define VMCB_INTCPT_RDTSC BIT(14)
#define VMCB_INTCPT_RDPMC BIT(15)
#define VMCB_INTCPT_PUSHF BIT(16)
#define VMCB_INTCPT_POPF BIT(17)
#define VMCB_INTCPT_CPUID BIT(18)
#define VMCB_INTCPT_RSM BIT(19)
#define VMCB_INTCPT_IRET BIT(20)
#define VMCB_INTCPT_INTn BIT(21)
#define VMCB_INTCPT_INVD BIT(22)
#define VMCB_INTCPT_PAUSE BIT(23)
#define VMCB_INTCPT_HLT BIT(24)
#define VMCB_INTCPT_INVPG BIT(25)
#define VMCB_INTCPT_INVPGA BIT(26)
#define VMCB_INTCPT_IO BIT(27)
#define VMCB_INTCPT_MSR BIT(28)
#define VMCB_INTCPT_TASK_SWITCH BIT(29)
#define VMCB_INTCPT_FERR_FREEZE BIT(30)
#define VMCB_INTCPT_SHUTDOWN BIT(31)
/* intercept[VMCB_CTRL2_INTCPT] fields */
#define VMCB_INTCPT_VMRUN BIT(0)
#define VMCB_INTCPT_VMMCALL BIT(1)
#define VMCB_INTCPT_VMLOAD BIT(2)
#define VMCB_INTCPT_VMSAVE BIT(3)
#define VMCB_INTCPT_STGI BIT(4)
#define VMCB_INTCPT_CLGI BIT(5)
#define VMCB_INTCPT_SKINIT BIT(6)
#define VMCB_INTCPT_RDTSCP BIT(7)
#define VMCB_INTCPT_ICEBP BIT(8)
#define VMCB_INTCPT_WBINVD BIT(9)
#define VMCB_INTCPT_MONITOR BIT(10)
#define VMCB_INTCPT_MWAIT BIT(11)
#define VMCB_INTCPT_MWAIT_ARMED BIT(12)
#define VMCB_INTCPT_XSETBV BIT(13)
/* VMCB TLB control */
#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */
#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */
#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */
#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */
/* VMCB state caching */
#define VMCB_CACHE_NONE 0 /* No caching */
#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */
#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */
#define VMCB_CACHE_ASID BIT(2) /* ASID */
#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */
#define VMCB_CACHE_NP BIT(4) /* Nested Paging */
#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */
#define VMCB_CACHE_DR BIT(6) /* Debug registers */
#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */
#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */
#define VMCB_CACHE_CR2 BIT(9) /* page fault address */
#define VMCB_CACHE_LBR BIT(10) /* Last branch */
/* VMCB control event injection */
#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */
#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */
/* Event types that can be injected */
#define VMCB_EVENTINJ_TYPE_INTR 0
#define VMCB_EVENTINJ_TYPE_NMI 2
#define VMCB_EVENTINJ_TYPE_EXCEPTION 3
#define VMCB_EVENTINJ_TYPE_INTn 4
/* VMCB exit code, APM vol2 Appendix C */
#define VMCB_EXIT_MC 0x52
#define VMCB_EXIT_INTR 0x60
#define VMCB_EXIT_NMI 0x61
#define VMCB_EXIT_VINTR 0x64
#define VMCB_EXIT_PUSHF 0x70
#define VMCB_EXIT_POPF 0x71
#define VMCB_EXIT_CPUID 0x72
#define VMCB_EXIT_IRET 0x74
#define VMCB_EXIT_PAUSE 0x77
#define VMCB_EXIT_HLT 0x78
#define VMCB_EXIT_IO 0x7B
#define VMCB_EXIT_MSR 0x7C
#define VMCB_EXIT_SHUTDOWN 0x7F
#define VMCB_EXIT_VMSAVE 0x83
#define VMCB_EXIT_MONITOR 0x8A
#define VMCB_EXIT_MWAIT 0x8B
#define VMCB_EXIT_NPF 0x400
#define VMCB_EXIT_INVALID -1
/*
* Nested page fault.
* Bit definitions to decode EXITINFO1.
*/
#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */
#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */
#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */
#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */
#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */
#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */
#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */
/*
* EXITINTINFO, Interrupt exit info for all intercepts.
* Section 15.7.2, Intercepts during IDT Interrupt Delivery.
*/
#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF)
#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7)
#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0)
#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0)
#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF)
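/*
 * Worked example (illustrative): an exitintinfo value of
 * 0x0000000180000b0e decodes as VECTOR == 0x0e (#PF), TYPE == 3
 * (hardware exception), EC_VALID == 1, VALID == 1 and EC == 1.
 */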
/* Offset of various VMCB fields. */
#define VMCB_OFF_CTRL(x) (x)
#define VMCB_OFF_STATE(x) ((x) + 0x400)
#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0)
#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4)
#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8)
#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC)
#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10)
#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40)
#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48)
#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50)
#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58)
#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C)
#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60)
#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70)
#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78)
#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80)
#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88)
#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98)
#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0)
#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0)
#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0)
#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8)
#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228)
#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230)
#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238)
#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268)
/*
* Encode the VMCB offset and bytes that we want to read from VMCB.
*/
#define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \
((o) & 0xFFF))
#define VMCB_ACCESS_OK(v) ((v) & 0x80000000)
#define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF)
#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF)
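/*
 * For example (illustrative): VMCB_ACCESS(VMCB_OFF_TSC_OFFSET, 8)
 * yields 0x80080050, from which VMCB_ACCESS_BYTES() recovers 8 and
 * VMCB_ACCESS_OFFSET() recovers 0x50.
 */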
#ifdef _KERNEL
/* VMCB save state area segment format */
struct vmcb_segment {
uint16_t selector;
uint16_t attrib;
uint32_t limit;
uint64_t base;
} __attribute__ ((__packed__));
CTASSERT(sizeof(struct vmcb_segment) == 16);
/* Code segment descriptor attribute in 12 bit format as saved by VMCB. */
#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */
#define VMCB_CS_ATTRIB_D BIT(10) /* Operand size bit. */
/*
* The VMCB is divided into two areas - the first one contains various
* control bits including the intercept vector and the second one contains
* the guest state.
*/
/* VMCB control area - padded up to 1024 bytes */
struct vmcb_ctrl {
uint32_t intercept[5]; /* all intercepts */
uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */
uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */
uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */
uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */
uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */
uint64_t tsc_offset; /* 0x50: TSC_OFFSET */
uint32_t asid; /* 0x58: Guest ASID */
uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */
uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */
uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */
uint8_t v_irq:1; /* Is virtual interrupt pending? */
uint8_t :7; /* Padding */
uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */
uint8_t v_ign_tpr:1;
uint8_t :3;
uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */
uint8_t :7;
uint8_t v_intr_vector; /* 0x65: Vector for virtual interrupt. */
uint8_t pad3[3]; /* Bits 40-63 reserved. */
uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */
uint64_t :63;
uint64_t exitcode; /* 0x70, Exitcode */
uint64_t exitinfo1; /* 0x78, EXITINFO1 */
uint64_t exitinfo2; /* 0x80, EXITINFO2 */
uint64_t exitintinfo; /* 0x88, Interrupt exit value. */
uint64_t np_enable:1; /* 0x90, Nested paging enable. */
uint64_t :63;
uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */
uint64_t eventinj; /* 0xA8, Event injection. */
uint64_t n_cr3; /* B0, Nested page table. */
uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */
uint64_t :63;
uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */
uint32_t :32; /* 0xC4: Reserved */
uint64_t nrip; /* 0xC8: Guest nRIP (next sequential RIP). */
uint8_t inst_len; /* 0xD0: #NPF decode assist */
uint8_t inst_bytes[15];
uint8_t pad6[0x320];
} __attribute__ ((__packed__));
CTASSERT(sizeof(struct vmcb_ctrl) == 1024);
struct vmcb_state {
struct vmcb_segment es;
struct vmcb_segment cs;
struct vmcb_segment ss;
struct vmcb_segment ds;
struct vmcb_segment fs;
struct vmcb_segment gs;
struct vmcb_segment gdt;
struct vmcb_segment ldt;
struct vmcb_segment idt;
struct vmcb_segment tr;
uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */
uint8_t cpl;
uint8_t pad2[4];
uint64_t efer;
uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */
uint64_t cr4;
uint64_t cr3; /* Guest CR3 */
uint64_t cr0;
uint64_t dr7;
uint64_t dr6;
uint64_t rflags;
uint64_t rip;
uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */
uint64_t rsp;
uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */
uint64_t rax;
uint64_t star;
uint64_t lstar;
uint64_t cstar;
uint64_t sfmask;
uint64_t kernelgsbase;
uint64_t sysenter_cs;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
uint64_t cr2;
uint8_t pad6[0x20];
uint64_t g_pat;
uint64_t dbgctl;
uint64_t br_from;
uint64_t br_to;
uint64_t int_from;
uint64_t int_to;
uint8_t pad7[0x968]; /* Reserved up to end of VMCB */
} __attribute__ ((__packed__));
CTASSERT(sizeof(struct vmcb_state) == 0xC00);
struct vmcb {
struct vmcb_ctrl ctrl;
struct vmcb_state state;
} __attribute__ ((__packed__));
CTASSERT(sizeof(struct vmcb) == PAGE_SIZE);
CTASSERT(offsetof(struct vmcb, state) == 0x400);
int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval);
int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val);
int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc);
int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc);
int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg);
#endif /* _KERNEL */
#endif /* _VMCB_H_ */

205
vmm/intel/ept.c Normal file
View file

@ -0,0 +1,205 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "ept.h"
#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0))
#define EPT_PWL4(cap) ((cap) & (1UL << 6))
#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21))
#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
#define INVEPT_ALL_TYPES_MASK 0x6000000UL
#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
(((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
#define EPT_PWLEVELS 4 /* page walk levels */
#define EPT_ENABLE_AD_BITS (1 << 6)
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL);
static int ept_enable_ad_bits;
static int ept_pmap_flags;
SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD,
&ept_pmap_flags, 0, NULL);
int
ept_init(int ipinum)
{
int use_hw_ad_bits, use_superpages, use_exec_only;
uint64_t cap;
cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
/*
* Verify that:
* - page walk length is 4 steps
* - extended page tables can be laid out in write-back memory
* - invvpid instruction with all possible types is supported
* - invept instruction with all possible types is supported
*/
if (!EPT_PWL4(cap) ||
!EPT_MEMORY_TYPE_WB(cap) ||
!INVVPID_SUPPORTED(cap) ||
!INVVPID_ALL_TYPES_SUPPORTED(cap) ||
!INVEPT_SUPPORTED(cap) ||
!INVEPT_ALL_TYPES_SUPPORTED(cap))
return (EINVAL);
ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK;
use_superpages = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages);
if (use_superpages && EPT_PDE_SUPERPAGE(cap))
ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */
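/*
 * These hw.vmm.ept.* knobs are loader tunables; e.g., setting
 * hw.vmm.ept.use_superpages="0" in loader.conf would disable the
 * 2MB superpage optimization.
 */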
use_hw_ad_bits = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits);
if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
ept_enable_ad_bits = 1;
else
ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
use_exec_only = 1;
TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only);
if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap))
ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY;
return (0);
}
#if 0
static void
ept_dump(uint64_t *ptp, int nlevels)
{
int i, t, tabs;
uint64_t *ptpnext, ptpval;
if (--nlevels < 0)
return;
tabs = 3 - nlevels;
for (t = 0; t < tabs; t++)
printf("\t");
printf("PTP = %p\n", ptp);
for (i = 0; i < 512; i++) {
ptpval = ptp[i];
if (ptpval == 0)
continue;
for (t = 0; t < tabs; t++)
printf("\t");
printf("%3d 0x%016lx\n", i, ptpval);
if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
ptpnext = (uint64_t *)
PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
ept_dump(ptpnext, nlevels);
}
}
}
#endif
static void
invept_single_context(void *arg)
{
struct invept_desc desc = *(struct invept_desc *)arg;
invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
}
void
ept_invalidate_mappings(u_long eptp)
{
struct invept_desc invept_desc = { 0 };
invept_desc.eptp = eptp;
smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
}
static int
ept_pinit(pmap_t pmap)
{
return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags));
}
struct vmspace *
ept_vmspace_alloc(vm_offset_t min, vm_offset_t max)
{
return (vmspace_alloc(min, max, ept_pinit));
}
void
ept_vmspace_free(struct vmspace *vmspace)
{
vmspace_free(vmspace);
}
uint64_t
eptp(uint64_t pml4)
{
uint64_t eptp_val;
eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;
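/*
 * Illustrative arithmetic: with a 4-level walk and write-back memory
 * type (PAT_WRITE_BACK == 6), the low bits are (4 - 1) << 3 | 6 == 0x1e,
 * so eptp_val == pml4 | 0x1e (plus bit 6 if A/D bits are enabled below).
 */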
if (ept_enable_ad_bits)
eptp_val |= EPT_ENABLE_AD_BITS;
return (eptp_val);
}

39
vmm/intel/ept.h Normal file
View file

@ -0,0 +1,39 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _EPT_H_
#define _EPT_H_
struct vmx;
int ept_init(int ipinum);
void ept_invalidate_mappings(u_long eptp);
struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max);
void ept_vmspace_free(struct vmspace *vmspace);
uint64_t eptp(uint64_t pml4);
#endif

503
vmm/intel/vmcs.c Normal file
View file

@ -0,0 +1,503 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "opt_ddb.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pcpu.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/segments.h>
#include <machine/vmm.h>
#include "vmm_host.h"
#include "vmx_cpufunc.h"
#include "vmcs.h"
#include "ept.h"
#include "vmx.h"
#ifdef DDB
#include <ddb/ddb.h>
#endif
static uint64_t
vmcs_fix_regval(uint32_t encoding, uint64_t val)
{
switch (encoding) {
case VMCS_GUEST_CR0:
val = vmx_fix_cr0(val);
break;
case VMCS_GUEST_CR4:
val = vmx_fix_cr4(val);
break;
default:
break;
}
return (val);
}
static uint32_t
vmcs_field_encoding(int ident)
{
switch (ident) {
case VM_REG_GUEST_CR0:
return (VMCS_GUEST_CR0);
case VM_REG_GUEST_CR3:
return (VMCS_GUEST_CR3);
case VM_REG_GUEST_CR4:
return (VMCS_GUEST_CR4);
case VM_REG_GUEST_DR7:
return (VMCS_GUEST_DR7);
case VM_REG_GUEST_RSP:
return (VMCS_GUEST_RSP);
case VM_REG_GUEST_RIP:
return (VMCS_GUEST_RIP);
case VM_REG_GUEST_RFLAGS:
return (VMCS_GUEST_RFLAGS);
case VM_REG_GUEST_ES:
return (VMCS_GUEST_ES_SELECTOR);
case VM_REG_GUEST_CS:
return (VMCS_GUEST_CS_SELECTOR);
case VM_REG_GUEST_SS:
return (VMCS_GUEST_SS_SELECTOR);
case VM_REG_GUEST_DS:
return (VMCS_GUEST_DS_SELECTOR);
case VM_REG_GUEST_FS:
return (VMCS_GUEST_FS_SELECTOR);
case VM_REG_GUEST_GS:
return (VMCS_GUEST_GS_SELECTOR);
case VM_REG_GUEST_TR:
return (VMCS_GUEST_TR_SELECTOR);
case VM_REG_GUEST_LDTR:
return (VMCS_GUEST_LDTR_SELECTOR);
case VM_REG_GUEST_EFER:
return (VMCS_GUEST_IA32_EFER);
case VM_REG_GUEST_PDPTE0:
return (VMCS_GUEST_PDPTE0);
case VM_REG_GUEST_PDPTE1:
return (VMCS_GUEST_PDPTE1);
case VM_REG_GUEST_PDPTE2:
return (VMCS_GUEST_PDPTE2);
case VM_REG_GUEST_PDPTE3:
return (VMCS_GUEST_PDPTE3);
default:
return (-1);
}
}
static int
vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
{
switch (seg) {
case VM_REG_GUEST_ES:
*base = VMCS_GUEST_ES_BASE;
*lim = VMCS_GUEST_ES_LIMIT;
*acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_CS:
*base = VMCS_GUEST_CS_BASE;
*lim = VMCS_GUEST_CS_LIMIT;
*acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_SS:
*base = VMCS_GUEST_SS_BASE;
*lim = VMCS_GUEST_SS_LIMIT;
*acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_DS:
*base = VMCS_GUEST_DS_BASE;
*lim = VMCS_GUEST_DS_LIMIT;
*acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_FS:
*base = VMCS_GUEST_FS_BASE;
*lim = VMCS_GUEST_FS_LIMIT;
*acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_GS:
*base = VMCS_GUEST_GS_BASE;
*lim = VMCS_GUEST_GS_LIMIT;
*acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_TR:
*base = VMCS_GUEST_TR_BASE;
*lim = VMCS_GUEST_TR_LIMIT;
*acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_LDTR:
*base = VMCS_GUEST_LDTR_BASE;
*lim = VMCS_GUEST_LDTR_LIMIT;
*acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
break;
case VM_REG_GUEST_IDTR:
*base = VMCS_GUEST_IDTR_BASE;
*lim = VMCS_GUEST_IDTR_LIMIT;
*acc = VMCS_INVALID_ENCODING;
break;
case VM_REG_GUEST_GDTR:
*base = VMCS_GUEST_GDTR_BASE;
*lim = VMCS_GUEST_GDTR_LIMIT;
*acc = VMCS_INVALID_ENCODING;
break;
default:
return (EINVAL);
}
return (0);
}
int
vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval)
{
int error;
uint32_t encoding;
/*
* If we need to get at vmx-specific state in the VMCS we can bypass
* the translation of 'ident' to 'encoding' by simply setting the
* sign bit. As it so happens the upper 16 bits are reserved (i.e.,
* set to 0) in the encodings for the VMCS so we are free to use the
* sign bit.
*/
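/*
 * For example (illustrative): VMCS_IDENT(VMCS_GUEST_ACTIVITY) is
 * 0x80004826, which is negative as an 'int', so the mask below
 * recovers the raw encoding 0x4826.
 */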
if (ident < 0)
encoding = ident & 0x7fffffff;
else
encoding = vmcs_field_encoding(ident);
if (encoding == (uint32_t)-1)
return (EINVAL);
if (!running)
VMPTRLD(vmcs);
error = vmread(encoding, retval);
if (!running)
VMCLEAR(vmcs);
return (error);
}
int
vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val)
{
int error;
uint32_t encoding;
if (ident < 0)
encoding = ident & 0x7fffffff;
else
encoding = vmcs_field_encoding(ident);
if (encoding == (uint32_t)-1)
return (EINVAL);
val = vmcs_fix_regval(encoding, val);
if (!running)
VMPTRLD(vmcs);
error = vmwrite(encoding, val);
if (!running)
VMCLEAR(vmcs);
return (error);
}
int
vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
{
int error;
uint32_t base, limit, access;
error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
if (error != 0)
panic("vmcs_setdesc: invalid segment register %d", seg);
if (!running)
VMPTRLD(vmcs);
if ((error = vmwrite(base, desc->base)) != 0)
goto done;
if ((error = vmwrite(limit, desc->limit)) != 0)
goto done;
if (access != VMCS_INVALID_ENCODING) {
if ((error = vmwrite(access, desc->access)) != 0)
goto done;
}
done:
if (!running)
VMCLEAR(vmcs);
return (error);
}
int
vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
{
int error;
uint32_t base, limit, access;
uint64_t u64;
error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
if (error != 0)
panic("vmcs_getdesc: invalid segment register %d", seg);
if (!running)
VMPTRLD(vmcs);
if ((error = vmread(base, &u64)) != 0)
goto done;
desc->base = u64;
if ((error = vmread(limit, &u64)) != 0)
goto done;
desc->limit = u64;
if (access != VMCS_INVALID_ENCODING) {
if ((error = vmread(access, &u64)) != 0)
goto done;
desc->access = u64;
}
done:
if (!running)
VMCLEAR(vmcs);
return (error);
}
int
vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
{
int error;
VMPTRLD(vmcs);
/*
* Guest MSRs are saved in the VM-exit MSR-store area.
* Guest MSRs are loaded from the VM-entry MSR-load area.
* Both areas point to the same location in memory.
*/
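/*
 * 'g_area' is expected to be the physical address of an array of
 * 'struct msr_entry' (see vmcs.h) and 'g_count' the number of entries;
 * a hypothetical caller would allocate and wire that array before
 * calling this routine.
 */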
if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
goto done;
if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
goto done;
if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
goto done;
if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
goto done;
error = 0;
done:
VMCLEAR(vmcs);
return (error);
}
int
vmcs_init(struct vmcs *vmcs)
{
int error, codesel, datasel, tsssel;
u_long cr0, cr4, efer;
uint64_t pat, fsbase, idtrbase;
codesel = vmm_get_host_codesel();
datasel = vmm_get_host_datasel();
tsssel = vmm_get_host_tsssel();
/*
* Make sure we have a "current" VMCS to work with.
*/
VMPTRLD(vmcs);
/* Host state */
/* Initialize host IA32_PAT MSR */
pat = vmm_get_host_pat();
if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
goto done;
/* Load the IA32_EFER MSR */
efer = vmm_get_host_efer();
if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
goto done;
/* Load the control registers */
cr0 = vmm_get_host_cr0();
if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
goto done;
cr4 = vmm_get_host_cr4() | CR4_VMXE;
if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
goto done;
/* Load the segment selectors */
if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
goto done;
if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
goto done;
/*
* Load the Base-Address for %fs and idtr.
*
* Note that we exclude %gs, tss and gdtr here because their base
* address is pcpu specific.
*/
fsbase = vmm_get_host_fsbase();
if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
goto done;
idtrbase = vmm_get_host_idtrbase();
if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
goto done;
/* instruction pointer */
if ((error = vmwrite(VMCS_HOST_RIP, (u_long)vmx_exit_guest)) != 0)
goto done;
/* link pointer */
if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
goto done;
done:
VMCLEAR(vmcs);
return (error);
}
#ifdef DDB
extern int vmxon_enabled[];
DB_SHOW_COMMAND(vmcs, db_show_vmcs)
{
uint64_t cur_vmcs, val;
uint32_t exit;
if (!vmxon_enabled[curcpu]) {
db_printf("VMX not enabled\n");
return;
}
if (have_addr) {
db_printf("Only current VMCS supported\n");
return;
}
vmptrst(&cur_vmcs);
if (cur_vmcs == VMCS_INITIAL) {
db_printf("No current VM context\n");
return;
}
db_printf("VMCS: %jx\n", cur_vmcs);
db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
db_printf("Activity: ");
val = vmcs_read(VMCS_GUEST_ACTIVITY);
switch (val) {
case 0:
db_printf("Active");
break;
case 1:
db_printf("HLT");
break;
case 2:
db_printf("Shutdown");
break;
case 3:
db_printf("Wait for SIPI");
break;
default:
db_printf("Unknown: %#lx", val);
}
db_printf("\n");
exit = vmcs_read(VMCS_EXIT_REASON);
if (exit & 0x80000000)
db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
else
db_printf("Exit Reason: %u\n", exit & 0xffff);
db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
db_printf("Guest Linear Address: %#lx\n",
vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
switch (exit & 0x8000ffff) {
case EXIT_REASON_EXCEPTION:
case EXIT_REASON_EXT_INTR:
val = vmcs_read(VMCS_EXIT_INTR_INFO);
db_printf("Interrupt Type: ");
switch (val >> 8 & 0x7) {
case 0:
db_printf("external");
break;
case 2:
db_printf("NMI");
break;
case 3:
db_printf("HW exception");
break;
case 4:
db_printf("SW exception");
break;
default:
db_printf("?? %lu", val >> 8 & 0x7);
break;
}
db_printf(" Vector: %lu", val & 0xff);
if (val & 0x800)
db_printf(" Error Code: %lx",
vmcs_read(VMCS_EXIT_INTR_ERRCODE));
db_printf("\n");
break;
case EXIT_REASON_EPT_FAULT:
case EXIT_REASON_EPT_MISCONFIG:
db_printf("Guest Physical Address: %#lx\n",
vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
break;
}
db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
}
#endif

401
vmm/intel/vmcs.h Normal file
View file

@ -0,0 +1,401 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMCS_H_
#define _VMCS_H_
#ifdef _KERNEL
struct vmcs {
uint32_t identifier;
uint32_t abort_code;
char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
};
CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
/* MSR save region is composed of an array of 'struct msr_entry' */
struct msr_entry {
uint32_t index;
uint32_t reserved;
uint64_t val;
};
int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
int vmcs_init(struct vmcs *vmcs);
int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv);
int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val);
int vmcs_getdesc(struct vmcs *vmcs, int running, int ident,
struct seg_desc *desc);
int vmcs_setdesc(struct vmcs *vmcs, int running, int ident,
struct seg_desc *desc);
/*
* Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h
*/
#ifdef _VMX_CPUFUNC_H_
static __inline uint64_t
vmcs_read(uint32_t encoding)
{
int error;
uint64_t val;
error = vmread(encoding, &val);
KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error));
return (val);
}
static __inline void
vmcs_write(uint32_t encoding, uint64_t val)
{
int error;
error = vmwrite(encoding, val);
KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error));
}
#endif /* _VMX_CPUFUNC_H_ */
#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO)
#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR)
#endif /* _KERNEL */
#define VMCS_INITIAL 0xffffffffffffffff
#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
/*
* VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
*/
#define VMCS_INVALID_ENCODING 0xffffffff
/* 16-bit control fields */
#define VMCS_VPID 0x00000000
#define VMCS_PIR_VECTOR 0x00000002
/* 16-bit guest-state fields */
#define VMCS_GUEST_ES_SELECTOR 0x00000800
#define VMCS_GUEST_CS_SELECTOR 0x00000802
#define VMCS_GUEST_SS_SELECTOR 0x00000804
#define VMCS_GUEST_DS_SELECTOR 0x00000806
#define VMCS_GUEST_FS_SELECTOR 0x00000808
#define VMCS_GUEST_GS_SELECTOR 0x0000080A
#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
#define VMCS_GUEST_TR_SELECTOR 0x0000080E
#define VMCS_GUEST_INTR_STATUS 0x00000810
/* 16-bit host-state fields */
#define VMCS_HOST_ES_SELECTOR 0x00000C00
#define VMCS_HOST_CS_SELECTOR 0x00000C02
#define VMCS_HOST_SS_SELECTOR 0x00000C04
#define VMCS_HOST_DS_SELECTOR 0x00000C06
#define VMCS_HOST_FS_SELECTOR 0x00000C08
#define VMCS_HOST_GS_SELECTOR 0x00000C0A
#define VMCS_HOST_TR_SELECTOR 0x00000C0C
/* 64-bit control fields */
#define VMCS_IO_BITMAP_A 0x00002000
#define VMCS_IO_BITMAP_B 0x00002002
#define VMCS_MSR_BITMAP 0x00002004
#define VMCS_EXIT_MSR_STORE 0x00002006
#define VMCS_EXIT_MSR_LOAD 0x00002008
#define VMCS_ENTRY_MSR_LOAD 0x0000200A
#define VMCS_EXECUTIVE_VMCS 0x0000200C
#define VMCS_TSC_OFFSET 0x00002010
#define VMCS_VIRTUAL_APIC 0x00002012
#define VMCS_APIC_ACCESS 0x00002014
#define VMCS_PIR_DESC 0x00002016
#define VMCS_EPTP 0x0000201A
#define VMCS_EOI_EXIT0 0x0000201C
#define VMCS_EOI_EXIT1 0x0000201E
#define VMCS_EOI_EXIT2 0x00002020
#define VMCS_EOI_EXIT3 0x00002022
#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2)
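/*
 * Worked example (illustrative): VMCS_EOI_EXIT(77) selects
 * VMCS_EOI_EXIT0 + (77 / 64) * 2 == 0x201c + 2 == 0x201e, i.e. the
 * VMCS_EOI_EXIT1 field covering vectors 64-127.
 */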
/* 64-bit read-only fields */
#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
/* 64-bit guest-state fields */
#define VMCS_LINK_POINTER 0x00002800
#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
#define VMCS_GUEST_IA32_PAT 0x00002804
#define VMCS_GUEST_IA32_EFER 0x00002806
#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
#define VMCS_GUEST_PDPTE0 0x0000280A
#define VMCS_GUEST_PDPTE1 0x0000280C
#define VMCS_GUEST_PDPTE2 0x0000280E
#define VMCS_GUEST_PDPTE3 0x00002810
/* 64-bit host-state fields */
#define VMCS_HOST_IA32_PAT 0x00002C00
#define VMCS_HOST_IA32_EFER 0x00002C02
#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
/* 32-bit control fields */
#define VMCS_PIN_BASED_CTLS 0x00004000
#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
#define VMCS_EXCEPTION_BITMAP 0x00004004
#define VMCS_PF_ERROR_MASK 0x00004006
#define VMCS_PF_ERROR_MATCH 0x00004008
#define VMCS_CR3_TARGET_COUNT 0x0000400A
#define VMCS_EXIT_CTLS 0x0000400C
#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
#define VMCS_ENTRY_CTLS 0x00004012
#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
#define VMCS_ENTRY_INTR_INFO 0x00004016
#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
#define VMCS_ENTRY_INST_LENGTH 0x0000401A
#define VMCS_TPR_THRESHOLD 0x0000401C
#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
#define VMCS_PLE_GAP 0x00004020
#define VMCS_PLE_WINDOW 0x00004022
/* 32-bit read-only data fields */
#define VMCS_INSTRUCTION_ERROR 0x00004400
#define VMCS_EXIT_REASON 0x00004402
#define VMCS_EXIT_INTR_INFO 0x00004404
#define VMCS_EXIT_INTR_ERRCODE 0x00004406
#define VMCS_IDT_VECTORING_INFO 0x00004408
#define VMCS_IDT_VECTORING_ERROR 0x0000440A
#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
/* 32-bit guest-state fields */
#define VMCS_GUEST_ES_LIMIT 0x00004800
#define VMCS_GUEST_CS_LIMIT 0x00004802
#define VMCS_GUEST_SS_LIMIT 0x00004804
#define VMCS_GUEST_DS_LIMIT 0x00004806
#define VMCS_GUEST_FS_LIMIT 0x00004808
#define VMCS_GUEST_GS_LIMIT 0x0000480A
#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
#define VMCS_GUEST_TR_LIMIT 0x0000480E
#define VMCS_GUEST_GDTR_LIMIT 0x00004810
#define VMCS_GUEST_IDTR_LIMIT 0x00004812
#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
#define VMCS_GUEST_ACTIVITY 0x00004826
#define VMCS_GUEST_SMBASE 0x00004828
#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
/* 32-bit host state fields */
#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
/* Natural Width control fields */
#define VMCS_CR0_MASK 0x00006000
#define VMCS_CR4_MASK 0x00006002
#define VMCS_CR0_SHADOW 0x00006004
#define VMCS_CR4_SHADOW 0x00006006
#define VMCS_CR3_TARGET0 0x00006008
#define VMCS_CR3_TARGET1 0x0000600A
#define VMCS_CR3_TARGET2 0x0000600C
#define VMCS_CR3_TARGET3 0x0000600E
/* Natural Width read-only fields */
#define VMCS_EXIT_QUALIFICATION 0x00006400
#define VMCS_IO_RCX 0x00006402
#define VMCS_IO_RSI 0x00006404
#define VMCS_IO_RDI 0x00006406
#define VMCS_IO_RIP 0x00006408
#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
/* Natural Width guest-state fields */
#define VMCS_GUEST_CR0 0x00006800
#define VMCS_GUEST_CR3 0x00006802
#define VMCS_GUEST_CR4 0x00006804
#define VMCS_GUEST_ES_BASE 0x00006806
#define VMCS_GUEST_CS_BASE 0x00006808
#define VMCS_GUEST_SS_BASE 0x0000680A
#define VMCS_GUEST_DS_BASE 0x0000680C
#define VMCS_GUEST_FS_BASE 0x0000680E
#define VMCS_GUEST_GS_BASE 0x00006810
#define VMCS_GUEST_LDTR_BASE 0x00006812
#define VMCS_GUEST_TR_BASE 0x00006814
#define VMCS_GUEST_GDTR_BASE 0x00006816
#define VMCS_GUEST_IDTR_BASE 0x00006818
#define VMCS_GUEST_DR7 0x0000681A
#define VMCS_GUEST_RSP 0x0000681C
#define VMCS_GUEST_RIP 0x0000681E
#define VMCS_GUEST_RFLAGS 0x00006820
#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
/* Natural Width host-state fields */
#define VMCS_HOST_CR0 0x00006C00
#define VMCS_HOST_CR3 0x00006C02
#define VMCS_HOST_CR4 0x00006C04
#define VMCS_HOST_FS_BASE 0x00006C06
#define VMCS_HOST_GS_BASE 0x00006C08
#define VMCS_HOST_TR_BASE 0x00006C0A
#define VMCS_HOST_GDTR_BASE 0x00006C0C
#define VMCS_HOST_IDTR_BASE 0x00006C0E
#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
#define VMCS_HOST_RSP 0x00006C14
#define VMCS_HOST_RIP 0x00006c16
/*
* VM instruction error numbers
*/
#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
/*
* VMCS exit reasons
*/
#define EXIT_REASON_EXCEPTION 0
#define EXIT_REASON_EXT_INTR 1
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT 3
#define EXIT_REASON_SIPI 4
#define EXIT_REASON_IO_SMI 5
#define EXIT_REASON_SMI 6
#define EXIT_REASON_INTR_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_GETSEC 11
#define EXIT_REASON_HLT 12
#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
#define EXIT_REASON_RSM 17
#define EXIT_REASON_VMCALL 18
#define EXIT_REASON_VMCLEAR 19
#define EXIT_REASON_VMLAUNCH 20
#define EXIT_REASON_VMPTRLD 21
#define EXIT_REASON_VMPTRST 22
#define EXIT_REASON_VMREAD 23
#define EXIT_REASON_VMRESUME 24
#define EXIT_REASON_VMWRITE 25
#define EXIT_REASON_VMXOFF 26
#define EXIT_REASON_VMXON 27
#define EXIT_REASON_CR_ACCESS 28
#define EXIT_REASON_DR_ACCESS 29
#define EXIT_REASON_INOUT 30
#define EXIT_REASON_RDMSR 31
#define EXIT_REASON_WRMSR 32
#define EXIT_REASON_INVAL_VMCS 33
#define EXIT_REASON_INVAL_MSR 34
#define EXIT_REASON_MWAIT 36
#define EXIT_REASON_MTF 37
#define EXIT_REASON_MONITOR 39
#define EXIT_REASON_PAUSE 40
#define EXIT_REASON_MCE_DURING_ENTRY 41
#define EXIT_REASON_TPR 43
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_VIRTUALIZED_EOI 45
#define EXIT_REASON_GDTR_IDTR 46
#define EXIT_REASON_LDTR_TR 47
#define EXIT_REASON_EPT_FAULT 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_RDTSCP 51
#define EXIT_REASON_VMX_PREEMPT 52
#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
/*
* NMI unblocking due to IRET.
*
* Applies to VM-exits due to hardware exception or EPT fault.
*/
#define EXIT_QUAL_NMIUDTI (1 << 12)
/*
* VMCS interrupt information fields
*/
#define VMCS_INTR_VALID (1U << 31)
#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */
#define VMCS_INTR_T_HWINTR (0 << 8)
#define VMCS_INTR_T_NMI (2 << 8)
#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
#define VMCS_INTR_T_SWINTR (4 << 8)
#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
#define VMCS_INTR_T_SWEXCEPTION (6 << 8)
#define VMCS_INTR_DEL_ERRCODE (1 << 11)
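/*
 * Composition example (editorial note): injecting #GP (vector 13) with an
 * error code would encode the VM-entry interruption information as
 * (13 | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_DEL_ERRCODE | VMCS_INTR_VALID).
 */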
/*
* VMCS IDT-Vectoring information fields
*/
#define VMCS_IDT_VEC_VALID (1U << 31)
#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11)
/*
* VMCS Guest interruptibility field
*/
#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
/*
* Exit qualification for EXIT_REASON_INVAL_VMCS
*/
#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
/*
* Exit qualification for EPT violation
*/
#define EPT_VIOLATION_DATA_READ (1UL << 0)
#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
#define EPT_VIOLATION_INST_FETCH (1UL << 2)
#define EPT_VIOLATION_GPA_READABLE (1UL << 3)
#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4)
#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5)
#define EPT_VIOLATION_GLA_VALID (1UL << 7)
#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
/*
* Exit qualification for APIC-access VM exit
*/
#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF)
#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF)
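/*
 * Decode example (editorial note): a qualification of 0x1080 gives
 * APIC_ACCESS_OFFSET() == 0x080 (the TPR offset in the APIC page) and
 * APIC_ACCESS_TYPE() == 1, which the SDM defines as a linear write
 * during instruction execution.
 */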
/*
* Exit qualification for APIC-write VM exit
*/
#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF)
#endif

3416
vmm/intel/vmx.c Normal file

File diff suppressed because it is too large

140
vmm/intel/vmx.h Normal file

@@ -0,0 +1,140 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_H_
#define _VMX_H_
#include "vmcs.h"
struct pmap;
struct vmxctx {
register_t guest_rdi; /* Guest state */
register_t guest_rsi;
register_t guest_rdx;
register_t guest_rcx;
register_t guest_r8;
register_t guest_r9;
register_t guest_rax;
register_t guest_rbx;
register_t guest_rbp;
register_t guest_r10;
register_t guest_r11;
register_t guest_r12;
register_t guest_r13;
register_t guest_r14;
register_t guest_r15;
register_t guest_cr2;
register_t host_r15; /* Host state */
register_t host_r14;
register_t host_r13;
register_t host_r12;
register_t host_rbp;
register_t host_rsp;
register_t host_rbx;
/*
* XXX todo debug registers and fpu state
*/
int inst_fail_status;
/*
* The pmap needs to be deactivated in vmx_enter_guest()
* so keep a copy of the 'pmap' in each vmxctx.
*/
struct pmap *pmap;
};
struct vmxcap {
int set;
uint32_t proc_ctls;
uint32_t proc_ctls2;
};
struct vmxstate {
uint64_t nextrip; /* next instruction to be executed by guest */
int lastcpu; /* host cpu that this 'vcpu' last ran on */
uint16_t vpid;
};
struct apic_page {
uint32_t reg[PAGE_SIZE / 4];
};
CTASSERT(sizeof(struct apic_page) == PAGE_SIZE);
/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */
struct pir_desc {
uint64_t pir[4];
uint64_t pending;
uint64_t unused[3];
} __aligned(64);
CTASSERT(sizeof(struct pir_desc) == 64);
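/*
 * Illustrative sketch (editorial note, not part of this checkin): a sender
 * posts vector 'vec' by atomically setting bit (vec % 64) in the matching
 * pir[] word and then marking the descriptor pending, e.g.:
 *
 *	atomic_set_long(&pir_desc->pir[vec / 64], 1UL << (vec % 64));
 *	atomic_store_rel_long(&pir_desc->pending, 1);
 *
 * followed by a notification IPI; the processor ORs 'pir' into the virtual
 * APIC page when the notification vector arrives in VMX non-root operation.
 */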
/* Index into the 'guest_msrs[]' array */
enum {
IDX_MSR_LSTAR,
IDX_MSR_CSTAR,
IDX_MSR_STAR,
IDX_MSR_SF_MASK,
IDX_MSR_KGSBASE,
IDX_MSR_PAT,
GUEST_MSR_NUM /* must be the last enumeration */
};
/* virtual machine softc */
struct vmx {
struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */
char msr_bitmap[PAGE_SIZE];
struct pir_desc pir_desc[VM_MAXCPU];
uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM];
struct vmxctx ctx[VM_MAXCPU];
struct vmxcap cap[VM_MAXCPU];
struct vmxstate state[VM_MAXCPU];
uint64_t eptp;
struct vm *vm;
long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */
};
CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0);
#define VMX_GUEST_VMEXIT 0
#define VMX_VMRESUME_ERROR 1
#define VMX_VMLAUNCH_ERROR 2
#define VMX_INVEPT_ERROR 3
int vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched);
void vmx_call_isr(uintptr_t entry);
u_long vmx_fix_cr0(u_long cr0);
u_long vmx_fix_cr4(u_long cr4);
extern char vmx_exit_guest[];
#endif

96
vmm/intel/vmx_controls.h Normal file

@@ -0,0 +1,96 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_CONTROLS_H_
#define _VMX_CONTROLS_H_
/* Pin-Based VM-Execution Controls */
#define PINBASED_EXTINT_EXITING (1 << 0)
#define PINBASED_NMI_EXITING (1 << 3)
#define PINBASED_VIRTUAL_NMI (1 << 5)
#define PINBASED_PREMPTION_TIMER (1 << 6)
#define PINBASED_POSTED_INTERRUPT (1 << 7)
/* Primary Processor-Based VM-Execution Controls */
#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
#define PROCBASED_TSC_OFFSET (1 << 3)
#define PROCBASED_HLT_EXITING (1 << 7)
#define PROCBASED_INVLPG_EXITING (1 << 9)
#define PROCBASED_MWAIT_EXITING (1 << 10)
#define PROCBASED_RDPMC_EXITING (1 << 11)
#define PROCBASED_RDTSC_EXITING (1 << 12)
#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
#define PROCBASED_CR3_STORE_EXITING (1 << 16)
#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
#define PROCBASED_CR8_STORE_EXITING (1 << 20)
#define PROCBASED_USE_TPR_SHADOW (1 << 21)
#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
#define PROCBASED_MOV_DR_EXITING (1 << 23)
#define PROCBASED_IO_EXITING (1 << 24)
#define PROCBASED_IO_BITMAPS (1 << 25)
#define PROCBASED_MTF (1 << 27)
#define PROCBASED_MSR_BITMAPS (1 << 28)
#define PROCBASED_MONITOR_EXITING (1 << 29)
#define PROCBASED_PAUSE_EXITING (1 << 30)
#define PROCBASED_SECONDARY_CONTROLS (1U << 31)
/* Secondary Processor-Based VM-Execution Controls */
#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0)
#define PROCBASED2_ENABLE_EPT (1 << 1)
#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4)
#define PROCBASED2_ENABLE_VPID (1 << 5)
#define PROCBASED2_WBINVD_EXITING (1 << 6)
#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8)
#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9)
#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
#define PROCBASED2_ENABLE_INVPCID (1 << 12)
/* VM Exit Controls */
#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
#define VM_EXIT_HOST_LMA (1 << 9)
#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
#define VM_EXIT_SAVE_PAT (1 << 18)
#define VM_EXIT_LOAD_PAT (1 << 19)
#define VM_EXIT_SAVE_EFER (1 << 20)
#define VM_EXIT_LOAD_EFER (1 << 21)
#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
/* VM Entry Controls */
#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
#define VM_ENTRY_GUEST_LMA (1 << 9)
#define VM_ENTRY_INTO_SMM (1 << 10)
#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
#define VM_ENTRY_LOAD_PAT (1 << 14)
#define VM_ENTRY_LOAD_EFER (1 << 15)
#endif

218
vmm/intel/vmx_cpufunc.h Normal file

@@ -0,0 +1,218 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_CPUFUNC_H_
#define _VMX_CPUFUNC_H_
struct vmcs;
/*
* Section 5.2 "Conventions" from Intel Architecture Manual 2B.
*
* error
* VMsucceed 0
* VMFailInvalid 1
* VMFailValid 2 see also VMCS VM-Instruction Error Field
*/
#define VM_SUCCESS 0
#define VM_FAIL_INVALID 1
#define VM_FAIL_VALID 2
#define VMX_SET_ERROR_CODE \
" jnc 1f;" \
" mov $1, %[error];" /* CF: error = 1 */ \
" jmp 3f;" \
"1: jnz 2f;" \
" mov $2, %[error];" /* ZF: error = 2 */ \
" jmp 3f;" \
"2: mov $0, %[error];" \
"3:"
/* returns 0 on success and non-zero on failure */
static __inline int
vmxon(char *region)
{
int error;
uint64_t addr;
addr = vtophys(region);
__asm __volatile("vmxon %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [addr] "m" (*(uint64_t *)&addr)
: "memory");
return (error);
}
/* returns 0 on success and non-zero on failure */
static __inline int
vmclear(struct vmcs *vmcs)
{
int error;
uint64_t addr;
addr = vtophys(vmcs);
__asm __volatile("vmclear %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [addr] "m" (*(uint64_t *)&addr)
: "memory");
return (error);
}
static __inline void
vmxoff(void)
{
__asm __volatile("vmxoff");
}
static __inline void
vmptrst(uint64_t *addr)
{
__asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
}
static __inline int
vmptrld(struct vmcs *vmcs)
{
int error;
uint64_t addr;
addr = vtophys(vmcs);
__asm __volatile("vmptrld %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [addr] "m" (*(uint64_t *)&addr)
: "memory");
return (error);
}
static __inline int
vmwrite(uint64_t reg, uint64_t val)
{
int error;
__asm __volatile("vmwrite %[val], %[reg];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [val] "r" (val), [reg] "r" (reg)
: "memory");
return (error);
}
static __inline int
vmread(uint64_t r, uint64_t *addr)
{
int error;
__asm __volatile("vmread %[r], %[addr];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [r] "r" (r), [addr] "m" (*addr)
: "memory");
return (error);
}
static void __inline
VMCLEAR(struct vmcs *vmcs)
{
int err;
err = vmclear(vmcs);
if (err != 0)
panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
critical_exit();
}
static void __inline
VMPTRLD(struct vmcs *vmcs)
{
int err;
critical_enter();
err = vmptrld(vmcs);
if (err != 0)
panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
}
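/*
 * Editorial note: VMPTRLD() enters a critical section that the matching
 * VMCLEAR() exits, so the thread cannot be migrated to another CPU while
 * its VMCS is current.
 */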
#define INVVPID_TYPE_ADDRESS 0UL
#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
#define INVVPID_TYPE_ALL_CONTEXTS 2UL
struct invvpid_desc {
uint16_t vpid;
uint16_t _res1;
uint32_t _res2;
uint64_t linear_addr;
};
CTASSERT(sizeof(struct invvpid_desc) == 16);
static void __inline
invvpid(uint64_t type, struct invvpid_desc desc)
{
int error;
__asm __volatile("invvpid %[desc], %[type];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [desc] "m" (desc), [type] "r" (type)
: "memory");
if (error)
panic("invvpid error %d", error);
}
#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
#define INVEPT_TYPE_ALL_CONTEXTS 2UL
struct invept_desc {
uint64_t eptp;
uint64_t _res;
};
CTASSERT(sizeof(struct invept_desc) == 16);
static void __inline
invept(uint64_t type, struct invept_desc desc)
{
int error;
__asm __volatile("invept %[desc], %[type];"
VMX_SET_ERROR_CODE
: [error] "=r" (error)
: [desc] "m" (desc), [type] "r" (type)
: "memory");
if (error)
panic("invept error %d", error);
}
#endif

88
vmm/intel/vmx_genassym.c Normal file

@@ -0,0 +1,88 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/assym.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/vmm.h>
#include "vmx_cpufunc.h"
#include "vmx.h"
ASSYM(VMXCTX_GUEST_RDI, offsetof(struct vmxctx, guest_rdi));
ASSYM(VMXCTX_GUEST_RSI, offsetof(struct vmxctx, guest_rsi));
ASSYM(VMXCTX_GUEST_RDX, offsetof(struct vmxctx, guest_rdx));
ASSYM(VMXCTX_GUEST_RCX, offsetof(struct vmxctx, guest_rcx));
ASSYM(VMXCTX_GUEST_R8, offsetof(struct vmxctx, guest_r8));
ASSYM(VMXCTX_GUEST_R9, offsetof(struct vmxctx, guest_r9));
ASSYM(VMXCTX_GUEST_RAX, offsetof(struct vmxctx, guest_rax));
ASSYM(VMXCTX_GUEST_RBX, offsetof(struct vmxctx, guest_rbx));
ASSYM(VMXCTX_GUEST_RBP, offsetof(struct vmxctx, guest_rbp));
ASSYM(VMXCTX_GUEST_R10, offsetof(struct vmxctx, guest_r10));
ASSYM(VMXCTX_GUEST_R11, offsetof(struct vmxctx, guest_r11));
ASSYM(VMXCTX_GUEST_R12, offsetof(struct vmxctx, guest_r12));
ASSYM(VMXCTX_GUEST_R13, offsetof(struct vmxctx, guest_r13));
ASSYM(VMXCTX_GUEST_R14, offsetof(struct vmxctx, guest_r14));
ASSYM(VMXCTX_GUEST_R15, offsetof(struct vmxctx, guest_r15));
ASSYM(VMXCTX_GUEST_CR2, offsetof(struct vmxctx, guest_cr2));
ASSYM(VMXCTX_HOST_R15, offsetof(struct vmxctx, host_r15));
ASSYM(VMXCTX_HOST_R14, offsetof(struct vmxctx, host_r14));
ASSYM(VMXCTX_HOST_R13, offsetof(struct vmxctx, host_r13));
ASSYM(VMXCTX_HOST_R12, offsetof(struct vmxctx, host_r12));
ASSYM(VMXCTX_HOST_RBP, offsetof(struct vmxctx, host_rbp));
ASSYM(VMXCTX_HOST_RSP, offsetof(struct vmxctx, host_rsp));
ASSYM(VMXCTX_HOST_RBX, offsetof(struct vmxctx, host_rbx));
ASSYM(VMXCTX_INST_FAIL_STATUS, offsetof(struct vmxctx, inst_fail_status));
ASSYM(VMXCTX_PMAP, offsetof(struct vmxctx, pmap));
ASSYM(VMX_EPTGEN, offsetof(struct vmx, eptgen));
ASSYM(VMX_EPTP, offsetof(struct vmx, eptp));
ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
ASSYM(VMX_GUEST_VMEXIT, VMX_GUEST_VMEXIT);
ASSYM(VMX_VMRESUME_ERROR, VMX_VMRESUME_ERROR);
ASSYM(VMX_VMLAUNCH_ERROR, VMX_VMLAUNCH_ERROR);
ASSYM(VMX_INVEPT_ERROR, VMX_INVEPT_ERROR);
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
ASSYM(PM_EPTGEN, offsetof(struct pmap, pm_eptgen));
ASSYM(KERNEL_SS, GSEL(GDATA_SEL, SEL_KPL));
ASSYM(KERNEL_CS, GSEL(GCODE_SEL, SEL_KPL));

483
vmm/intel/vmx_msr.c Normal file

@@ -0,0 +1,483 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include "vmx.h"
#include "vmx_msr.h"
static boolean_t
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{
if (msr_val & (1UL << (bitpos + 32)))
return (TRUE);
else
return (FALSE);
}
static boolean_t
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{
if ((msr_val & (1UL << bitpos)) == 0)
return (TRUE);
else
return (FALSE);
}
uint32_t
vmx_revision(void)
{
return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
}
/*
* Generate a bitmask to be used for the VMCS execution control fields.
*
* The caller specifies what bits should be set to one in 'ones_mask'
* and what bits should be set to zero in 'zeros_mask'. The don't-care
* bits are set to the default value. The default values are obtained
* based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
* VMX Capabilities".
*
* Returns zero on success and non-zero on error.
*/
int
vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
uint32_t zeros_mask, uint32_t *retval)
{
int i;
uint64_t val, trueval;
boolean_t true_ctls_avail, one_allowed, zero_allowed;
/* We cannot ask the same bit to be set to both '1' and '0' */
if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
return (EINVAL);
if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
true_ctls_avail = TRUE;
else
true_ctls_avail = FALSE;
val = rdmsr(ctl_reg);
if (true_ctls_avail)
trueval = rdmsr(true_ctl_reg); /* step c */
else
trueval = val; /* step a */
for (i = 0; i < 32; i++) {
one_allowed = vmx_ctl_allows_one_setting(trueval, i);
zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
KASSERT(one_allowed || zero_allowed,
("invalid zero/one setting for bit %d of ctl 0x%0x, "
"truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
if (zero_allowed && !one_allowed) { /* b(i),c(i) */
if (ones_mask & (1 << i))
return (EINVAL);
*retval &= ~(1 << i);
} else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
if (zeros_mask & (1 << i))
return (EINVAL);
*retval |= 1 << i;
} else {
if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
*retval &= ~(1 << i);
else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
*retval |= 1 << i;
else if (!true_ctls_avail)
*retval &= ~(1 << i); /* b(iii) */
else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
*retval &= ~(1 << i);
else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
*retval |= 1 << i;
else {
panic("vmx_set_ctlreg: unable to determine "
"correct value of ctl bit %d for msr "
"0x%0x and true msr 0x%0x", i, ctl_reg,
true_ctl_reg);
}
}
}
return (0);
}
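/*
 * Usage sketch (editorial note; the suppressed vmx.c makes a similar call):
 * a caller that must have external-interrupt and NMI exiting, and needs no
 * bits forced to zero, might do:
 *
 *	uint32_t pinbased_ctls;
 *	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 *	    MSR_VMX_TRUE_PINBASED_CTLS,
 *	    PINBASED_EXTINT_EXITING | PINBASED_NMI_EXITING,
 *	    0, &pinbased_ctls);
 *
 * and, on success, write the result to the VMCS_PIN_BASED_CTLS field.
 */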
void
msr_bitmap_initialize(char *bitmap)
{
memset(bitmap, 0xff, PAGE_SIZE);
}
int
msr_bitmap_change_access(char *bitmap, u_int msr, int access)
{
int byte, bit;
if (msr <= 0x00001FFF)
byte = msr / 8;
else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
byte = 1024 + (msr - 0xC0000000) / 8;
else
return (EINVAL);
bit = msr & 0x7;
if (access & MSR_BITMAP_ACCESS_READ)
bitmap[byte] &= ~(1 << bit);
else
bitmap[byte] |= 1 << bit;
byte += 2048;
if (access & MSR_BITMAP_ACCESS_WRITE)
bitmap[byte] &= ~(1 << bit);
else
bitmap[byte] |= 1 << bit;
return (0);
}
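/*
 * Worked example (editorial note): for MSR_LSTAR (0xC0000082),
 * byte = 1024 + (0x82 / 8) = 1040 and bit = 2, so the read intercept
 * lives at bitmap[1040] and the write intercept at bitmap[1040 + 2048];
 * clearing both bits gives the guest direct access to the MSR.
 */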
static uint64_t misc_enable;
static uint64_t platform_info;
static uint64_t turbo_ratio_limit;
static uint64_t host_msrs[GUEST_MSR_NUM];
static bool
nehalem_cpu(void)
{
u_int family, model;
/*
* The family:model numbers belonging to the Nehalem microarchitecture
* are documented in Section 35.5, Intel SDM dated Feb 2014.
*/
family = CPUID_TO_FAMILY(cpu_id);
model = CPUID_TO_MODEL(cpu_id);
if (family == 0x6) {
switch (model) {
case 0x1A:
case 0x1E:
case 0x1F:
case 0x2E:
return (true);
default:
break;
}
}
return (false);
}
static bool
westmere_cpu(void)
{
u_int family, model;
/*
* The family:model numbers belonging to the Westmere microarchitecture
* are documented in Section 35.6, Intel SDM dated Feb 2014.
*/
family = CPUID_TO_FAMILY(cpu_id);
model = CPUID_TO_MODEL(cpu_id);
if (family == 0x6) {
switch (model) {
case 0x25:
case 0x2C:
return (true);
default:
break;
}
}
return (false);
}
static bool
pat_valid(uint64_t val)
{
int i, pa;
/*
* From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
*
* Extract PA0 through PA7 and validate that each one encodes a
* valid memory type.
*/
for (i = 0; i < 8; i++) {
pa = (val >> (i * 8)) & 0xff;
if (pa == 2 || pa == 3 || pa >= 8)
return (false);
}
return (true);
}
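/*
 * Editorial note: the valid PAT encodings are 0 (UC), 1 (WC), 4 (WT),
 * 5 (WP), 6 (WB) and 7 (UC-); values 2, 3 and anything >= 8 are reserved,
 * which is exactly what the check above rejects.
 */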
void
vmx_msr_init(void)
{
uint64_t bus_freq, ratio;
int i;
/*
* It is safe to cache the values of the following MSRs because
* they don't change based on curcpu, curproc or curthread.
*/
host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
/*
* Initialize emulated MSRs
*/
misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
/*
* Set mandatory bits
* 11: branch trace disabled
* 12: PEBS unavailable
* Clear unsupported features
* 16: SpeedStep enable
* 18: enable MONITOR FSM
*/
misc_enable |= (1 << 12) | (1 << 11);
misc_enable &= ~((1 << 18) | (1 << 16));
if (nehalem_cpu() || westmere_cpu())
bus_freq = 133330000; /* 133 MHz */
else
bus_freq = 100000000; /* 100 MHz */
/*
* XXXtime
* The ratio should really be based on the virtual TSC frequency as
* opposed to the host TSC.
*/
ratio = (tsc_freq / bus_freq) & 0xff;
/*
* The register definition is based on the micro-architecture
* but the following bits are always the same:
* [15:8] Maximum Non-Turbo Ratio
* [28] Programmable Ratio Limit for Turbo Mode
* [29] Programmable TDC-TDP Limit for Turbo Mode
* [47:40] Maximum Efficiency Ratio
*
* The other bits can be safely set to 0 on all
* micro-architectures up to Haswell.
*/
platform_info = (ratio << 8) | (ratio << 40);
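/*
 * Worked example (editorial note): on a 2.4 GHz host with the assumed
 * 100 MHz bus, ratio = 24 (0x18), so platform_info advertises 0x18 both
 * in bits [15:8] (maximum non-turbo ratio) and bits [47:40] (maximum
 * efficiency ratio).
 */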
/*
* The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
* dependent on the maximum cores per package supported by the micro-
* architecture. For example, Westmere supports 6 cores per package and
* uses the low 48 bits. Sandy Bridge supports 8 cores per package and
* uses all 64 bits.
*
* However, the unused bits are reserved so we pretend that all bits
* in this MSR are valid.
*/
for (i = 0; i < 8; i++)
turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
}
void
vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
{
uint64_t *guest_msrs;
guest_msrs = vmx->guest_msrs[vcpuid];
/*
* The permissions bitmap is shared between all vcpus so initialize it
* once when initializing the vBSP.
*/
if (vcpuid == 0) {
guest_msr_rw(vmx, MSR_LSTAR);
guest_msr_rw(vmx, MSR_CSTAR);
guest_msr_rw(vmx, MSR_STAR);
guest_msr_rw(vmx, MSR_SF_MASK);
guest_msr_rw(vmx, MSR_KGSBASE);
}
/*
* Initialize guest IA32_PAT MSR with default value after reset.
*/
guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
PAT_VALUE(1, PAT_WRITE_THROUGH) |
PAT_VALUE(2, PAT_UNCACHED) |
PAT_VALUE(3, PAT_UNCACHEABLE) |
PAT_VALUE(4, PAT_WRITE_BACK) |
PAT_VALUE(5, PAT_WRITE_THROUGH) |
PAT_VALUE(6, PAT_UNCACHED) |
PAT_VALUE(7, PAT_UNCACHEABLE);
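/*
 * Editorial note: PAT_VALUE() packs one 3-bit memory type per byte, so
 * the initializer above yields the architectural power-on default of
 * 0x0007040600070406.
 */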
return;
}
void
vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
{
uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
/* Save host MSRs (if any) and restore guest MSRs */
wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
}
void
vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
{
uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
/* Save guest MSRs */
guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
/* Restore host MSRs */
wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
/* MSR_KGSBASE will be restored on the way back to userspace */
}
int
vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
{
const uint64_t *guest_msrs;
int error;
guest_msrs = vmx->guest_msrs[vcpuid];
error = 0;
switch (num) {
case MSR_MCG_CAP:
case MSR_MCG_STATUS:
*val = 0;
break;
case MSR_MTRRcap:
case MSR_MTRRdefType:
case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
case MSR_MTRR64kBase:
*val = 0;
break;
case MSR_IA32_MISC_ENABLE:
*val = misc_enable;
break;
case MSR_PLATFORM_INFO:
*val = platform_info;
break;
case MSR_TURBO_RATIO_LIMIT:
case MSR_TURBO_RATIO_LIMIT1:
*val = turbo_ratio_limit;
break;
case MSR_PAT:
*val = guest_msrs[IDX_MSR_PAT];
break;
default:
error = EINVAL;
break;
}
return (error);
}
int
vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
{
uint64_t *guest_msrs;
uint64_t changed;
int error;
guest_msrs = vmx->guest_msrs[vcpuid];
error = 0;
switch (num) {
case MSR_MCG_CAP:
case MSR_MCG_STATUS:
break; /* ignore writes */
case MSR_MTRRcap:
vm_inject_gp(vmx->vm, vcpuid);
break;
case MSR_MTRRdefType:
case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
case MSR_MTRR64kBase:
break; /* Ignore writes */
case MSR_IA32_MISC_ENABLE:
changed = val ^ misc_enable;
/*
* If the host has disabled the NX feature then the guest
* also cannot use it. However, a Linux guest will try to
* enable the NX feature by writing to the MISC_ENABLE MSR.
*
* This can be safely ignored because the memory management
* code looks at CPUID.80000001H:EDX.NX to check if the
* functionality is actually enabled.
*/
changed &= ~(1UL << 34);
/*
* Punt to userspace if any other bits are being modified.
*/
if (changed)
error = EINVAL;
break;
case MSR_PAT:
if (pat_valid(val))
guest_msrs[IDX_MSR_PAT] = val;
else
vm_inject_gp(vmx->vm, vcpuid);
break;
default:
error = EINVAL;
break;
}
return (error);
}

70
vmm/intel/vmx_msr.h Normal file

@@ -0,0 +1,70 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMX_MSR_H_
#define _VMX_MSR_H_
struct vmx;
void vmx_msr_init(void);
void vmx_msr_guest_init(struct vmx *vmx, int vcpuid);
void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid);
void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid);
int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu);
int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu);
uint32_t vmx_revision(void);
int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
uint32_t zeros_mask, uint32_t *retval);
/*
* According to Section 21.10.4 "Software Access to Related Structures",
* changes to data structures pointed to by the VMCS must be made only when
* there is no logical processor with a current VMCS that points to the
* data structure.
*
* This pretty much limits us to configuring the MSR bitmap before VMCS
* initialization for SMP VMs, unless of course we do it the hard way, which
* would involve some form of synchronization between the vcpus to vmclear
* all VMCSs that point to the bitmap.
*/
#define MSR_BITMAP_ACCESS_NONE 0x0
#define MSR_BITMAP_ACCESS_READ 0x1
#define MSR_BITMAP_ACCESS_WRITE 0x2
#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
void msr_bitmap_initialize(char *bitmap);
int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
#define guest_msr_rw(vmx, msr) \
msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
#define guest_msr_ro(vmx, msr) \
msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
#endif

262
vmm/intel/vmx_support.S Normal file

@@ -0,0 +1,262 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <machine/asmacros.h>
#include "vmx_assym.h"
#ifdef SMP
#define LK lock ;
#else
#define LK
#endif
/* Be friendly to DTrace FBT's prologue/epilogue pattern matching */
#define VENTER push %rbp ; mov %rsp,%rbp
#define VLEAVE pop %rbp
/*
* Assumes that %rdi holds a pointer to the 'vmxctx'.
*
* On "return" all registers are updated to reflect guest state. The two
* exceptions are %rip and %rsp. These registers are atomically switched
* by hardware from the guest area of the vmcs.
*
* We modify %rsp to point to the 'vmxctx' so we can use it to restore
* host context in case of an error with 'vmlaunch' or 'vmresume'.
*/
#define VMX_GUEST_RESTORE \
movq %rdi,%rsp; \
movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
movq %rsi,%cr2; \
movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
movq VMXCTX_GUEST_R8(%rdi),%r8; \
movq VMXCTX_GUEST_R9(%rdi),%r9; \
movq VMXCTX_GUEST_RAX(%rdi),%rax; \
movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
movq VMXCTX_GUEST_R10(%rdi),%r10; \
movq VMXCTX_GUEST_R11(%rdi),%r11; \
movq VMXCTX_GUEST_R12(%rdi),%r12; \
movq VMXCTX_GUEST_R13(%rdi),%r13; \
movq VMXCTX_GUEST_R14(%rdi),%r14; \
movq VMXCTX_GUEST_R15(%rdi),%r15; \
movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore %rdi last */
/*
* Save and restore the host context.
*
* Assumes that %rdi holds a pointer to the 'vmxctx'.
*/
#define VMX_HOST_SAVE \
movq %r15, VMXCTX_HOST_R15(%rdi); \
movq %r14, VMXCTX_HOST_R14(%rdi); \
movq %r13, VMXCTX_HOST_R13(%rdi); \
movq %r12, VMXCTX_HOST_R12(%rdi); \
movq %rbp, VMXCTX_HOST_RBP(%rdi); \
movq %rsp, VMXCTX_HOST_RSP(%rdi); \
movq %rbx, VMXCTX_HOST_RBX(%rdi);
#define VMX_HOST_RESTORE \
movq VMXCTX_HOST_R15(%rdi), %r15; \
movq VMXCTX_HOST_R14(%rdi), %r14; \
movq VMXCTX_HOST_R13(%rdi), %r13; \
movq VMXCTX_HOST_R12(%rdi), %r12; \
movq VMXCTX_HOST_RBP(%rdi), %rbp; \
movq VMXCTX_HOST_RSP(%rdi), %rsp; \
movq VMXCTX_HOST_RBX(%rdi), %rbx;
/*
* vmx_enter_guest(struct vmxctx *vmxctx, int launched)
* %rdi: pointer to the 'vmxctx'
* %rsi: pointer to the 'vmx'
* %edx: launch state of the VMCS
* Interrupts must be disabled on entry.
*/
ENTRY(vmx_enter_guest)
VENTER
/*
* Save host state before doing anything else.
*/
VMX_HOST_SAVE
/*
* Activate guest pmap on this cpu.
*/
movq VMXCTX_PMAP(%rdi), %r11
movl PCPU(CPUID), %eax
LK btsl %eax, PM_ACTIVE(%r11)
/*
* If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen'
* then we must invalidate all mappings associated with this EPTP.
*/
movq PM_EPTGEN(%r11), %r10
cmpq %r10, VMX_EPTGEN(%rsi, %rax, 8)
je guest_restore
/* Refresh 'vmx->eptgen[curcpu]' */
movq %r10, VMX_EPTGEN(%rsi, %rax, 8)
/* Setup the invept descriptor on the host stack */
mov %rsp, %r11
movq VMX_EPTP(%rsi), %rax
movq %rax, -16(%r11)
movq $0x0, -8(%r11)
mov $0x1, %eax /* Single context invalidate */
invept -16(%r11), %rax
jbe invept_error /* Check invept instruction error */
guest_restore:
cmpl $0, %edx
je do_launch
VMX_GUEST_RESTORE
vmresume
/*
* In the common case 'vmresume' returns back to the host through
* 'vmx_exit_guest' with %rsp pointing to 'vmxctx'.
*
* If there is an error we return VMX_VMRESUME_ERROR to the caller.
*/
movq %rsp, %rdi /* point %rdi back to 'vmxctx' */
movl $VMX_VMRESUME_ERROR, %eax
jmp decode_inst_error
do_launch:
VMX_GUEST_RESTORE
vmlaunch
/*
* In the common case 'vmlaunch' returns back to the host through
* 'vmx_exit_guest' with %rsp pointing to 'vmxctx'.
*
* If there is an error we return VMX_VMLAUNCH_ERROR to the caller.
*/
movq %rsp, %rdi /* point %rdi back to 'vmxctx' */
movl $VMX_VMLAUNCH_ERROR, %eax
jmp decode_inst_error
invept_error:
movl $VMX_INVEPT_ERROR, %eax
jmp decode_inst_error
decode_inst_error:
movl $VM_FAIL_VALID, %r11d
jz inst_error
movl $VM_FAIL_INVALID, %r11d
inst_error:
movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi)
/*
* The return value is already populated in %eax so we cannot use
* it as a scratch register beyond this point.
*/
/*
* Deactivate guest pmap from this cpu.
*/
movq VMXCTX_PMAP(%rdi), %r11
movl PCPU(CPUID), %r10d
LK btrl %r10d, PM_ACTIVE(%r11)
VMX_HOST_RESTORE
VLEAVE
ret
/*
* Non-error VM-exit from the guest. Make this a label so it can
* be used by C code when setting up the VMCS.
* The VMCS-restored %rsp points to the 'struct vmxctx'.
*/
ALIGN_TEXT
.globl vmx_exit_guest
vmx_exit_guest:
/*
* Save guest state that is not automatically saved in the vmcs.
*/
movq %rdi,VMXCTX_GUEST_RDI(%rsp)
movq %rsi,VMXCTX_GUEST_RSI(%rsp)
movq %rdx,VMXCTX_GUEST_RDX(%rsp)
movq %rcx,VMXCTX_GUEST_RCX(%rsp)
movq %r8,VMXCTX_GUEST_R8(%rsp)
movq %r9,VMXCTX_GUEST_R9(%rsp)
movq %rax,VMXCTX_GUEST_RAX(%rsp)
movq %rbx,VMXCTX_GUEST_RBX(%rsp)
movq %rbp,VMXCTX_GUEST_RBP(%rsp)
movq %r10,VMXCTX_GUEST_R10(%rsp)
movq %r11,VMXCTX_GUEST_R11(%rsp)
movq %r12,VMXCTX_GUEST_R12(%rsp)
movq %r13,VMXCTX_GUEST_R13(%rsp)
movq %r14,VMXCTX_GUEST_R14(%rsp)
movq %r15,VMXCTX_GUEST_R15(%rsp)
movq %cr2,%rdi
movq %rdi,VMXCTX_GUEST_CR2(%rsp)
movq %rsp,%rdi
/*
* Deactivate guest pmap from this cpu.
*/
movq VMXCTX_PMAP(%rdi), %r11
movl PCPU(CPUID), %r10d
LK btrl %r10d, PM_ACTIVE(%r11)
VMX_HOST_RESTORE
/*
* This will return to the caller of 'vmx_enter_guest()' with a return
* value of VMX_GUEST_VMEXIT.
*/
movl $VMX_GUEST_VMEXIT, %eax
VLEAVE
ret
END(vmx_enter_guest)
/*
* %rdi = interrupt handler entry point
*
* Calling sequence described in the "Instruction Set Reference" for the "INT"
* instruction in Intel SDM, Vol 2.
*/
ENTRY(vmx_call_isr)
VENTER
mov %rsp, %r11 /* save %rsp */
and $~0xf, %rsp /* align on 16-byte boundary */
pushq $KERNEL_SS /* %ss */
pushq %r11 /* %rsp */
pushfq /* %rflags */
pushq $KERNEL_CS /* %cs */
cli /* disable interrupts */
callq *%rdi /* push %rip and call isr */
VLEAVE
ret
END(vmx_call_isr)

688
vmm/intel/vtd.c Normal file

@@ -0,0 +1,688 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <dev/pci/pcireg.h>
#include <machine/vmparam.h>
#include <contrib/dev/acpica/include/acpi.h>
#include "io/iommu.h"
/*
* Documented in the "Intel Virtualization Technology for Directed I/O",
* Architecture Spec, September 2008.
*/
/* Section 10.4 "Register Descriptions" */
struct vtdmap {
volatile uint32_t version;
volatile uint32_t res0;
volatile uint64_t cap;
volatile uint64_t ext_cap;
volatile uint32_t gcr;
volatile uint32_t gsr;
volatile uint64_t rta;
volatile uint64_t ccr;
};
#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F)
#define VTD_CAP_ND(cap) ((cap) & 0x7)
#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1)
#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF)
#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1)
#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1)
#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1)
#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF)
#define VTD_GCR_WBF (1 << 27)
#define VTD_GCR_SRTP (1 << 30)
#define VTD_GCR_TE (1U << 31)
#define VTD_GSR_WBFS (1 << 27)
#define VTD_GSR_RTPS (1 << 30)
#define VTD_GSR_TES (1U << 31)
#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */
#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */
#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */
#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */
#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */
#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */
#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */
#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */
#define VTD_IIR_DOMAIN_P 32
#define VTD_ROOT_PRESENT 0x1
#define VTD_CTX_PRESENT 0x1
#define VTD_CTX_TT_ALL (1UL << 2)
#define VTD_PTE_RD (1UL << 0)
#define VTD_PTE_WR (1UL << 1)
#define VTD_PTE_SUPERPAGE (1UL << 7)
#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL)
#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2)
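/*
 * Editorial note: the low 8 bits of a PCI RID are device:function, so each
 * bus gets its own context table; VTD_RID2IDX doubles the value because a
 * context entry occupies two 64-bit words (128 bits), indexed below as
 * ctxp[idx] and ctxp[idx + 1].
 */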
struct domain {
uint64_t *ptp; /* first level page table page */
int pt_levels; /* number of page table levels */
int addrwidth; /* 'AW' field in context entry */
int spsmask; /* supported super page sizes */
u_int id; /* domain id */
vm_paddr_t maxaddr; /* highest address to be mapped */
SLIST_ENTRY(domain) next;
};
static SLIST_HEAD(, domain) domhead;
#define DRHD_MAX_UNITS 8
static int drhd_num;
static struct vtdmap *vtdmaps[DRHD_MAX_UNITS];
static int max_domains;
typedef int (*drhd_ident_func_t)(void);
static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
static MALLOC_DEFINE(M_VTD, "vtd", "vtd");
static int
vtd_max_domains(struct vtdmap *vtdmap)
{
int nd;
nd = VTD_CAP_ND(vtdmap->cap);
switch (nd) {
case 0:
return (16);
case 1:
return (64);
case 2:
return (256);
case 3:
return (1024);
case 4:
return (4 * 1024);
case 5:
return (16 * 1024);
case 6:
return (64 * 1024);
default:
panic("vtd_max_domains: invalid value of nd (0x%0x)", nd);
}
}
static u_int
domain_id(void)
{
u_int id;
struct domain *dom;
/* Skip domain id 0 - it is reserved when the Caching Mode field is set */
for (id = 1; id < max_domains; id++) {
SLIST_FOREACH(dom, &domhead, next) {
if (dom->id == id)
break;
}
if (dom == NULL)
break; /* found it */
}
if (id >= max_domains)
panic("domain ids exhausted");
return (id);
}
static void
vtd_wbflush(struct vtdmap *vtdmap)
{
if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0)
pmap_invalidate_cache();
if (VTD_CAP_RWBF(vtdmap->cap)) {
vtdmap->gcr = VTD_GCR_WBF;
while ((vtdmap->gsr & VTD_GSR_WBFS) != 0)
;
}
}
static void
vtd_ctx_global_invalidate(struct vtdmap *vtdmap)
{
vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL;
while ((vtdmap->ccr & VTD_CCR_ICC) != 0)
;
}
static void
vtd_iotlb_global_invalidate(struct vtdmap *vtdmap)
{
int offset;
volatile uint64_t *iotlb_reg, val;
vtd_wbflush(vtdmap);
offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16;
iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8);
*iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL |
VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES;
while (1) {
val = *iotlb_reg;
if ((val & VTD_IIR_IVT) == 0)
break;
}
}
static void
vtd_translation_enable(struct vtdmap *vtdmap)
{
vtdmap->gcr = VTD_GCR_TE;
while ((vtdmap->gsr & VTD_GSR_TES) == 0)
;
}
static void
vtd_translation_disable(struct vtdmap *vtdmap)
{
vtdmap->gcr = 0;
while ((vtdmap->gsr & VTD_GSR_TES) != 0)
;
}
static int
vtd_init(void)
{
int i, units, remaining;
struct vtdmap *vtdmap;
vm_paddr_t ctx_paddr;
char *end, envname[32];
unsigned long mapaddr;
ACPI_STATUS status;
ACPI_TABLE_DMAR *dmar;
ACPI_DMAR_HEADER *hdr;
ACPI_DMAR_HARDWARE_UNIT *drhd;
/*
* Allow the user to override the ACPI DMAR table by specifying the
* physical address of each remapping unit.
*
* The following example specifies two remapping units at
* physical addresses 0xfed90000 and 0xfeda0000 respectively.
* set vtd.regmap.0.addr=0xfed90000
* set vtd.regmap.1.addr=0xfeda0000
*/
for (units = 0; units < DRHD_MAX_UNITS; units++) {
snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units);
if (getenv_ulong(envname, &mapaddr) == 0)
break;
vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr);
}
if (units > 0)
goto skip_dmar;
/* Search for DMAR table. */
status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar);
if (ACPI_FAILURE(status))
return (ENXIO);
end = (char *)dmar + dmar->Header.Length;
remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR);
while (remaining > sizeof(ACPI_DMAR_HEADER)) {
hdr = (ACPI_DMAR_HEADER *)(end - remaining);
if (hdr->Length > remaining)
break;
/*
* From Intel VT-d arch spec, version 1.3:
* BIOS implementations must report mapping structures
* in numerical order, i.e., all remapping structures of
* type 0 (DRHD) are enumerated before remapping structures of
* type 1 (RMRR), and so forth.
*/
if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
break;
drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
if (units >= DRHD_MAX_UNITS)
break;
remaining -= hdr->Length;
}
if (units <= 0)
return (ENXIO);
skip_dmar:
drhd_num = units;
vtdmap = vtdmaps[0];
if (VTD_CAP_CM(vtdmap->cap) != 0)
panic("vtd_init: invalid caching mode");
max_domains = vtd_max_domains(vtdmap);
/*
* Set up the root-table to point to the context-entry tables
*/
for (i = 0; i < 256; i++) {
ctx_paddr = vtophys(ctx_tables[i]);
if (ctx_paddr & PAGE_MASK)
panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
}
return (0);
}
static void
vtd_cleanup(void)
{
}
static void
vtd_enable(void)
{
int i;
struct vtdmap *vtdmap;
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_wbflush(vtdmap);
/* Update the root table address */
vtdmap->rta = vtophys(root_table);
vtdmap->gcr = VTD_GCR_SRTP;
while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
;
vtd_ctx_global_invalidate(vtdmap);
vtd_iotlb_global_invalidate(vtdmap);
vtd_translation_enable(vtdmap);
}
}
static void
vtd_disable(void)
{
int i;
struct vtdmap *vtdmap;
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_translation_disable(vtdmap);
}
}
static void
vtd_add_device(void *arg, uint16_t rid)
{
int idx;
uint64_t *ctxp;
struct domain *dom = arg;
vm_paddr_t pt_paddr;
struct vtdmap *vtdmap;
uint8_t bus;
vtdmap = vtdmaps[0];
bus = PCI_RID2BUS(rid);
ctxp = ctx_tables[bus];
pt_paddr = vtophys(dom->ptp);
idx = VTD_RID2IDX(rid);
if (ctxp[idx] & VTD_CTX_PRESENT) {
panic("vtd_add_device: device %x is already owned by "
"domain %d", rid,
(uint16_t)(ctxp[idx + 1] >> 8));
}
/*
* Order is important. The 'present' bit is set only after all fields
* of the context entry are initialized.
*/
ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
if (VTD_ECAP_DI(vtdmap->ext_cap))
ctxp[idx] = VTD_CTX_TT_ALL;
else
ctxp[idx] = 0;
ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
/*
* 'Not Present' entries are not cached in either the Context Cache
* or in the IOTLB, so there is no need to invalidate either of them.
*/
}
static void
vtd_remove_device(void *arg, uint16_t rid)
{
int i, idx;
uint64_t *ctxp;
struct vtdmap *vtdmap;
uint8_t bus;
bus = PCI_RID2BUS(rid);
ctxp = ctx_tables[bus];
idx = VTD_RID2IDX(rid);
/*
* Order is important. The 'present' bit must be cleared first.
*/
ctxp[idx] = 0;
ctxp[idx + 1] = 0;
/*
* Invalidate the Context Cache and the IOTLB.
*
* XXX use device-selective invalidation for Context Cache
* XXX use domain-selective invalidation for IOTLB
*/
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_ctx_global_invalidate(vtdmap);
vtd_iotlb_global_invalidate(vtdmap);
}
}
#define CREATE_MAPPING 0
#define REMOVE_MAPPING 1
static uint64_t
vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len,
int remove)
{
struct domain *dom;
int i, spshift, ptpshift, ptpindex, nlevels;
uint64_t spsize, *ptp;
dom = arg;
ptpindex = 0;
ptpshift = 0;
KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__,
gpa, len));
KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond "
"domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr));
if (gpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa);
if (hpa & PAGE_MASK)
panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa);
if (len & PAGE_MASK)
panic("vtd_create_mapping: unaligned len 0x%0lx", len);
/*
* Compute the size of the mapping that we can accommodate.
*
* This is based on three factors:
* - supported super page size
* - alignment of the region starting at 'gpa' and 'hpa'
* - length of the region 'len'
*/
spshift = 48;
for (i = 3; i >= 0; i--) {
spsize = 1UL << spshift;
if ((dom->spsmask & (1 << i)) != 0 &&
(gpa & (spsize - 1)) == 0 &&
(hpa & (spsize - 1)) == 0 &&
(len >= spsize)) {
break;
}
spshift -= 9;
}
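/*
 * Editorial note: the loop above tries 1 << 48, 1 << 39 (512GB),
 * 1 << 30 (1GB) and 1 << 21 (2MB) in turn; if no super page size fits,
 * it falls through with spshift == 12, i.e. an ordinary 4KB mapping.
 */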
ptp = dom->ptp;
nlevels = dom->pt_levels;
while (--nlevels >= 0) {
ptpshift = 12 + nlevels * 9;
ptpindex = (gpa >> ptpshift) & 0x1FF;
/* We have reached the leaf mapping */
if (spshift >= ptpshift) {
break;
}
/*
* We are working on a non-leaf page table page.
*
* Create a downstream page table page if necessary and point
* to it from the current page table.
*/
if (ptp[ptpindex] == 0) {
void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO);
ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR;
}
ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M);
}
if ((gpa & ((1UL << ptpshift) - 1)) != 0)
panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift);
/*
* Update the 'gpa' -> 'hpa' mapping
*/
if (remove) {
ptp[ptpindex] = 0;
} else {
ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR;
if (nlevels > 0)
ptp[ptpindex] |= VTD_PTE_SUPERPAGE;
}
return (1UL << ptpshift);
}
static uint64_t
vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
{
return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING));
}
static uint64_t
vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
{
return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING));
}
static void
vtd_invalidate_tlb(void *dom)
{
int i;
struct vtdmap *vtdmap;
/*
* Invalidate the IOTLB.
* XXX use domain-selective invalidation for IOTLB
*/
for (i = 0; i < drhd_num; i++) {
vtdmap = vtdmaps[i];
vtd_iotlb_global_invalidate(vtdmap);
}
}
static void *
vtd_create_domain(vm_paddr_t maxaddr)
{
struct domain *dom;
vm_paddr_t addr;
int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth;
struct vtdmap *vtdmap;
if (drhd_num <= 0)
panic("vtd_create_domain: no dma remapping hardware available");
vtdmap = vtdmaps[0];
/*
* Calculate AGAW.
* Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
*/
addr = 0;
for (gaw = 0; addr < maxaddr; gaw++)
addr = 1ULL << gaw;
res = (gaw - 12) % 9;
if (res == 0)
agaw = gaw;
else
agaw = gaw + 9 - res;
if (agaw > 64)
agaw = 64;
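/*
* For example, maxaddr = 4GB gives gaw = 33; rounding up to the next
* value of the form 12 + 9n yields agaw = 39, i.e. a 3-level page
* table.
*/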
/*
* Select the smallest Supported AGAW and the corresponding number
* of page table levels.
*/
pt_levels = 2;
sagaw = 30;
addrwidth = 0;
tmp = VTD_CAP_SAGAW(vtdmap->cap);
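/*
* Bit 'i' of the SAGAW field corresponds to an AGAW of 30 + 9 * i,
* i.e. an (i + 2)-level page table; the loop below tracks this in
* 'sagaw' and 'pt_levels'.
*/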
for (i = 0; i < 5; i++) {
if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
break;
pt_levels++;
addrwidth++;
sagaw += 9;
if (sagaw > 64)
sagaw = 64;
}
if (i >= 5) {
panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
VTD_CAP_SAGAW(vtdmap->cap), agaw);
}
dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
dom->pt_levels = pt_levels;
dom->addrwidth = addrwidth;
dom->id = domain_id();
dom->maxaddr = maxaddr;
dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK);
if ((uintptr_t)dom->ptp & PAGE_MASK)
panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp);
#ifdef notyet
/*
* XXX superpage mappings for the iommu do not work correctly.
*
* By default all physical memory is mapped into the host_domain.
* When a VM is allocated wired memory the pages belonging to it
* are removed from the host_domain and added to the vm's domain.
*
* If the page being removed was mapped using a superpage mapping
* in the host_domain then we need to demote the mapping before
* removing the page.
*
* There is no code to handle the demotion at the moment
* so we disable superpage mappings altogether.
*/
dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
#endif
SLIST_INSERT_HEAD(&domhead, dom, next);
return (dom);
}
static void
vtd_free_ptp(uint64_t *ptp, int level)
{
int i;
uint64_t *nlp;
if (level > 1) {
for (i = 0; i < 512; i++) {
if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0)
continue;
if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0)
continue;
nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M);
vtd_free_ptp(nlp, level - 1);
}
}
bzero(ptp, PAGE_SIZE);
free(ptp, M_VTD);
}
static void
vtd_destroy_domain(void *arg)
{
struct domain *dom;
dom = arg;
SLIST_REMOVE(&domhead, dom, domain, next);
vtd_free_ptp(dom->ptp, dom->pt_levels);
free(dom, M_VTD);
}
struct iommu_ops iommu_ops_intel = {
vtd_init,
vtd_cleanup,
vtd_enable,
vtd_disable,
vtd_create_domain,
vtd_destroy_domain,
vtd_create_mapping,
vtd_remove_mapping,
vtd_add_device,
vtd_remove_device,
vtd_invalidate_tlb,
};

285
vmm/io/iommu.c Normal file
View file

@ -0,0 +1,285 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/sysctl.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/md_var.h>
#include "vmm_util.h"
#include "vmm_mem.h"
#include "iommu.h"
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, iommu, CTLFLAG_RW, 0, "bhyve iommu parameters");
static int iommu_avail;
SYSCTL_INT(_hw_vmm_iommu, OID_AUTO, initialized, CTLFLAG_RD, &iommu_avail,
0, "bhyve iommu initialized?");
static struct iommu_ops *ops;
static void *host_domain;
static __inline int
IOMMU_INIT(void)
{
if (ops != NULL)
return ((*ops->init)());
else
return (ENXIO);
}
static __inline void
IOMMU_CLEANUP(void)
{
if (ops != NULL && iommu_avail)
(*ops->cleanup)();
}
static __inline void *
IOMMU_CREATE_DOMAIN(vm_paddr_t maxaddr)
{
if (ops != NULL && iommu_avail)
return ((*ops->create_domain)(maxaddr));
else
return (NULL);
}
static __inline void
IOMMU_DESTROY_DOMAIN(void *dom)
{
if (ops != NULL && iommu_avail)
(*ops->destroy_domain)(dom);
}
static __inline uint64_t
IOMMU_CREATE_MAPPING(void *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len)
{
if (ops != NULL && iommu_avail)
return ((*ops->create_mapping)(domain, gpa, hpa, len));
else
return (len); /* XXX */
}
static __inline uint64_t
IOMMU_REMOVE_MAPPING(void *domain, vm_paddr_t gpa, uint64_t len)
{
if (ops != NULL && iommu_avail)
return ((*ops->remove_mapping)(domain, gpa, len));
else
return (len); /* XXX */
}
static __inline void
IOMMU_ADD_DEVICE(void *domain, uint16_t rid)
{
if (ops != NULL && iommu_avail)
(*ops->add_device)(domain, rid);
}
static __inline void
IOMMU_REMOVE_DEVICE(void *domain, uint16_t rid)
{
if (ops != NULL && iommu_avail)
(*ops->remove_device)(domain, rid);
}
static __inline void
IOMMU_INVALIDATE_TLB(void *domain)
{
if (ops != NULL && iommu_avail)
(*ops->invalidate_tlb)(domain);
}
static __inline void
IOMMU_ENABLE(void)
{
if (ops != NULL && iommu_avail)
(*ops->enable)();
}
static __inline void
IOMMU_DISABLE(void)
{
if (ops != NULL && iommu_avail)
(*ops->disable)();
}
void
iommu_init(void)
{
int error, bus, slot, func;
vm_paddr_t maxaddr;
const char *name;
device_t dev;
if (vmm_is_intel())
ops = &iommu_ops_intel;
else if (vmm_is_amd())
ops = &iommu_ops_amd;
else
ops = NULL;
error = IOMMU_INIT();
if (error)
return;
iommu_avail = 1;
/*
* Create a domain for the devices owned by the host
*/
maxaddr = vmm_mem_maxaddr();
host_domain = IOMMU_CREATE_DOMAIN(maxaddr);
if (host_domain == NULL)
panic("iommu_init: unable to create a host domain");
/*
* Create 1:1 mappings from '0' to 'maxaddr' for devices assigned to
* the host
*/
iommu_create_mapping(host_domain, 0, 0, maxaddr);
for (bus = 0; bus <= PCI_BUSMAX; bus++) {
for (slot = 0; slot <= PCI_SLOTMAX; slot++) {
for (func = 0; func <= PCI_FUNCMAX; func++) {
dev = pci_find_dbsf(0, bus, slot, func);
if (dev == NULL)
continue;
/* skip passthrough devices */
name = device_get_name(dev);
if (name != NULL && strcmp(name, "ppt") == 0)
continue;
/* everything else belongs to the host domain */
iommu_add_device(host_domain,
pci_get_rid(dev));
}
}
}
IOMMU_ENABLE();
}
void
iommu_cleanup(void)
{
IOMMU_DISABLE();
IOMMU_DESTROY_DOMAIN(host_domain);
IOMMU_CLEANUP();
}
void *
iommu_create_domain(vm_paddr_t maxaddr)
{
return (IOMMU_CREATE_DOMAIN(maxaddr));
}
void
iommu_destroy_domain(void *dom)
{
IOMMU_DESTROY_DOMAIN(dom);
}
void
iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len)
{
uint64_t mapped, remaining;
remaining = len;
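/*
* The ops layer maps the largest chunk it can (possibly a superpage)
* on each call and returns the number of bytes actually mapped, so
* keep iterating until the entire range is covered.
*/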
while (remaining > 0) {
mapped = IOMMU_CREATE_MAPPING(dom, gpa, hpa, remaining);
gpa += mapped;
hpa += mapped;
remaining -= mapped;
}
}
void
iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len)
{
uint64_t unmapped, remaining;
remaining = len;
while (remaining > 0) {
unmapped = IOMMU_REMOVE_MAPPING(dom, gpa, remaining);
gpa += unmapped;
remaining -= unmapped;
}
}
void *
iommu_host_domain(void)
{
return (host_domain);
}
void
iommu_add_device(void *dom, uint16_t rid)
{
IOMMU_ADD_DEVICE(dom, rid);
}
void
iommu_remove_device(void *dom, uint16_t rid)
{
IOMMU_REMOVE_DEVICE(dom, rid);
}
void
iommu_invalidate_tlb(void *domain)
{
IOMMU_INVALIDATE_TLB(domain);
}

75
vmm/io/iommu.h Normal file
View file

@ -0,0 +1,75 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IO_IOMMU_H_
#define _IO_IOMMU_H_
typedef int (*iommu_init_func_t)(void);
typedef void (*iommu_cleanup_func_t)(void);
typedef void (*iommu_enable_func_t)(void);
typedef void (*iommu_disable_func_t)(void);
typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr);
typedef void (*iommu_destroy_domain_t)(void *domain);
typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa,
vm_paddr_t hpa, uint64_t len);
typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa,
uint64_t len);
typedef void (*iommu_add_device_t)(void *domain, uint16_t rid);
typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid);
typedef void (*iommu_invalidate_tlb_t)(void *dom);
struct iommu_ops {
iommu_init_func_t init; /* module wide */
iommu_cleanup_func_t cleanup;
iommu_enable_func_t enable;
iommu_disable_func_t disable;
iommu_create_domain_t create_domain; /* domain-specific */
iommu_destroy_domain_t destroy_domain;
iommu_create_mapping_t create_mapping;
iommu_remove_mapping_t remove_mapping;
iommu_add_device_t add_device;
iommu_remove_device_t remove_device;
iommu_invalidate_tlb_t invalidate_tlb;
};
extern struct iommu_ops iommu_ops_intel;
extern struct iommu_ops iommu_ops_amd;
void iommu_init(void);
void iommu_cleanup(void);
void *iommu_host_domain(void);
void *iommu_create_domain(vm_paddr_t maxaddr);
void iommu_destroy_domain(void *dom);
void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa,
size_t len);
void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len);
void iommu_add_device(void *dom, uint16_t rid);
void iommu_remove_device(void *dom, uint16_t rid);
void iommu_invalidate_tlb(void *domain);
#endif

651
vmm/io/ppt.c Normal file
View file

@ -0,0 +1,651 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/pciio.h>
#include <sys/rman.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>
#include <machine/resource.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmm_lapic.h"
#include "vmm_ktr.h"
#include "iommu.h"
#include "ppt.h"
/* XXX locking */
#define MAX_MSIMSGS 32
/*
* If the MSI-X table is located in the middle of a BAR then that MMIO
* region gets split into two segments - one segment above the MSI-X table
* and the other segment below the MSI-X table - with a hole in place of
* the MSI-X table so accesses to it can be trapped and emulated.
*
* So, allocate an MMIO segment for each BAR register + 1 additional segment.
*/
#define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1)
MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
struct pptintr_arg { /* pptintr(pptintr_arg) */
struct pptdev *pptdev;
uint64_t addr;
uint64_t msg_data;
};
struct pptdev {
device_t dev;
struct vm *vm; /* owner of this device */
TAILQ_ENTRY(pptdev) next;
struct vm_memory_segment mmio[MAX_MMIOSEGS];
struct {
int num_msgs; /* guest state */
int startrid; /* host state */
struct resource *res[MAX_MSIMSGS];
void *cookie[MAX_MSIMSGS];
struct pptintr_arg arg[MAX_MSIMSGS];
} msi;
struct {
int num_msgs;
int startrid;
int msix_table_rid;
struct resource *msix_table_res;
struct resource **res;
void **cookie;
struct pptintr_arg *arg;
} msix;
};
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
static int num_pptdevs;
SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
"number of pci passthru devices");
static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
static int
ppt_probe(device_t dev)
{
int bus, slot, func;
struct pci_devinfo *dinfo;
dinfo = (struct pci_devinfo *)device_get_ivars(dev);
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
/*
* To qualify as a pci passthrough device a device must:
* - be allowed by administrator to be used in this role
* - be an endpoint device
*/
if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
return (ENXIO);
else if (vmm_is_pptdev(bus, slot, func))
return (0);
else
/*
* Returning BUS_PROBE_NOWILDCARD here matches devices that the
* SR-IOV infrastructure specified as "ppt" passthrough devices.
* All normal devices that did not have "ppt" specified as their
* driver will not be matched by this.
*/
return (BUS_PROBE_NOWILDCARD);
}
static int
ppt_attach(device_t dev)
{
struct pptdev *ppt;
ppt = device_get_softc(dev);
num_pptdevs++;
TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
ppt->dev = dev;
if (bootverbose)
device_printf(dev, "attached\n");
return (0);
}
static int
ppt_detach(device_t dev)
{
struct pptdev *ppt;
ppt = device_get_softc(dev);
if (ppt->vm != NULL)
return (EBUSY);
num_pptdevs--;
TAILQ_REMOVE(&pptdev_list, ppt, next);
return (0);
}
static device_method_t ppt_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, ppt_probe),
DEVMETHOD(device_attach, ppt_attach),
DEVMETHOD(device_detach, ppt_detach),
{0, 0}
};
static devclass_t ppt_devclass;
DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
static struct pptdev *
ppt_find(int bus, int slot, int func)
{
device_t dev;
struct pptdev *ppt;
int b, s, f;
TAILQ_FOREACH(ppt, &pptdev_list, next) {
dev = ppt->dev;
b = pci_get_bus(dev);
s = pci_get_slot(dev);
f = pci_get_function(dev);
if (bus == b && slot == s && func == f)
return (ppt);
}
return (NULL);
}
static void
ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
{
int i;
struct vm_memory_segment *seg;
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0)
continue;
(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
bzero(seg, sizeof(struct vm_memory_segment));
}
}
static void
ppt_teardown_msi(struct pptdev *ppt)
{
int i, rid;
void *cookie;
struct resource *res;
if (ppt->msi.num_msgs == 0)
return;
for (i = 0; i < ppt->msi.num_msgs; i++) {
rid = ppt->msi.startrid + i;
res = ppt->msi.res[i];
cookie = ppt->msi.cookie[i];
if (cookie != NULL)
bus_teardown_intr(ppt->dev, res, cookie);
if (res != NULL)
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
ppt->msi.res[i] = NULL;
ppt->msi.cookie[i] = NULL;
}
if (ppt->msi.startrid == 1)
pci_release_msi(ppt->dev);
ppt->msi.num_msgs = 0;
}
static void
ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
{
int rid;
struct resource *res;
void *cookie;
rid = ppt->msix.startrid + idx;
res = ppt->msix.res[idx];
cookie = ppt->msix.cookie[idx];
if (cookie != NULL)
bus_teardown_intr(ppt->dev, res, cookie);
if (res != NULL)
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
ppt->msix.res[idx] = NULL;
ppt->msix.cookie[idx] = NULL;
}
static void
ppt_teardown_msix(struct pptdev *ppt)
{
int i;
if (ppt->msix.num_msgs == 0)
return;
for (i = 0; i < ppt->msix.num_msgs; i++)
ppt_teardown_msix_intr(ppt, i);
if (ppt->msix.msix_table_res) {
bus_release_resource(ppt->dev, SYS_RES_MEMORY,
ppt->msix.msix_table_rid,
ppt->msix.msix_table_res);
ppt->msix.msix_table_res = NULL;
ppt->msix.msix_table_rid = 0;
}
free(ppt->msix.res, M_PPTMSIX);
free(ppt->msix.cookie, M_PPTMSIX);
free(ppt->msix.arg, M_PPTMSIX);
pci_release_msi(ppt->dev);
ppt->msix.num_msgs = 0;
}
int
ppt_avail_devices(void)
{
return (num_pptdevs);
}
int
ppt_assigned_devices(struct vm *vm)
{
struct pptdev *ppt;
int num;
num = 0;
TAILQ_FOREACH(ppt, &pptdev_list, next) {
if (ppt->vm == vm)
num++;
}
return (num);
}
boolean_t
ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
{
int i;
struct pptdev *ppt;
struct vm_memory_segment *seg;
TAILQ_FOREACH(ppt, &pptdev_list, next) {
if (ppt->vm != vm)
continue;
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0)
continue;
if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
return (TRUE);
}
}
return (FALSE);
}
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
struct pptdev *ppt;
ppt = ppt_find(bus, slot, func);
if (ppt != NULL) {
/*
* If this device is owned by a different VM then we
* cannot change its owner.
*/
if (ppt->vm != NULL && ppt->vm != vm)
return (EBUSY);
ppt->vm = vm;
iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
return (0);
}
return (ENOENT);
}
int
ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
{
struct pptdev *ppt;
ppt = ppt_find(bus, slot, func);
if (ppt != NULL) {
/*
* If this device is not owned by this 'vm' then bail out.
*/
if (ppt->vm != vm)
return (EBUSY);
ppt_unmap_mmio(vm, ppt);
ppt_teardown_msi(ppt);
ppt_teardown_msix(ppt);
iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
ppt->vm = NULL;
return (0);
}
return (ENOENT);
}
int
ppt_unassign_all(struct vm *vm)
{
struct pptdev *ppt;
int bus, slot, func;
device_t dev;
TAILQ_FOREACH(ppt, &pptdev_list, next) {
if (ppt->vm == vm) {
dev = ppt->dev;
bus = pci_get_bus(dev);
slot = pci_get_slot(dev);
func = pci_get_function(dev);
vm_unassign_pptdev(vm, bus, slot, func);
}
}
return (0);
}
int
ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
int i, error;
struct vm_memory_segment *seg;
struct pptdev *ppt;
ppt = ppt_find(bus, slot, func);
if (ppt != NULL) {
if (ppt->vm != vm)
return (EBUSY);
for (i = 0; i < MAX_MMIOSEGS; i++) {
seg = &ppt->mmio[i];
if (seg->len == 0) {
error = vm_map_mmio(vm, gpa, len, hpa);
if (error == 0) {
seg->gpa = gpa;
seg->len = len;
}
return (error);
}
}
return (ENOSPC);
}
return (ENOENT);
}
static int
pptintr(void *arg)
{
struct pptdev *ppt;
struct pptintr_arg *pptarg;
pptarg = arg;
ppt = pptarg->pptdev;
if (ppt->vm != NULL)
lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
else {
/*
* XXX
* This is not expected to happen - panic?
*/
}
/*
* For legacy interrupts give other filters a chance in case
* the interrupt was not generated by the passthrough device.
*/
if (ppt->msi.startrid == 0)
return (FILTER_STRAY);
else
return (FILTER_HANDLED);
}
int
ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
uint64_t addr, uint64_t msg, int numvec)
{
int i, rid, flags;
int msi_count, startrid, error, tmp;
struct pptdev *ppt;
if (numvec < 0 || numvec > MAX_MSIMSGS)
return (EINVAL);
ppt = ppt_find(bus, slot, func);
if (ppt == NULL)
return (ENOENT);
if (ppt->vm != vm) /* Make sure we own this device */
return (EBUSY);
/* Free any allocated resources */
ppt_teardown_msi(ppt);
if (numvec == 0) /* nothing more to do */
return (0);
flags = RF_ACTIVE;
msi_count = pci_msi_count(ppt->dev);
if (msi_count == 0) {
startrid = 0; /* legacy interrupt */
msi_count = 1;
flags |= RF_SHAREABLE;
} else
startrid = 1; /* MSI */
/*
* The device must be capable of supporting the number of vectors
* the guest wants to allocate.
*/
if (numvec > msi_count)
return (EINVAL);
/*
* Make sure that we can allocate all the MSI vectors that are needed
* by the guest.
*/
if (startrid == 1) {
tmp = numvec;
error = pci_alloc_msi(ppt->dev, &tmp);
if (error)
return (error);
else if (tmp != numvec) {
pci_release_msi(ppt->dev);
return (ENOSPC);
} else {
/* success */
}
}
ppt->msi.startrid = startrid;
/*
* Allocate the irq resource and attach it to the interrupt handler.
*/
for (i = 0; i < numvec; i++) {
ppt->msi.num_msgs = i + 1;
ppt->msi.cookie[i] = NULL;
rid = startrid + i;
ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
&rid, flags);
if (ppt->msi.res[i] == NULL)
break;
ppt->msi.arg[i].pptdev = ppt;
ppt->msi.arg[i].addr = addr;
ppt->msi.arg[i].msg_data = msg + i;
error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
INTR_TYPE_NET | INTR_MPSAFE,
pptintr, NULL, &ppt->msi.arg[i],
&ppt->msi.cookie[i]);
if (error != 0)
break;
}
if (i < numvec) {
ppt_teardown_msi(ppt);
return (ENXIO);
}
return (0);
}
int
ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
{
struct pptdev *ppt;
struct pci_devinfo *dinfo;
int numvec, alloced, rid, error;
size_t res_size, cookie_size, arg_size;
ppt = ppt_find(bus, slot, func);
if (ppt == NULL)
return (ENOENT);
if (ppt->vm != vm) /* Make sure we own this device */
return (EBUSY);
dinfo = device_get_ivars(ppt->dev);
if (!dinfo)
return (ENXIO);
/*
* First-time configuration:
* Allocate the MSI-X table
* Allocate the IRQ resources
* Set up some variables in ppt->msix
*/
if (ppt->msix.num_msgs == 0) {
numvec = pci_msix_count(ppt->dev);
if (numvec <= 0)
return (EINVAL);
ppt->msix.startrid = 1;
ppt->msix.num_msgs = numvec;
res_size = numvec * sizeof(ppt->msix.res[0]);
cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
arg_size = numvec * sizeof(ppt->msix.arg[0]);
ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
M_WAITOK | M_ZERO);
ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
rid = dinfo->cfg.msix.msix_table_bar;
ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
SYS_RES_MEMORY, &rid, RF_ACTIVE);
if (ppt->msix.msix_table_res == NULL) {
ppt_teardown_msix(ppt);
return (ENOSPC);
}
ppt->msix.msix_table_rid = rid;
alloced = numvec;
error = pci_alloc_msix(ppt->dev, &alloced);
if (error || alloced != numvec) {
ppt_teardown_msix(ppt);
return (error == 0 ? ENOSPC : error);
}
}
if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
/* Tear down the IRQ if it's already set up */
ppt_teardown_msix_intr(ppt, idx);
/* Allocate the IRQ resource */
ppt->msix.cookie[idx] = NULL;
rid = ppt->msix.startrid + idx;
ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
&rid, RF_ACTIVE);
if (ppt->msix.res[idx] == NULL)
return (ENXIO);
ppt->msix.arg[idx].pptdev = ppt;
ppt->msix.arg[idx].addr = addr;
ppt->msix.arg[idx].msg_data = msg;
/* Setup the MSI-X interrupt */
error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
INTR_TYPE_NET | INTR_MPSAFE,
pptintr, NULL, &ppt->msix.arg[idx],
&ppt->msix.cookie[idx]);
if (error != 0) {
bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
ppt->msix.cookie[idx] = NULL;
ppt->msix.res[idx] = NULL;
return (ENXIO);
}
} else {
/* Masked, tear it down if it's already been set up */
ppt_teardown_msix_intr(ppt, idx);
}
return (0);
}

54
vmm/io/ppt.h Normal file
View file

@ -0,0 +1,54 @@
/*-
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IO_PPT_H_
#define _IO_PPT_H_
int ppt_unassign_all(struct vm *vm);
int ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
uint64_t addr, uint64_t msg, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
int idx, uint64_t addr, uint64_t msg, uint32_t vector_control);
int ppt_assigned_devices(struct vm *vm);
boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
/*
* Returns the number of devices sequestered by the ppt driver for assignment
* to virtual machines.
*/
int ppt_avail_devices(void);
/*
* The following functions should never be called directly.
* Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead.
*/
int ppt_assign_device(struct vm *vm, int bus, int slot, int func);
int ppt_unassign_device(struct vm *vm, int bus, int slot, int func);
#endif

808
vmm/io/vatpic.c Normal file
View file

@ -0,0 +1,808 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <x86/apicreg.h>
#include <dev/ic/i8259.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_lapic.h"
#include "vioapic.h"
#include "vatpic.h"
static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)");
#define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx))
#define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx))
#define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx))
enum irqstate {
IRQSTATE_ASSERT,
IRQSTATE_DEASSERT,
IRQSTATE_PULSE
};
struct atpic {
bool ready;
int icw_num;
int rd_cmd_reg;
bool aeoi;
bool poll;
bool rotate;
bool sfn; /* special fully-nested mode */
int irq_base;
uint8_t request; /* Interrupt Request Register (IRR) */
uint8_t service; /* Interrupt Service Register (ISR) */
uint8_t mask; /* Interrupt Mask Register (IMR) */
uint8_t smm; /* special mask mode */
int acnt[8]; /* sum of pin asserts and deasserts */
int lowprio; /* lowest priority irq */
bool intr_raised;
};
struct vatpic {
struct vm *vm;
struct mtx mtx;
struct atpic atpic[2];
uint8_t elc[2];
};
#define VATPIC_CTR0(vatpic, fmt) \
VM_CTR0((vatpic)->vm, fmt)
#define VATPIC_CTR1(vatpic, fmt, a1) \
VM_CTR1((vatpic)->vm, fmt, a1)
#define VATPIC_CTR2(vatpic, fmt, a1, a2) \
VM_CTR2((vatpic)->vm, fmt, a1, a2)
#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \
VM_CTR3((vatpic)->vm, fmt, a1, a2, a3)
#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \
VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4)
/*
* Loop over all the pins in priority order from highest to lowest.
*/
#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \
for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \
tmpvar < 8; \
tmpvar++, pinvar = (pinvar + 1) & 0x7)
static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate);
static __inline bool
master_atpic(struct vatpic *vatpic, struct atpic *atpic)
{
if (atpic == &vatpic->atpic[0])
return (true);
else
return (false);
}
static __inline int
vatpic_get_highest_isrpin(struct atpic *atpic)
{
int bit, pin;
int i;
ATPIC_PIN_FOREACH(pin, atpic, i) {
bit = (1 << pin);
if (atpic->service & bit) {
/*
* An IS bit that is masked by an IMR bit will not be
* cleared by a non-specific EOI in Special Mask Mode.
*/
if (atpic->smm && (atpic->mask & bit) != 0)
continue;
else
return (pin);
}
}
return (-1);
}
static __inline int
vatpic_get_highest_irrpin(struct atpic *atpic)
{
int serviced;
int bit, pin, tmp;
/*
* In 'Special Fully-Nested Mode' when an interrupt request from
* a slave is in service, the slave is not locked out from the
* master's priority logic.
*/
serviced = atpic->service;
if (atpic->sfn)
serviced &= ~(1 << 2);
/*
* In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits
* further interrupts at that level and enables interrupts from all
* other levels that are not masked. In other words the ISR has no
* bearing on the levels that can generate interrupts.
*/
if (atpic->smm)
serviced = 0;
ATPIC_PIN_FOREACH(pin, atpic, tmp) {
bit = 1 << pin;
/*
* If there is already an interrupt in service at the same
* or higher priority then bail.
*/
if ((serviced & bit) != 0)
break;
/*
* If an interrupt is asserted and not masked then return
* the corresponding 'pin' to the caller.
*/
if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0)
return (pin);
}
return (-1);
}
static void
vatpic_notify_intr(struct vatpic *vatpic)
{
struct atpic *atpic;
int pin;
KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked"));
/*
* First check the slave.
*/
atpic = &vatpic->atpic[1];
if (!atpic->intr_raised &&
(pin = vatpic_get_highest_irrpin(atpic)) != -1) {
VATPIC_CTR4(vatpic, "atpic slave notify pin = %d "
"(imr 0x%x irr 0x%x isr 0x%x)", pin,
atpic->mask, atpic->request, atpic->service);
/*
* Cascade the request from the slave to the master.
*/
atpic->intr_raised = true;
vatpic_set_pinstate(vatpic, 2, true);
vatpic_set_pinstate(vatpic, 2, false);
} else {
VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts "
"(imr 0x%x irr 0x%x isr 0x%x)",
atpic->mask, atpic->request, atpic->service);
}
/*
* Then check the master.
*/
atpic = &vatpic->atpic[0];
if (!atpic->intr_raised &&
(pin = vatpic_get_highest_irrpin(atpic)) != -1) {
VATPIC_CTR4(vatpic, "atpic master notify pin = %d "
"(imr 0x%x irr 0x%x isr 0x%x)", pin,
atpic->mask, atpic->request, atpic->service);
/*
* From Section 3.6.2, "Interrupt Modes", in the
* MPtable Specification, Version 1.4
*
* PIC interrupts are routed to both the Local APIC
* and the I/O APIC to support operation in 1 of 3
* modes.
*
* 1. Legacy PIC Mode: the PIC effectively bypasses
* all APIC components. In this mode the local APIC is
* disabled and LINT0 is reconfigured as INTR to
* deliver the PIC interrupt directly to the CPU.
*
* 2. Virtual Wire Mode: the APIC is treated as a
* virtual wire which delivers interrupts from the PIC
* to the CPU. In this mode LINT0 is programmed as
* ExtINT to indicate that the PIC is the source of
* the interrupt.
*
* 3. Virtual Wire Mode via I/O APIC: PIC interrupts are
* fielded by the I/O APIC and delivered to the appropriate
* CPU. In this mode the I/O APIC input 0 is programmed
* as ExtINT to indicate that the PIC is the source of the
* interrupt.
*/
atpic->intr_raised = true;
lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0);
vioapic_pulse_irq(vatpic->vm, 0);
} else {
VATPIC_CTR3(vatpic, "atpic master no eligible interrupts "
"(imr 0x%x irr 0x%x isr 0x%x)",
atpic->mask, atpic->request, atpic->service);
}
}
static int
vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
{
VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val);
atpic->ready = false;
atpic->icw_num = 1;
atpic->request = 0;
atpic->mask = 0;
atpic->lowprio = 7;
atpic->rd_cmd_reg = 0;
atpic->poll = 0;
atpic->smm = 0;
if ((val & ICW1_SNGL) != 0) {
VATPIC_CTR0(vatpic, "vatpic cascade mode required");
return (-1);
}
if ((val & ICW1_IC4) == 0) {
VATPIC_CTR0(vatpic, "vatpic icw4 required");
return (-1);
}
atpic->icw_num++;
return (0);
}
static int
vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
{
VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val);
atpic->irq_base = val & 0xf8;
atpic->icw_num++;
return (0);
}
static int
vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
{
VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val);
atpic->icw_num++;
return (0);
}
static int
vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
{
VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val);
if ((val & ICW4_8086) == 0) {
VATPIC_CTR0(vatpic, "vatpic microprocessor mode required");
return (-1);
}
if ((val & ICW4_AEOI) != 0)
atpic->aeoi = true;
if ((val & ICW4_SFNM) != 0) {
if (master_atpic(vatpic, atpic)) {
atpic->sfn = true;
} else {
VATPIC_CTR1(vatpic, "Ignoring special fully nested "
"mode on slave atpic: %#x", val);
}
}
atpic->icw_num = 0;
atpic->ready = true;
return (0);
}
static int
vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
{
VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val);
atpic->mask = val & 0xff;
return (0);
}
static int
vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
{
VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val);
atpic->rotate = ((val & OCW2_R) != 0);
if ((val & OCW2_EOI) != 0) {
int isr_bit;
if ((val & OCW2_SL) != 0) {
/* specific EOI */
isr_bit = val & 0x7;
} else {
/* non-specific EOI */
isr_bit = vatpic_get_highest_isrpin(atpic);
}
if (isr_bit != -1) {
atpic->service &= ~(1 << isr_bit);
if (atpic->rotate)
atpic->lowprio = isr_bit;
}
} else if ((val & OCW2_SL) != 0 && atpic->rotate == true) {
/* specific priority */
atpic->lowprio = val & 0x7;
}
return (0);
}
static int
vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
{
VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val);
if (val & OCW3_ESMM) {
atpic->smm = val & OCW3_SMM ? 1 : 0;
VATPIC_CTR2(vatpic, "%s atpic special mask mode %s",
master_atpic(vatpic, atpic) ? "master" : "slave",
atpic->smm ? "enabled" : "disabled");
}
if (val & OCW3_RR) {
/* read register command */
atpic->rd_cmd_reg = val & OCW3_RIS;
/* Polling mode */
atpic->poll = ((val & OCW3_P) != 0);
}
return (0);
}
static void
vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate)
{
struct atpic *atpic;
int oldcnt, newcnt;
bool level;
KASSERT(pin >= 0 && pin < 16,
("vatpic_set_pinstate: invalid pin number %d", pin));
KASSERT(VATPIC_LOCKED(vatpic),
("vatpic_set_pinstate: vatpic is not locked"));
atpic = &vatpic->atpic[pin >> 3];
oldcnt = atpic->acnt[pin & 0x7];
if (newstate)
atpic->acnt[pin & 0x7]++;
else
atpic->acnt[pin & 0x7]--;
newcnt = atpic->acnt[pin & 0x7];
if (newcnt < 0) {
VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt);
}
level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0);
if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) {
/* rising edge or level */
VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin);
atpic->request |= (1 << (pin & 0x7));
} else if (oldcnt == 1 && newcnt == 0) {
/* falling edge */
VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin);
if (level)
atpic->request &= ~(1 << (pin & 0x7));
} else {
VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d",
pin, newstate ? "asserted" : "deasserted", newcnt);
}
vatpic_notify_intr(vatpic);
}
static int
vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate)
{
struct vatpic *vatpic;
struct atpic *atpic;
if (irq < 0 || irq > 15)
return (EINVAL);
vatpic = vm_atpic(vm);
atpic = &vatpic->atpic[irq >> 3];
if (atpic->ready == false)
return (0);
VATPIC_LOCK(vatpic);
switch (irqstate) {
case IRQSTATE_ASSERT:
vatpic_set_pinstate(vatpic, irq, true);
break;
case IRQSTATE_DEASSERT:
vatpic_set_pinstate(vatpic, irq, false);
break;
case IRQSTATE_PULSE:
vatpic_set_pinstate(vatpic, irq, true);
vatpic_set_pinstate(vatpic, irq, false);
break;
default:
panic("vatpic_set_irqstate: invalid irqstate %d", irqstate);
}
VATPIC_UNLOCK(vatpic);
return (0);
}
int
vatpic_assert_irq(struct vm *vm, int irq)
{
return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT));
}
int
vatpic_deassert_irq(struct vm *vm, int irq)
{
return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT));
}
int
vatpic_pulse_irq(struct vm *vm, int irq)
{
return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE));
}
int
vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger)
{
struct vatpic *vatpic;
if (irq < 0 || irq > 15)
return (EINVAL);
/*
* See comment in vatpic_elc_handler. These IRQs must be
* edge triggered.
*/
if (trigger == LEVEL_TRIGGER) {
switch (irq) {
case 0:
case 1:
case 2:
case 8:
case 13:
return (EINVAL);
}
}
vatpic = vm_atpic(vm);
VATPIC_LOCK(vatpic);
if (trigger == LEVEL_TRIGGER)
vatpic->elc[irq >> 3] |= 1 << (irq & 0x7);
else
vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7));
VATPIC_UNLOCK(vatpic);
return (0);
}
void
vatpic_pending_intr(struct vm *vm, int *vecptr)
{
struct vatpic *vatpic;
struct atpic *atpic;
int pin;
vatpic = vm_atpic(vm);
atpic = &vatpic->atpic[0];
VATPIC_LOCK(vatpic);
pin = vatpic_get_highest_irrpin(atpic);
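/*
* Pin 2 on the master is the cascade input from the slave, so in that
* case the real vector must be fetched from the slave.
*/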
if (pin == 2) {
atpic = &vatpic->atpic[1];
pin = vatpic_get_highest_irrpin(atpic);
}
/*
* If there are no pins active at this moment then return the spurious
* interrupt vector instead.
*/
if (pin == -1)
pin = 7;
KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin));
*vecptr = atpic->irq_base + pin;
VATPIC_UNLOCK(vatpic);
}
static void
vatpic_pin_accepted(struct atpic *atpic, int pin)
{
atpic->intr_raised = false;
if (atpic->acnt[pin] == 0)
atpic->request &= ~(1 << pin);
if (atpic->aeoi == true) {
if (atpic->rotate == true)
atpic->lowprio = pin;
} else {
atpic->service |= (1 << pin);
}
}
void
vatpic_intr_accepted(struct vm *vm, int vector)
{
struct vatpic *vatpic;
int pin;
vatpic = vm_atpic(vm);
VATPIC_LOCK(vatpic);
pin = vector & 0x7;
if ((vector & ~0x7) == vatpic->atpic[1].irq_base) {
vatpic_pin_accepted(&vatpic->atpic[1], pin);
/*
* If this vector originated from the slave,
* accept the cascaded interrupt too.
*/
vatpic_pin_accepted(&vatpic->atpic[0], 2);
} else {
vatpic_pin_accepted(&vatpic->atpic[0], pin);
}
vatpic_notify_intr(vatpic);
VATPIC_UNLOCK(vatpic);
}
static int
vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port,
int bytes, uint32_t *eax)
{
int pin;
VATPIC_LOCK(vatpic);
if (atpic->poll) {
atpic->poll = 0;
pin = vatpic_get_highest_irrpin(atpic);
if (pin >= 0) {
vatpic_pin_accepted(atpic, pin);
*eax = 0x80 | pin;
} else {
*eax = 0;
}
} else {
if (port & ICU_IMR_OFFSET) {
/* read interrupt mask register */
*eax = atpic->mask;
} else {
if (atpic->rd_cmd_reg == OCW3_RIS) {
/* read interrupt service register */
*eax = atpic->service;
} else {
/* read interrupt request register */
*eax = atpic->request;
}
}
}
VATPIC_UNLOCK(vatpic);
return (0);
}
static int
vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port,
int bytes, uint32_t *eax)
{
int error;
uint8_t val;
error = 0;
val = *eax;
VATPIC_LOCK(vatpic);
if (port & ICU_IMR_OFFSET) {
switch (atpic->icw_num) {
case 2:
error = vatpic_icw2(vatpic, atpic, val);
break;
case 3:
error = vatpic_icw3(vatpic, atpic, val);
break;
case 4:
error = vatpic_icw4(vatpic, atpic, val);
break;
default:
error = vatpic_ocw1(vatpic, atpic, val);
break;
}
} else {
if (val & (1 << 4))
error = vatpic_icw1(vatpic, atpic, val);
if (atpic->ready) {
if (val & (1 << 3))
error = vatpic_ocw3(vatpic, atpic, val);
else
error = vatpic_ocw2(vatpic, atpic, val);
}
}
if (atpic->ready)
vatpic_notify_intr(vatpic);
VATPIC_UNLOCK(vatpic);
return (error);
}
int
vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax)
{
struct vatpic *vatpic;
struct atpic *atpic;
vatpic = vm_atpic(vm);
atpic = &vatpic->atpic[0];
if (bytes != 1)
return (-1);
if (in) {
return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
}
return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
}
int
vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax)
{
struct vatpic *vatpic;
struct atpic *atpic;
vatpic = vm_atpic(vm);
atpic = &vatpic->atpic[1];
if (bytes != 1)
return (-1);
if (in) {
return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
}
return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
}
int
vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax)
{
struct vatpic *vatpic;
bool is_master;
vatpic = vm_atpic(vm);
is_master = (port == IO_ELCR1);
if (bytes != 1)
return (-1);
VATPIC_LOCK(vatpic);
if (in) {
if (is_master)
*eax = vatpic->elc[0];
else
*eax = vatpic->elc[1];
} else {
/*
* For the master PIC the cascade channel (IRQ2), the
* heartbeat timer (IRQ0), and the keyboard
* controller (IRQ1) cannot be programmed for level
* mode.
*
* For the slave PIC the real time clock (IRQ8) and
* the floating point error interrupt (IRQ13) cannot
* be programmed for level mode.
*/
if (is_master)
vatpic->elc[0] = (*eax & 0xf8);
else
vatpic->elc[1] = (*eax & 0xde);
}
VATPIC_UNLOCK(vatpic);
return (0);
}
struct vatpic *
vatpic_init(struct vm *vm)
{
struct vatpic *vatpic;
vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO);
vatpic->vm = vm;
mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN);
return (vatpic);
}
void
vatpic_cleanup(struct vatpic *vatpic)
{
free(vatpic, M_VATPIC);
}

57
vmm/io/vatpic.h Normal file
View file

@ -0,0 +1,57 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VATPIC_H_
#define _VATPIC_H_
#include <isa/isareg.h>
#define ICU_IMR_OFFSET 1
#define IO_ELCR1 0x4d0
#define IO_ELCR2 0x4d1
struct vatpic *vatpic_init(struct vm *vm);
void vatpic_cleanup(struct vatpic *vatpic);
int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port,
int bytes, uint32_t *eax);
int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port,
int bytes, uint32_t *eax);
int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax);
int vatpic_assert_irq(struct vm *vm, int irq);
int vatpic_deassert_irq(struct vm *vm, int irq);
int vatpic_pulse_irq(struct vm *vm, int irq);
int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger);
void vatpic_pending_intr(struct vm *vm, int *vecptr);
void vatpic_intr_accepted(struct vm *vm, int vector);
#endif /* _VATPIC_H_ */

457
vmm/io/vatpit.c Normal file
View file

@ -0,0 +1,457 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vatpic.h"
#include "vioapic.h"
#include "vatpit.h"
static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)");
#define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx))
#define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx))
#define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx))
#define TIMER_SEL_MASK 0xc0
#define TIMER_RW_MASK 0x30
#define TIMER_MODE_MASK 0x0f
#define TIMER_SEL_READBACK 0xc0
#define TIMER_STS_OUT 0x80
#define TIMER_STS_NULLCNT 0x40
#define TIMER_RB_LCTR 0x20
#define TIMER_RB_LSTATUS 0x10
#define TIMER_RB_CTR_2 0x08
#define TIMER_RB_CTR_1 0x04
#define TIMER_RB_CTR_0 0x02
#define TMR2_OUT_STS 0x20
#define PIT_8254_FREQ 1193182
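/* Divide 'freq' by 'hz', rounding to the nearest integer. */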
#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
struct vatpit_callout_arg {
struct vatpit *vatpit;
int channel_num;
};
struct channel {
int mode;
uint16_t initial; /* initial counter value */
sbintime_t now_sbt; /* uptime when counter was loaded */
uint8_t cr[2];
uint8_t ol[2];
bool slatched; /* status latched */
uint8_t status;
int crbyte;
int olbyte;
int frbyte;
struct callout callout;
sbintime_t callout_sbt; /* target time */
struct vatpit_callout_arg callout_arg;
};
struct vatpit {
struct vm *vm;
struct mtx mtx;
sbintime_t freq_sbt;
struct channel channel[3];
};
static void pit_timer_start_cntr0(struct vatpit *vatpit);
static int
vatpit_get_out(struct vatpit *vatpit, int channel)
{
struct channel *c;
sbintime_t delta_ticks;
int out;
c = &vatpit->channel[channel];
switch (c->mode) {
case TIMER_INTTC:
delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt;
out = ((c->initial - delta_ticks) <= 0);
break;
default:
out = 0;
break;
}
return (out);
}
static void
vatpit_callout_handler(void *a)
{
struct vatpit_callout_arg *arg = a;
struct vatpit *vatpit;
struct callout *callout;
struct channel *c;
vatpit = arg->vatpit;
c = &vatpit->channel[arg->channel_num];
callout = &c->callout;
VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num);
VATPIT_LOCK(vatpit);
if (callout_pending(callout)) /* callout was reset */
goto done;
if (!callout_active(callout)) /* callout was stopped */
goto done;
callout_deactivate(callout);
if (c->mode == TIMER_RATEGEN) {
pit_timer_start_cntr0(vatpit);
}
vatpic_pulse_irq(vatpit->vm, 0);
vioapic_pulse_irq(vatpit->vm, 2);
done:
VATPIT_UNLOCK(vatpit);
return;
}
static void
pit_timer_start_cntr0(struct vatpit *vatpit)
{
struct channel *c;
sbintime_t now, delta, precision;
c = &vatpit->channel[0];
if (c->initial != 0) {
delta = c->initial * vatpit->freq_sbt;
precision = delta >> tc_precexp;
c->callout_sbt = c->callout_sbt + delta;
/*
* Reset 'callout_sbt' if the time that the callout
* was supposed to fire is more than 'c->initial'
* ticks in the past.
*/
now = sbinuptime();
if (c->callout_sbt < now)
c->callout_sbt = now + delta;
callout_reset_sbt(&c->callout, c->callout_sbt,
precision, vatpit_callout_handler, &c->callout_arg,
C_ABSOLUTE);
}
}
static uint16_t
pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch)
{
uint16_t lval;
sbintime_t delta_ticks;
/* cannot latch a new value until the old one has been consumed */
if (latch && c->olbyte != 0)
return (0);
if (c->initial == 0) {
/*
* This is possibly an o/s bug - reading the value of
* the timer without having set up the initial value.
*
* The original user-space version of this code set
* the timer to 100 Hz in this condition; do the same
* here.
*/
c->initial = TIMER_DIV(PIT_8254_FREQ, 100);
c->now_sbt = sbinuptime();
c->status &= ~TIMER_STS_NULLCNT;
}
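/*
* The current count is the programmed initial value minus the number
* of input-clock ticks elapsed since it was loaded, taken modulo the
* initial value since the counter is treated as continuously
* reloading.
*/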
delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt;
lval = c->initial - delta_ticks % c->initial;
if (latch) {
c->olbyte = 2;
c->ol[1] = lval; /* LSB */
c->ol[0] = lval >> 8; /* MSB */
}
return (lval);
}
static int
pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd)
{
struct channel *c;
c = &vatpit->channel[channel];
/*
* Latch the count/status of the timer if not already latched.
* N.B. that the count/status latch-select bits are active-low.
*/
if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) {
(void) pit_update_counter(vatpit, c, true);
}
if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) {
c->slatched = true;
/*
* For mode 0, see if the elapsed time is greater
* than the initial value - this results in the
* output pin being set to 1 in the status byte.
*/
if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel))
c->status |= TIMER_STS_OUT;
else
c->status &= ~TIMER_STS_OUT;
}
return (0);
}
static int
pit_readback(struct vatpit *vatpit, uint8_t cmd)
{
int error;
/*
* The readback command can apply to all timers.
*/
error = 0;
if (cmd & TIMER_RB_CTR_0)
error = pit_readback1(vatpit, 0, cmd);
if (!error && cmd & TIMER_RB_CTR_1)
error = pit_readback1(vatpit, 1, cmd);
if (!error && cmd & TIMER_RB_CTR_2)
error = pit_readback1(vatpit, 2, cmd);
return (error);
}
static int
vatpit_update_mode(struct vatpit *vatpit, uint8_t val)
{
struct channel *c;
int sel, rw, mode;
sel = val & TIMER_SEL_MASK;
rw = val & TIMER_RW_MASK;
mode = val & TIMER_MODE_MASK;
if (sel == TIMER_SEL_READBACK)
return (pit_readback(vatpit, val));
if (rw != TIMER_LATCH && rw != TIMER_16BIT)
return (-1);
if (rw != TIMER_LATCH) {
/*
* Counter mode is not affected when issuing a
* latch command.
*/
if (mode != TIMER_INTTC &&
mode != TIMER_RATEGEN &&
mode != TIMER_SQWAVE &&
mode != TIMER_SWSTROBE)
return (-1);
}
c = &vatpit->channel[sel >> 6];
if (rw == TIMER_LATCH)
pit_update_counter(vatpit, c, true);
else {
c->mode = mode;
c->olbyte = 0; /* reset latch after reprogramming */
c->status |= TIMER_STS_NULLCNT;
}
return (0);
}
int
vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax)
{
struct vatpit *vatpit;
struct channel *c;
uint8_t val;
int error;
vatpit = vm_atpit(vm);
if (bytes != 1)
return (-1);
val = *eax;
if (port == TIMER_MODE) {
if (in) {
VM_CTR0(vatpit->vm, "vatpit attempt to read mode");
return (-1);
}
VATPIT_LOCK(vatpit);
error = vatpit_update_mode(vatpit, val);
VATPIT_UNLOCK(vatpit);
return (error);
}
/* counter ports */
KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2,
("invalid port 0x%x", port));
c = &vatpit->channel[port - TIMER_CNTR0];
VATPIT_LOCK(vatpit);
if (in && c->slatched) {
/*
* Return the status byte if latched
*/
*eax = c->status;
c->slatched = false;
c->status = 0;
} else if (in) {
/*
* The spec says that once the output latch is completely
* read it should revert to "following" the counter. Use
* the free running counter for this case (i.e. Linux
* TSC calibration). Assuming the access mode is 16-bit,
* toggle the MSB/LSB bit on each read.
*/
if (c->olbyte == 0) {
uint16_t tmp;
tmp = pit_update_counter(vatpit, c, false);
if (c->frbyte)
tmp >>= 8;
tmp &= 0xff;
*eax = tmp;
c->frbyte ^= 1;
} else
*eax = c->ol[--c->olbyte];
} else {
c->cr[c->crbyte++] = *eax;
if (c->crbyte == 2) {
c->status &= ~TIMER_STS_NULLCNT;
c->frbyte = 0;
c->crbyte = 0;
c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
c->now_sbt = sbinuptime();
/* Start an interval timer for channel 0 */
if (port == TIMER_CNTR0) {
c->callout_sbt = c->now_sbt;
pit_timer_start_cntr0(vatpit);
}
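/*
* On a real 8254 a programmed count of 0 means the maximum count
* (65536); 0xffff is used here as a close stand-in.
*/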
if (c->initial == 0)
c->initial = 0xffff;
}
}
VATPIT_UNLOCK(vatpit);
return (0);
}
int
vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax)
{
struct vatpit *vatpit;
vatpit = vm_atpit(vm);
if (in) {
VATPIT_LOCK(vatpit);
if (vatpit_get_out(vatpit, 2))
*eax = TMR2_OUT_STS;
else
*eax = 0;
VATPIT_UNLOCK(vatpit);
}
return (0);
}
struct vatpit *
vatpit_init(struct vm *vm)
{
struct vatpit *vatpit;
struct bintime bt;
struct vatpit_callout_arg *arg;
int i;
vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO);
vatpit->vm = vm;
mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN);
FREQ2BT(PIT_8254_FREQ, &bt);
vatpit->freq_sbt = bttosbt(bt);
for (i = 0; i < 3; i++) {
callout_init(&vatpit->channel[i].callout, true);
arg = &vatpit->channel[i].callout_arg;
arg->vatpit = vatpit;
arg->channel_num = i;
}
return (vatpit);
}
void
vatpit_cleanup(struct vatpit *vatpit)
{
int i;
for (i = 0; i < 3; i++)
callout_drain(&vatpit->channel[i].callout);
free(vatpit, M_VATPIT);
}

45
vmm/io/vatpit.h Normal file
View file

@ -0,0 +1,45 @@
/*-
* Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VATPIT_H_
#define _VATPIT_H_
#include <machine/timerreg.h>
#define NMISC_PORT 0x61
struct vatpit *vatpit_init(struct vm *vm);
void vatpit_cleanup(struct vatpit *vatpit);
int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
uint32_t *eax);
int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port,
int bytes, uint32_t *eax);
#endif /* _VATPIT_H_ */

759
vmm/io/vhpet.c Normal file

@ -0,0 +1,759 @@
/*-
* Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <dev/acpica/acpi_hpet.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmm_lapic.h"
#include "vatpic.h"
#include "vioapic.h"
#include "vhpet.h"
#include "vmm_ktr.h"
static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet");
#define HPET_FREQ 10000000 /* 10.0 MHz */
#define FS_PER_S 1000000000000000ul
/* Timer N Configuration and Capabilities Register */
#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \
HPET_TCAP_FSB_INT_DEL | \
HPET_TCAP_SIZE | \
HPET_TCAP_PER_INT)
/*
* The HPET specification requires at least 3 and allows up to 32 timers
* per block.
*/
#define VHPET_NUM_TIMERS 8
CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32);
struct vhpet_callout_arg {
struct vhpet *vhpet;
int timer_num;
};
struct vhpet {
struct vm *vm;
struct mtx mtx;
sbintime_t freq_sbt;
uint64_t config; /* Configuration */
uint64_t isr; /* Interrupt Status */
uint32_t countbase; /* HPET counter base value */
sbintime_t countbase_sbt; /* uptime corresponding to base value */
struct {
uint64_t cap_config; /* Configuration */
uint64_t msireg; /* FSB interrupt routing */
uint32_t compval; /* Comparator */
uint32_t comprate;
struct callout callout;
sbintime_t callout_sbt; /* time when counter==compval */
struct vhpet_callout_arg arg;
} timer[VHPET_NUM_TIMERS];
};
#define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx))
#define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx))
static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter,
sbintime_t now);
static uint64_t
vhpet_capabilities(void)
{
uint64_t cap = 0;
cap |= 0x8086 << 16; /* vendor id */
cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */
cap |= 1; /* revision */
cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */
cap &= 0xffffffff;
cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */
return (cap);
}
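/*
 * Worked example of the value built above: with VHPET_NUM_TIMERS == 8
 * and HPET_FREQ == 10 MHz,
 *   low 32 bits:  0x8086 << 16 | (8 - 1) << 8 | 1 = 0x80860701
 *   high 32 bits: 10^15 fs/s / 10^7 Hz = 10^8 fs  = 0x05f5e100
 * so guests see 0x05f5e10080860701: a rev-1, 32-bit counter with
 * eight timers ticking every 100ns.
 */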
static __inline bool
vhpet_counter_enabled(struct vhpet *vhpet)
{
return ((vhpet->config & HPET_CNF_ENABLE) ? true : false);
}
static __inline bool
vhpet_timer_msi_enabled(struct vhpet *vhpet, int n)
{
const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN;
if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable)
return (true);
else
return (false);
}
static __inline int
vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n)
{
/*
* If the timer is configured to use MSI then treat it as if the
* timer is not connected to the ioapic.
*/
if (vhpet_timer_msi_enabled(vhpet, n))
return (0);
return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9);
}
static uint32_t
vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
{
uint32_t val;
sbintime_t now, delta;
val = vhpet->countbase;
if (vhpet_counter_enabled(vhpet)) {
now = sbinuptime();
delta = now - vhpet->countbase_sbt;
KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: "
"%#lx to %#lx", vhpet->countbase_sbt, now));
val += delta / vhpet->freq_sbt;
if (nowptr != NULL)
*nowptr = now;
} else {
/*
* The sbinuptime corresponding to the 'countbase' is
* meaningless when the counter is disabled. Make sure
* that the caller doesn't want to use it.
*/
KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL"));
}
return (val);
}
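/*
 * 'freq_sbt' is the sbintime_t length of one counter tick (derived
 * via FREQ2BT in vhpet_init), so the division above converts elapsed
 * uptime into ticks: at 10 MHz a tick is 100ns, and e.g. 1ms of
 * 'delta' advances the counter by 10000.
 */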
static void
vhpet_timer_clear_isr(struct vhpet *vhpet, int n)
{
int pin;
if (vhpet->isr & (1 << n)) {
pin = vhpet_timer_ioapic_pin(vhpet, n);
KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n));
vioapic_deassert_irq(vhpet->vm, pin);
vhpet->isr &= ~(1 << n);
}
}
static __inline bool
vhpet_periodic_timer(struct vhpet *vhpet, int n)
{
return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0);
}
static __inline bool
vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n)
{
return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0);
}
static __inline bool
vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
{
KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: "
"timer %d is using MSI", n));
if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0)
return (true);
else
return (false);
}
static void
vhpet_timer_interrupt(struct vhpet *vhpet, int n)
{
int pin;
/* If interrupts are not enabled for this timer then just return. */
if (!vhpet_timer_interrupt_enabled(vhpet, n))
return;
/*
* If a level triggered interrupt is already asserted then just return.
*/
if ((vhpet->isr & (1 << n)) != 0) {
VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n);
return;
}
if (vhpet_timer_msi_enabled(vhpet, n)) {
lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32,
vhpet->timer[n].msireg & 0xffffffff);
return;
}
pin = vhpet_timer_ioapic_pin(vhpet, n);
if (pin == 0) {
VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n);
return;
}
if (vhpet_timer_edge_trig(vhpet, n)) {
vioapic_pulse_irq(vhpet->vm, pin);
} else {
vhpet->isr |= 1 << n;
vioapic_assert_irq(vhpet->vm, pin);
}
}
static void
vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter)
{
uint32_t compval, comprate, compnext;
KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n));
compval = vhpet->timer[n].compval;
comprate = vhpet->timer[n].comprate;
/*
* Calculate the comparator value to be used for the next periodic
* interrupt.
*
* This function is commonly called from the callout handler.
* In this scenario the 'counter' is ahead of 'compval'. To find
* the next value to program into the accumulator we divide the
* number space between 'compval' and 'counter' into 'comprate'
* sized units. The 'compval' is rounded up such that it is "ahead"
* of 'counter'.
*/
compnext = compval + ((counter - compval) / comprate + 1) * comprate;
vhpet->timer[n].compval = compnext;
}
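/*
 * Worked example: compval = 1000, comprate = 100, and the callout
 * runs late with counter = 1234. Then
 *   compnext = 1000 + ((1234 - 1000) / 100 + 1) * 100 = 1300
 * i.e. the first point on the original 100-tick grid that is
 * strictly ahead of 'counter'.
 */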
static void
vhpet_handler(void *a)
{
int n;
uint32_t counter;
sbintime_t now;
struct vhpet *vhpet;
struct callout *callout;
struct vhpet_callout_arg *arg;
arg = a;
vhpet = arg->vhpet;
n = arg->timer_num;
callout = &vhpet->timer[n].callout;
VM_CTR1(vhpet->vm, "hpet t%d fired", n);
VHPET_LOCK(vhpet);
if (callout_pending(callout)) /* callout was reset */
goto done;
if (!callout_active(callout)) /* callout was stopped */
goto done;
callout_deactivate(callout);
if (!vhpet_counter_enabled(vhpet))
panic("vhpet(%p) callout with counter disabled", vhpet);
counter = vhpet_counter(vhpet, &now);
vhpet_start_timer(vhpet, n, counter, now);
vhpet_timer_interrupt(vhpet, n);
done:
VHPET_UNLOCK(vhpet);
return;
}
static void
vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now)
{
VM_CTR1(vhpet->vm, "hpet t%d stopped", n);
callout_stop(&vhpet->timer[n].callout);
/*
* If the callout was scheduled to expire in the past but hasn't
* had a chance to execute yet then trigger the timer interrupt
* here. Failing to do so will result in a missed timer interrupt
* in the guest. This is especially bad in one-shot mode because
* the next interrupt has to wait for the counter to wrap around.
*/
if (vhpet->timer[n].callout_sbt < now) {
VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after "
"stopping timer", n);
vhpet_timer_interrupt(vhpet, n);
}
}
static void
vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now)
{
sbintime_t delta, precision;
if (vhpet->timer[n].comprate != 0)
vhpet_adjust_compval(vhpet, n, counter);
else {
/*
* In one-shot mode it is the guest's responsibility to make
* sure that the comparator value is not in the "past". The
* hardware doesn't have any belt-and-suspenders to deal with
* this so we don't either.
*/
}
delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt;
precision = delta >> tc_precexp;
vhpet->timer[n].callout_sbt = now + delta;
callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt,
precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE);
}
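/*
 * The uint32_t subtraction above makes counter wraparound harmless:
 * e.g. counter = 0xffffff00 and compval = 0x100 gives a delta of
 * 0x200 ticks (51.2us at 10 MHz) rather than a negative span. The
 * 'precision' value hands callout(9) the usual tc_precexp-based
 * tolerance so wakeups can be coalesced.
 */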
static void
vhpet_start_counting(struct vhpet *vhpet)
{
int i;
vhpet->countbase_sbt = sbinuptime();
for (i = 0; i < VHPET_NUM_TIMERS; i++) {
/*
* Restart the timers based on the value of the main counter
* when it stopped counting.
*/
vhpet_start_timer(vhpet, i, vhpet->countbase,
vhpet->countbase_sbt);
}
}
static void
vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now)
{
int i;
vhpet->countbase = counter;
for (i = 0; i < VHPET_NUM_TIMERS; i++)
vhpet_stop_timer(vhpet, i, now);
}
static __inline void
update_register(uint64_t *regptr, uint64_t data, uint64_t mask)
{
*regptr &= ~mask;
*regptr |= (data & mask);
}
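/*
 * Plain read-modify-write; e.g. *regptr = 0xff0f, data = 0x1234,
 * mask = 0x00f0 merges only the masked bits:
 *   (0xff0f & ~0x00f0) | (0x1234 & 0x00f0) = 0xff3f
 */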
static void
vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data,
uint64_t mask)
{
bool clear_isr;
int old_pin, new_pin;
uint32_t allowed_irqs;
uint64_t oldval, newval;
if (vhpet_timer_msi_enabled(vhpet, n) ||
vhpet_timer_edge_trig(vhpet, n)) {
if (vhpet->isr & (1 << n))
panic("vhpet timer %d isr should not be asserted", n);
}
old_pin = vhpet_timer_ioapic_pin(vhpet, n);
oldval = vhpet->timer[n].cap_config;
newval = oldval;
update_register(&newval, data, mask);
newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE);
newval |= oldval & HPET_TCAP_RO_MASK;
if (newval == oldval)
return;
vhpet->timer[n].cap_config = newval;
VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016lx", n, newval);
/*
* Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field.
* If it does not match the bits set in HPET_TCAP_INT_ROUTE then set
* it to the default value of 0.
*/
allowed_irqs = vhpet->timer[n].cap_config >> 32;
new_pin = vhpet_timer_ioapic_pin(vhpet, n);
if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) {
VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, "
"allowed_irqs 0x%08x", n, new_pin, allowed_irqs);
new_pin = 0;
vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE;
}
if (!vhpet_periodic_timer(vhpet, n))
vhpet->timer[n].comprate = 0;
/*
* If the timer's ISR bit is set then clear it in the following cases:
* - interrupt is disabled
* - interrupt type is changed from level to edge or fsb.
* - interrupt routing is changed
*
* This is to ensure that this timer's level triggered interrupt does
* not remain asserted forever.
*/
if (vhpet->isr & (1 << n)) {
KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d",
n, old_pin));
if (!vhpet_timer_interrupt_enabled(vhpet, n))
clear_isr = true;
else if (vhpet_timer_msi_enabled(vhpet, n))
clear_isr = true;
else if (vhpet_timer_edge_trig(vhpet, n))
clear_isr = true;
else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin)
clear_isr = true;
else
clear_isr = false;
if (clear_isr) {
VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to "
"configuration change", n);
vioapic_deassert_irq(vhpet->vm, old_pin);
vhpet->isr &= ~(1 << n);
}
}
}
int
vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size,
void *arg)
{
struct vhpet *vhpet;
uint64_t data, mask, oldval, val64;
uint32_t isr_clear_mask, old_compval, old_comprate, counter;
sbintime_t now, *nowptr;
int i, offset;
vhpet = vm_hpet(vm);
offset = gpa - VHPET_BASE;
VHPET_LOCK(vhpet);
/* Accesses to the HPET should be 4 or 8 bytes wide */
switch (size) {
case 8:
mask = 0xffffffffffffffff;
data = val;
break;
case 4:
mask = 0xffffffff;
data = val;
if ((offset & 0x4) != 0) {
mask <<= 32;
data <<= 32;
}
break;
default:
VM_CTR2(vhpet->vm, "hpet invalid mmio write: "
"offset 0x%08x, size %d", offset, size);
goto done;
}
/* Access to the HPET should be naturally aligned to its width */
if (offset & (size - 1)) {
VM_CTR2(vhpet->vm, "hpet invalid mmio write: "
"offset 0x%08x, size %d", offset, size);
goto done;
}
if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) {
/*
* Get the most recent value of the counter before updating
* the 'config' register. If the HPET is going to be disabled
* then we need to update 'countbase' with the value right
* before it is disabled.
*/
nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL;
counter = vhpet_counter(vhpet, nowptr);
oldval = vhpet->config;
update_register(&vhpet->config, data, mask);
/*
* LegacyReplacement Routing is not supported so clear the
* bit explicitly.
*/
vhpet->config &= ~HPET_CNF_LEG_RT;
if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) {
if (vhpet_counter_enabled(vhpet)) {
vhpet_start_counting(vhpet);
VM_CTR0(vhpet->vm, "hpet enabled");
} else {
vhpet_stop_counting(vhpet, counter, now);
VM_CTR0(vhpet->vm, "hpet disabled");
}
}
goto done;
}
if (offset == HPET_ISR || offset == HPET_ISR + 4) {
isr_clear_mask = vhpet->isr & data;
for (i = 0; i < VHPET_NUM_TIMERS; i++) {
if ((isr_clear_mask & (1 << i)) != 0) {
VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i);
vhpet_timer_clear_isr(vhpet, i);
}
}
goto done;
}
if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) {
/* Zero-extend the counter to 64-bits before updating it */
val64 = vhpet_counter(vhpet, NULL);
update_register(&val64, data, mask);
vhpet->countbase = val64;
if (vhpet_counter_enabled(vhpet))
vhpet_start_counting(vhpet);
goto done;
}
for (i = 0; i < VHPET_NUM_TIMERS; i++) {
if (offset == HPET_TIMER_CAP_CNF(i) ||
offset == HPET_TIMER_CAP_CNF(i) + 4) {
vhpet_timer_update_config(vhpet, i, data, mask);
break;
}
if (offset == HPET_TIMER_COMPARATOR(i) ||
offset == HPET_TIMER_COMPARATOR(i) + 4) {
old_compval = vhpet->timer[i].compval;
old_comprate = vhpet->timer[i].comprate;
if (vhpet_periodic_timer(vhpet, i)) {
/*
* In periodic mode writes to the comparator
* change the 'compval' register only if the
* HPET_TCNF_VAL_SET bit is set in the config
* register.
*/
val64 = vhpet->timer[i].comprate;
update_register(&val64, data, mask);
vhpet->timer[i].comprate = val64;
if ((vhpet->timer[i].cap_config &
HPET_TCNF_VAL_SET) != 0) {
vhpet->timer[i].compval = val64;
}
} else {
KASSERT(vhpet->timer[i].comprate == 0,
("vhpet one-shot timer %d has invalid "
"rate %u", i, vhpet->timer[i].comprate));
val64 = vhpet->timer[i].compval;
update_register(&val64, data, mask);
vhpet->timer[i].compval = val64;
}
vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET;
if (vhpet->timer[i].compval != old_compval ||
vhpet->timer[i].comprate != old_comprate) {
if (vhpet_counter_enabled(vhpet)) {
counter = vhpet_counter(vhpet, &now);
vhpet_start_timer(vhpet, i, counter,
now);
}
}
break;
}
if (offset == HPET_TIMER_FSB_VAL(i) ||
offset == HPET_TIMER_FSB_ADDR(i)) {
update_register(&vhpet->timer[i].msireg, data, mask);
break;
}
}
done:
VHPET_UNLOCK(vhpet);
return (0);
}
int
vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size,
void *arg)
{
int i, offset;
struct vhpet *vhpet;
uint64_t data;
vhpet = vm_hpet(vm);
offset = gpa - VHPET_BASE;
VHPET_LOCK(vhpet);
/* Accesses to the HPET should be 4 or 8 bytes wide */
if (size != 4 && size != 8) {
VM_CTR2(vhpet->vm, "hpet invalid mmio read: "
"offset 0x%08x, size %d", offset, size);
data = 0;
goto done;
}
/* Access to the HPET should be naturally aligned to its width */
if (offset & (size - 1)) {
VM_CTR2(vhpet->vm, "hpet invalid mmio read: "
"offset 0x%08x, size %d", offset, size);
data = 0;
goto done;
}
if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) {
data = vhpet_capabilities();
goto done;
}
if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) {
data = vhpet->config;
goto done;
}
if (offset == HPET_ISR || offset == HPET_ISR + 4) {
data = vhpet->isr;
goto done;
}
if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) {
data = vhpet_counter(vhpet, NULL);
goto done;
}
for (i = 0; i < VHPET_NUM_TIMERS; i++) {
if (offset == HPET_TIMER_CAP_CNF(i) ||
offset == HPET_TIMER_CAP_CNF(i) + 4) {
data = vhpet->timer[i].cap_config;
break;
}
if (offset == HPET_TIMER_COMPARATOR(i) ||
offset == HPET_TIMER_COMPARATOR(i) + 4) {
data = vhpet->timer[i].compval;
break;
}
if (offset == HPET_TIMER_FSB_VAL(i) ||
offset == HPET_TIMER_FSB_ADDR(i)) {
data = vhpet->timer[i].msireg;
break;
}
}
if (i >= VHPET_NUM_TIMERS)
data = 0;
done:
VHPET_UNLOCK(vhpet);
if (size == 4) {
if (offset & 0x4)
data >>= 32;
}
*rval = data;
return (0);
}
struct vhpet *
vhpet_init(struct vm *vm)
{
int i, pincount;
struct vhpet *vhpet;
uint64_t allowed_irqs;
struct vhpet_callout_arg *arg;
struct bintime bt;
vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO);
vhpet->vm = vm;
mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF);
FREQ2BT(HPET_FREQ, &bt);
vhpet->freq_sbt = bttosbt(bt);
pincount = vioapic_pincount(vm);
if (pincount >= 24)
allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */
else
allowed_irqs = 0;
/*
* Initialize HPET timer hardware state.
*/
for (i = 0; i < VHPET_NUM_TIMERS; i++) {
vhpet->timer[i].cap_config = allowed_irqs << 32;
vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT;
vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL;
vhpet->timer[i].compval = 0xffffffff;
callout_init(&vhpet->timer[i].callout, 1);
arg = &vhpet->timer[i].arg;
arg->vhpet = vhpet;
arg->timer_num = i;
}
return (vhpet);
}
void
vhpet_cleanup(struct vhpet *vhpet)
{
int i;
for (i = 0; i < VHPET_NUM_TIMERS; i++)
callout_drain(&vhpet->timer[i].callout);
free(vhpet, M_VHPET);
}
int
vhpet_getcap(struct vm_hpet_cap *cap)
{
cap->capabilities = vhpet_capabilities();
return (0);
}

44
vmm/io/vhpet.h Normal file

@ -0,0 +1,44 @@
/*-
* Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VHPET_H_
#define _VHPET_H_
#define VHPET_BASE 0xfed00000
#define VHPET_SIZE 1024
struct vhpet *vhpet_init(struct vm *vm);
void vhpet_cleanup(struct vhpet *vhpet);
int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val,
int size, void *arg);
int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val,
int size, void *arg);
int vhpet_getcap(struct vm_hpet_cap *cap);
#endif /* _VHPET_H_ */

499
vmm/io/vioapic.c Normal file

@ -0,0 +1,499 @@
/*-
* Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include "vmm_ktr.h"
#include "vmm_lapic.h"
#include "vlapic.h"
#include "vioapic.h"
#define IOREGSEL 0x00
#define IOWIN 0x10
#define REDIR_ENTRIES 24
#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS))
struct vioapic {
struct vm *vm;
struct mtx mtx;
uint32_t id;
uint32_t ioregsel;
struct {
uint64_t reg;
int acnt; /* sum of pin asserts (+1) and deasserts (-1) */
} rtbl[REDIR_ENTRIES];
};
#define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx))
#define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx))
#define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx))
static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic");
#define VIOAPIC_CTR1(vioapic, fmt, a1) \
VM_CTR1((vioapic)->vm, fmt, a1)
#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \
VM_CTR2((vioapic)->vm, fmt, a1, a2)
#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \
VM_CTR3((vioapic)->vm, fmt, a1, a2, a3)
#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \
VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4)
#ifdef KTR
static const char *
pinstate_str(bool asserted)
{
if (asserted)
return ("asserted");
else
return ("deasserted");
}
#endif
static void
vioapic_send_intr(struct vioapic *vioapic, int pin)
{
int vector, delmode;
uint32_t low, high, dest;
bool level, phys;
KASSERT(pin >= 0 && pin < REDIR_ENTRIES,
("vioapic_set_pinstate: invalid pin number %d", pin));
KASSERT(VIOAPIC_LOCKED(vioapic),
("vioapic_set_pinstate: vioapic is not locked"));
low = vioapic->rtbl[pin].reg;
high = vioapic->rtbl[pin].reg >> 32;
if ((low & IOART_INTMASK) == IOART_INTMSET) {
VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin);
return;
}
phys = ((low & IOART_DESTMOD) == IOART_DESTPHY);
delmode = low & IOART_DELMOD;
level = low & IOART_TRGRLVL ? true : false;
if (level)
vioapic->rtbl[pin].reg |= IOART_REM_IRR;
vector = low & IOART_INTVEC;
dest = high >> APIC_ID_SHIFT;
vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector);
}
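/*
 * Redirection entry anatomy, per the standard IOAPIC layout assumed
 * by the field macros above: an entry of 0x0100000000008030 decodes
 * as vector 0x30 (bits 0-7), fixed delivery (bits 8-10 clear),
 * physical destination mode (bit 11 clear), level trigger (bit 15
 * set, hence IOART_REM_IRR is latched above), unmasked (bit 16
 * clear), destination APIC id 1 (high dword >> APIC_ID_SHIFT).
 */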
static void
vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate)
{
int oldcnt, newcnt;
bool needintr;
KASSERT(pin >= 0 && pin < REDIR_ENTRIES,
("vioapic_set_pinstate: invalid pin number %d", pin));
KASSERT(VIOAPIC_LOCKED(vioapic),
("vioapic_set_pinstate: vioapic is not locked"));
oldcnt = vioapic->rtbl[pin].acnt;
if (newstate)
vioapic->rtbl[pin].acnt++;
else
vioapic->rtbl[pin].acnt--;
newcnt = vioapic->rtbl[pin].acnt;
if (newcnt < 0) {
VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d",
pin, newcnt);
}
needintr = false;
if (oldcnt == 0 && newcnt == 1) {
needintr = true;
VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin);
} else if (oldcnt == 1 && newcnt == 0) {
VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin);
} else {
VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d",
pin, pinstate_str(newstate), newcnt);
}
if (needintr)
vioapic_send_intr(vioapic, pin);
}
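/*
 * 'acnt' models a wired-OR of the sources sharing a pin: if two
 * devices assert the same pin the count goes 0 -> 1 -> 2 and only
 * the first assert sends an interrupt; one source deasserting drops
 * the count to 1 and the pin logically stays high. Only the 1 -> 0
 * transition really deasserts it.
 */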
enum irqstate {
IRQSTATE_ASSERT,
IRQSTATE_DEASSERT,
IRQSTATE_PULSE
};
static int
vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate)
{
struct vioapic *vioapic;
if (irq < 0 || irq >= REDIR_ENTRIES)
return (EINVAL);
vioapic = vm_ioapic(vm);
VIOAPIC_LOCK(vioapic);
switch (irqstate) {
case IRQSTATE_ASSERT:
vioapic_set_pinstate(vioapic, irq, true);
break;
case IRQSTATE_DEASSERT:
vioapic_set_pinstate(vioapic, irq, false);
break;
case IRQSTATE_PULSE:
vioapic_set_pinstate(vioapic, irq, true);
vioapic_set_pinstate(vioapic, irq, false);
break;
default:
panic("vioapic_set_irqstate: invalid irqstate %d", irqstate);
}
VIOAPIC_UNLOCK(vioapic);
return (0);
}
int
vioapic_assert_irq(struct vm *vm, int irq)
{
return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT));
}
int
vioapic_deassert_irq(struct vm *vm, int irq)
{
return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT));
}
int
vioapic_pulse_irq(struct vm *vm, int irq)
{
return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE));
}
/*
* Reset the vlapic's trigger-mode register to reflect the ioapic pin
* configuration.
*/
static void
vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg)
{
struct vioapic *vioapic;
struct vlapic *vlapic;
uint32_t low, high, dest;
int delmode, pin, vector;
bool level, phys;
vlapic = vm_lapic(vm, vcpuid);
vioapic = vm_ioapic(vm);
VIOAPIC_LOCK(vioapic);
/*
* Reset all vectors to be edge-triggered.
*/
vlapic_reset_tmr(vlapic);
for (pin = 0; pin < REDIR_ENTRIES; pin++) {
low = vioapic->rtbl[pin].reg;
high = vioapic->rtbl[pin].reg >> 32;
level = low & IOART_TRGRLVL ? true : false;
if (!level)
continue;
/*
* For a level-triggered 'pin' let the vlapic figure out if
* an assertion on this 'pin' would result in an interrupt
* being delivered to it. If yes, then it will modify the
* TMR bit associated with this vector to level-triggered.
*/
phys = ((low & IOART_DESTMOD) == IOART_DESTPHY);
delmode = low & IOART_DELMOD;
vector = low & IOART_INTVEC;
dest = high >> APIC_ID_SHIFT;
vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector);
}
VIOAPIC_UNLOCK(vioapic);
}
static uint32_t
vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr)
{
int regnum, pin, rshift;
regnum = addr & 0xff;
switch (regnum) {
case IOAPIC_ID:
return (vioapic->id);
break;
case IOAPIC_VER:
return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11);
break;
case IOAPIC_ARB:
return (vioapic->id);
break;
default:
break;
}
/* redirection table entries */
if (regnum >= IOAPIC_REDTBL &&
regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
pin = (regnum - IOAPIC_REDTBL) / 2;
if ((regnum - IOAPIC_REDTBL) % 2)
rshift = 32;
else
rshift = 0;
return (vioapic->rtbl[pin].reg >> rshift);
}
return (0);
}
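/*
 * Each pin owns two consecutive 32-bit registers starting at
 * IOAPIC_REDTBL (0x10): to read pin 2, for example, a guest selects
 * register 0x14 for the low dword of rtbl[2].reg and 0x15 for the
 * high dword.
 */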
static void
vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data)
{
uint64_t data64, mask64;
uint64_t last, changed;
int regnum, pin, lshift;
cpuset_t allvcpus;
regnum = addr & 0xff;
switch (regnum) {
case IOAPIC_ID:
vioapic->id = data & APIC_ID_MASK;
break;
case IOAPIC_VER:
case IOAPIC_ARB:
/* readonly */
break;
default:
break;
}
/* redirection table entries */
if (regnum >= IOAPIC_REDTBL &&
regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
pin = (regnum - IOAPIC_REDTBL) / 2;
if ((regnum - IOAPIC_REDTBL) % 2)
lshift = 32;
else
lshift = 0;
last = vioapic->rtbl[pin].reg;
data64 = (uint64_t)data << lshift;
mask64 = (uint64_t)0xffffffff << lshift;
vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS;
vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS;
VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx",
pin, vioapic->rtbl[pin].reg);
/*
* If any fields in the redirection table entry (except mask
* or polarity) have changed then rendezvous all the vcpus
* to update their vlapic trigger-mode registers.
*/
changed = last ^ vioapic->rtbl[pin].reg;
if (changed & ~(IOART_INTMASK | IOART_INTPOL)) {
VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate "
"vlapic trigger-mode register", pin);
VIOAPIC_UNLOCK(vioapic);
allvcpus = vm_active_cpus(vioapic->vm);
vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus,
vioapic_update_tmr, NULL);
VIOAPIC_LOCK(vioapic);
}
/*
* Generate an interrupt if the following conditions are met:
* - pin is not masked
* - previous interrupt has been EOIed
* - pin level is asserted
*/
if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR &&
(vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 &&
(vioapic->rtbl[pin].acnt > 0)) {
VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl "
"write, acnt %d", pin, vioapic->rtbl[pin].acnt);
vioapic_send_intr(vioapic, pin);
}
}
}
static int
vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa,
uint64_t *data, int size, bool doread)
{
uint64_t offset;
offset = gpa - VIOAPIC_BASE;
/*
* The IOAPIC specification allows 32-bit wide accesses to the
* IOREGSEL (offset 0) and IOWIN (offset 16) registers.
*/
if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
if (doread)
*data = 0;
return (0);
}
VIOAPIC_LOCK(vioapic);
if (offset == IOREGSEL) {
if (doread)
*data = vioapic->ioregsel;
else
vioapic->ioregsel = *data;
} else {
if (doread) {
*data = vioapic_read(vioapic, vcpuid,
vioapic->ioregsel);
} else {
vioapic_write(vioapic, vcpuid, vioapic->ioregsel,
*data);
}
}
VIOAPIC_UNLOCK(vioapic);
return (0);
}
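/*
 * Guest-side view of the two-step protocol handled above
 * (illustrative only):
 *
 *   volatile uint32_t *ioapic = (void *)VIOAPIC_BASE;
 *   ioapic[IOREGSEL / 4] = 0x14;      write the index register
 *   uint32_t lo = ioapic[IOWIN / 4];  read dispatches to vioapic_read()
 */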
int
vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval,
int size, void *arg)
{
int error;
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true);
return (error);
}
int
vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval,
int size, void *arg)
{
int error;
struct vioapic *vioapic;
vioapic = vm_ioapic(vm);
error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false);
return (error);
}
void
vioapic_process_eoi(struct vm *vm, int vcpuid, int vector)
{
struct vioapic *vioapic;
int pin;
KASSERT(vector >= 0 && vector < 256,
("vioapic_process_eoi: invalid vector %d", vector));
vioapic = vm_ioapic(vm);
VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector);
/*
* XXX keep track of the pins associated with this vector instead
* of iterating on every single pin each time.
*/
VIOAPIC_LOCK(vioapic);
for (pin = 0; pin < REDIR_ENTRIES; pin++) {
if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0)
continue;
if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector)
continue;
vioapic->rtbl[pin].reg &= ~IOART_REM_IRR;
if (vioapic->rtbl[pin].acnt > 0) {
VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, "
"acnt %d", pin, vioapic->rtbl[pin].acnt);
vioapic_send_intr(vioapic, pin);
}
}
VIOAPIC_UNLOCK(vioapic);
}
struct vioapic *
vioapic_init(struct vm *vm)
{
int i;
struct vioapic *vioapic;
vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO);
vioapic->vm = vm;
mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN);
/* Initialize all redirection entries to mask all interrupts */
for (i = 0; i < REDIR_ENTRIES; i++)
vioapic->rtbl[i].reg = 0x0001000000010000UL;
return (vioapic);
}
void
vioapic_cleanup(struct vioapic *vioapic)
{
free(vioapic, M_VIOAPIC);
}
int
vioapic_pincount(struct vm *vm)
{
return (REDIR_ENTRIES);
}

50
vmm/io/vioapic.h Normal file

@ -0,0 +1,50 @@
/*-
* Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
* Copyright (c) 2013 Neel Natu <neel@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VIOAPIC_H_
#define _VIOAPIC_H_
#define VIOAPIC_BASE 0xFEC00000
#define VIOAPIC_SIZE 4096
struct vioapic *vioapic_init(struct vm *vm);
void vioapic_cleanup(struct vioapic *vioapic);
int vioapic_assert_irq(struct vm *vm, int irq);
int vioapic_deassert_irq(struct vm *vm, int irq);
int vioapic_pulse_irq(struct vm *vm, int irq);
int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa,
uint64_t wval, int size, void *arg);
int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa,
uint64_t *rval, int size, void *arg);
int vioapic_pincount(struct vm *vm);
void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector);
#endif

1654
vmm/io/vlapic.c Normal file

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff