/** Virtual Function IO wrapper around kernel API * * @author Steffen Vogel * @author Daniel Krebs * @copyright 2014-2020, Steffen Vogel * @copyright 2018, Daniel Krebs * @license GNU General Public License (version 3) * * VILLAScommon * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . *********************************************************************************/ #define _DEFAULT_SOURCE #if defined(__arm__) || defined(__aarch64__) #define _LARGEFILE64_SOURCE 1 #define _FILE_OFFSET_BITS 64 #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace villas::kernel::vfio; static const char *vfio_pci_region_names[] = { "PCI_BAR0", /* VFIO_PCI_BAR0_REGION_INDEX */ "PCI_BAR1", /* VFIO_PCI_BAR1_REGION_INDEX */ "PCI_BAR2", /* VFIO_PCI_BAR2_REGION_INDEX */ "PCI_BAR3", /* VFIO_PCI_BAR3_REGION_INDEX */ "PCI_BAR4", /* VFIO_PCI_BAR4_REGION_INDEX */ "PCI_BAR5", /* VFIO_PCI_BAR5_REGION_INDEX */ "PCI_ROM", /* VFIO_PCI_ROM_REGION_INDEX */ "PCI_CONFIG", /* VFIO_PCI_CONFIG_REGION_INDEX */ "PCI_VGA" /* VFIO_PCI_INTX_IRQ_INDEX */ }; static const char *vfio_pci_irq_names[] = { "PCI_INTX", /* VFIO_PCI_INTX_IRQ_INDEX */ "PCI_MSI", /* VFIO_PCI_MSI_IRQ_INDEX */ "PCI_MSIX", /* VFIO_PCI_MSIX_IRQ_INDEX */ "PCI_ERR", /* VFIO_PCI_ERR_IRQ_INDEX */ "PCI_REQ" /* VFIO_PCI_REQ_IRQ_INDEX */ }; namespace villas { Container::Container() : iova_next(0) { Logger logger = logging.get("kernel:vfio"); static constexpr const char* requiredKernelModules[] = { "vfio", "vfio_pci", "vfio_iommu_type1" }; for (const char* module : requiredKernelModules) { if (kernel::module_load(module) != 0) throw RuntimeError("Kernel module '{}' required but could not be loaded. " "Please load manually!", module); } /* Open VFIO API */ fd = open(VFIO_DEV, O_RDWR); if (fd < 0) throw RuntimeError("Failed to open VFIO container"); /* Check VFIO API version */ version = ioctl(fd, VFIO_GET_API_VERSION); if (version < 0 || version != VFIO_API_VERSION) throw RuntimeError("Failed to get VFIO version"); /* Check available VFIO extensions (IOMMU types) */ extensions = 0; for (unsigned int i = VFIO_TYPE1_IOMMU; i <= VFIO_NOIOMMU_IOMMU; i++) { int ret = ioctl(fd, VFIO_CHECK_EXTENSION, i); if (ret < 0) throw RuntimeError("Failed to get VFIO extensions"); else if (ret > 0) extensions |= (1 << i); } hasIommu = false; if (not (extensions & (1 << VFIO_NOIOMMU_IOMMU))) { if (not (extensions & (1 << VFIO_TYPE1_IOMMU))) throw RuntimeError("No supported IOMMU extension found"); else hasIommu = true; } logger->debug("Version: {:#x}", version); logger->debug("Extensions: {:#x}", extensions); logger->debug("IOMMU: {}", hasIommu ? "yes" : "no"); } Container::~Container() { Logger logger = logging.get("kernel:vfio"); logger->debug("Clean up container with fd {}", fd); /* Release memory and close fds */ groups.clear(); /* Close container */ int ret = close(fd); if (ret < 0) logger->error("Cannot close vfio container fd {}", fd); } std::shared_ptr Container::create() { std::shared_ptr container(new Container); return container; } void Container::dump() { Logger logger = logging.get("kernel:vfio"); logger->info("File descriptor: {}", fd); logger->info("Version: {}", version); logger->info("Extensions: 0x{:x}", extensions); for (auto &group : groups) { logger->info("VFIO Group {}, viable={}, container={}", group->index, (group->status.flags & VFIO_GROUP_FLAGS_VIABLE) > 0, (group->status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET) > 0 ); for (auto &device : group->devices) { logger->info("Device {}: regions={}, irqs={}, flags={}", device->name, device->info.num_regions, device->info.num_irqs, device->info.flags ); for (size_t i = 0; i < device->info.num_regions && i < 8; i++) { struct vfio_region_info *region = &device->regions[i]; if (region->size > 0) logger->info("Region {} {}: size={}, offset={}, flags={}", region->index, (device->info.flags & VFIO_DEVICE_FLAGS_PCI) ? vfio_pci_region_names[i] : "", region->size, region->offset, region->flags ); } for (size_t i = 0; i < device->info.num_irqs; i++) { struct vfio_irq_info *irq = &device->irqs[i]; if (irq->count > 0) logger->info("IRQ {} {}: count={}, flags={}", irq->index, (device->info.flags & VFIO_DEVICE_FLAGS_PCI ) ? vfio_pci_irq_names[i] : "", irq->count, irq->flags ); } } } } Device& Container::attachDevice(const char* name, int index) { Logger logger = logging.get("kernel:vfio"); Group &group = getOrAttachGroup(index); auto device = std::make_unique(name, group); /* Open device fd */ device->fd = ioctl(group.fd, VFIO_GROUP_GET_DEVICE_FD, name); if (device->fd < 0) throw RuntimeError("Failed to open VFIO device: {}", device->name); /* Get device info */ device->info.argsz = sizeof(device->info); int ret = ioctl(device->fd, VFIO_DEVICE_GET_INFO, &device->info); if (ret < 0) throw RuntimeError("Failed to get VFIO device info for: {}", device->name); logger->debug("Device has {} regions", device->info.num_regions); logger->debug("Device has {} IRQs", device->info.num_irqs); /* Reserve slots already so that we can use the []-operator for access */ device->irqs.resize(device->info.num_irqs); device->regions.resize(device->info.num_regions); device->mappings.resize(device->info.num_regions); /* Get device regions */ for (size_t i = 0; i < device->info.num_regions && i < 8; i++) { struct vfio_region_info region; memset(®ion, 0, sizeof (region)); region.argsz = sizeof(region); region.index = i; ret = ioctl(device->fd, VFIO_DEVICE_GET_REGION_INFO, ®ion); if (ret < 0) throw RuntimeError("Failed to get region of VFIO device: {}", device->name); device->regions[i] = region; } /* Get device irqs */ for (size_t i = 0; i < device->info.num_irqs; i++) { struct vfio_irq_info irq; memset(&irq, 0, sizeof (irq)); irq.argsz = sizeof(irq); irq.index = i; ret = ioctl(device->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); if (ret < 0) throw RuntimeError("Failed to get IRQs of VFIO device: {}", device->name); device->irqs[i] = irq; } group.devices.push_back(std::move(device)); return *group.devices.back().get(); } Device& Container::attachDevice(const pci::Device &pdev) { int ret; char name[32], iommu_state[4]; static constexpr const char* kernelDriver = "vfio-pci"; Logger logger = logging.get("kernel:vfio"); /* Load PCI bus driver for VFIO */ if (kernel::module_load("vfio_pci")) throw RuntimeError("Failed to load kernel driver: vfio_pci"); /* Bind PCI card to vfio-pci driver if not already bound */ if (pdev.getDriver() != kernelDriver) { logger->debug("Bind PCI card to kernel driver '{}'", kernelDriver); pdev.attachDriver(kernelDriver); } /* Get IOMMU group of device */ int index = isIommuEnabled() ? pdev.getIOMMUGroup() : 0; if (index < 0) { ret = kernel::get_cmdline_param("intel_iommu", iommu_state, sizeof(iommu_state)); if (ret != 0 || strcmp("on", iommu_state) != 0) logger->warn("Kernel booted without command line parameter " "'intel_iommu' set to 'on'. Please check documentation " "(https://villas.fein-aachen.org/doc/fpga-setup.html) " "for help with troubleshooting."); throw RuntimeError("Failed to get IOMMU group of device"); } /* VFIO device name consists of PCI BDF */ snprintf(name, sizeof(name), "%04x:%02x:%02x.%x", pdev.slot.domain, pdev.slot.bus, pdev.slot.device, pdev.slot.function); logger->info("Attach to device {} with index {}", std::string(name), index); auto &device = attachDevice(name, index); device.pci_device = &pdev; /* Check if this is really a vfio-pci device */ if (not device.isVfioPciDevice()) throw RuntimeError("Device is not a vfio-pci device"); return device; } uintptr_t Container::memoryMap(uintptr_t virt, uintptr_t phys, size_t length) { int ret; Logger logger = logging.get("kernel:vfio"); if (not hasIommu) { logger->error("DMA mapping not supported without IOMMU"); return UINTPTR_MAX; } if (length & 0xFFF) { length += 0x1000; length &= ~0xFFF; } /* Super stupid allocator */ size_t iovaIncrement = 0; if (phys == UINTPTR_MAX) { phys = this->iova_next; iovaIncrement = length; } struct vfio_iommu_type1_dma_map dmaMap; memset(&dmaMap, 0, sizeof(dmaMap)); dmaMap.argsz = sizeof(dmaMap); dmaMap.vaddr = virt; dmaMap.iova = phys; dmaMap.size = length; dmaMap.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; ret = ioctl(this->fd, VFIO_IOMMU_MAP_DMA, &dmaMap); if (ret) { logger->error("Failed to create DMA mapping: {}", ret); return UINTPTR_MAX; } logger->debug("DMA map size={:#x}, iova={:#x}, vaddr={:#x}", dmaMap.size, dmaMap.iova, dmaMap.vaddr); /* Mapping successful, advance IOVA allocator */ this->iova_next += iovaIncrement; /* We intentionally don't return the actual mapped length, the users are * only guaranteed to have their demanded memory mapped correctly */ return dmaMap.iova; } bool Container::memoryUnmap(uintptr_t phys, size_t length) { int ret; Logger logger = logging.get("kernel:vfio"); if (not hasIommu) return true; struct vfio_iommu_type1_dma_unmap dmaUnmap; dmaUnmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); dmaUnmap.flags = 0; dmaUnmap.iova = phys; dmaUnmap.size = length; ret = ioctl(this->fd, VFIO_IOMMU_UNMAP_DMA, &dmaUnmap); if (ret) { logger->error("Failed to unmap DMA mapping"); return false; } return true; } Group& Container::getOrAttachGroup(int index) { Logger logger = logging.get("kernel:vfio"); /* Search if group with index already exists */ for (auto &group : groups) { if (group->index == index) { return *group; } } /* Group not yet part of this container, so acquire ownership */ auto group = Group::attach(*this, index); if (not group) throw RuntimeError("Failed to attach to IOMMU group: {}", index); else logger->debug("Attached new group {} to VFIO container", index); /* Push to our list */ groups.push_back(std::move(group)); return *groups.back(); } Device::~Device() { Logger logger = logging.get("kernel:vfio"); logger->debug("Clean up device {} with fd {}", this->name, this->fd); for (auto ®ion : regions) { regionUnmap(region.index); } int ret = close(fd); if (ret != 0) { logger->error("Closing device fd {} failed", fd); } } bool Device::reset() { if (this->info.flags & VFIO_DEVICE_FLAGS_RESET) return ioctl(this->fd, VFIO_DEVICE_RESET) == 0; else return false; /* not supported by this device */ } void* Device::regionMap(size_t index) { struct vfio_region_info *r = ®ions[index]; if (!(r->flags & VFIO_REGION_INFO_FLAG_MMAP)) return MAP_FAILED; int flags = MAP_SHARED; #if !(defined(__arm__) || defined(__aarch64__)) flags |= MAP_SHARED | MAP_32BIT; #endif mappings[index] = mmap(nullptr, r->size, PROT_READ | PROT_WRITE, flags, fd, r->offset); return mappings[index]; } bool Device::regionUnmap(size_t index) { int ret; struct vfio_region_info *r = ®ions[index]; Logger logger = logging.get("kernel:vfio"); if (!mappings[index]) return false; /* was not mapped */ logger->debug("Unmap region {} from device {}", index, name); ret = munmap(mappings[index], r->size); if (ret) return false; mappings[index] = nullptr; return true; } size_t Device::regionGetSize(size_t index) { Logger logger = logging.get("kernel:vfio"); if (index >= regions.size()) { logger->error("Index out of range: {} >= {}", index, regions.size()); throw std::out_of_range("Index out of range"); } return regions[index].size; } bool Device::pciEnable() { int ret; uint32_t reg; const off_t offset = PCI_COMMAND + (static_cast(VFIO_PCI_CONFIG_REGION_INDEX) << 40); /* Check if this is really a vfio-pci device */ if (!(this->info.flags & VFIO_DEVICE_FLAGS_PCI)) return false; ret = pread(this->fd, ®, sizeof(reg), offset); if (ret != sizeof(reg)) return false; /* Enable memory access and PCI bus mastering which is required for DMA */ reg |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER; ret = pwrite(this->fd, ®, sizeof(reg), offset); if (ret != sizeof(reg)) return false; return true; } bool Device::pciHotReset() { Logger logger = logging.get("kernel:vfio"); /* Check if this is really a vfio-pci device */ if (not isVfioPciDevice()) return false; const size_t reset_info_len = sizeof(struct vfio_pci_hot_reset_info) + sizeof(struct vfio_pci_dependent_device) * 64; auto *reset_info_buf = new char[reset_info_len]; if (!reset_info_buf) throw MemoryAllocationError(); auto *reset_info = reinterpret_cast(reset_info_buf); reset_info->argsz = reset_info_len; if (ioctl(this->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, reset_info) != 0) { delete[] reset_info_buf; return false; } logger->debug("Dependent devices for hot-reset:"); for (size_t i = 0; i < reset_info->count; i++) { struct vfio_pci_dependent_device *dd = &reset_info->devices[i]; logger->debug(" {:04x}:{:02x}:{:02x}.{:01x}: iommu_group={}", dd->segment, dd->bus, PCI_SLOT(dd->devfn), PCI_FUNC(dd->devfn), dd->group_id); if (static_cast(dd->group_id) != this->group.index) { delete[] reset_info_buf; return false; } } delete[] reset_info_buf; const size_t reset_len = sizeof(struct vfio_pci_hot_reset) + sizeof(int32_t) * 1; auto *reset_buf = new char[reset_len]; if (!reset_buf) throw MemoryAllocationError(); auto *reset = reinterpret_cast(reset_buf); reset->argsz = reset_len; reset->count = 1; reset->group_fds[0] = this->group.fd; int ret = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset); const bool success = (ret == 0); delete[] reset_buf; if (not success and not group.container->isIommuEnabled()) { logger->info("PCI hot reset failed, but this is expected without IOMMU"); return true; } return success; } int Device::pciMsiInit(int efds[]) { /* Check if this is really a vfio-pci device */ if (not isVfioPciDevice()) return -1; const size_t irqCount = irqs[VFIO_PCI_MSI_IRQ_INDEX].count; const size_t irqSetSize = sizeof(struct vfio_irq_set) + sizeof(int) * irqCount; auto *irqSetBuf = new char[irqSetSize]; if (!irqSetBuf) throw MemoryAllocationError(); auto *irqSet = reinterpret_cast(irqSetBuf); irqSet->argsz = irqSetSize; irqSet->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; irqSet->index = VFIO_PCI_MSI_IRQ_INDEX; irqSet->start = 0; irqSet->count = irqCount; /* Now set the new eventfds */ for (size_t i = 0; i < irqCount; i++) { efds[i] = eventfd(0, 0); if (efds[i] < 0) { delete[] irqSetBuf; return -1; } } memcpy(irqSet->data, efds, sizeof(int) * irqCount); if (ioctl(fd, VFIO_DEVICE_SET_IRQS, irqSet) != 0) { delete[] irqSetBuf; return -1; } delete[] irqSetBuf; return irqCount; } int Device::pciMsiDeinit(int efds[]) { /* Check if this is really a vfio-pci device */ if (not isVfioPciDevice()) return -1; const size_t irqCount = irqs[VFIO_PCI_MSI_IRQ_INDEX].count; const size_t irqSetSize = sizeof(struct vfio_irq_set) + sizeof(int) * irqCount; auto *irqSetBuf = new char[irqSetSize]; if (!irqSetBuf) throw MemoryAllocationError(); auto *irqSet = reinterpret_cast(irqSetBuf); irqSet->argsz = irqSetSize; irqSet->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; irqSet->index = VFIO_PCI_MSI_IRQ_INDEX; irqSet->count = irqCount; irqSet->start = 0; for (size_t i = 0; i < irqCount; i++) { close(efds[i]); efds[i] = -1; } memcpy(irqSet->data, efds, sizeof(int) * irqCount); if (ioctl(fd, VFIO_DEVICE_SET_IRQS, irqSet) != 0) { delete[] irqSetBuf; return -1; } delete[] irqSetBuf; return irqCount; } bool Device::pciMsiFind(int nos[]) { int ret, idx, irq; char *end, *col, *last, line[1024], name[13]; FILE *f; f = fopen("/proc/interrupts", "r"); if (!f) return false; for (int i = 0; i < 32; i++) nos[i] = -1; /* For each line in /proc/interrupts */ while (fgets(line, sizeof(line), f)) { col = strtok(line, " "); /* IRQ number is in first column */ irq = strtol(col, &end, 10); if (col == end) continue; /* Find last column of line */ do { last = col; } while ((col = strtok(nullptr, " "))); ret = sscanf(last, "vfio-msi[%d](%12[0-9:])", &idx, name); if (ret == 2) { if (strstr(this->name.c_str(), name) == this->name.c_str()) nos[idx] = irq; } } fclose(f); return true; } bool Device::isVfioPciDevice() const { return info.flags & VFIO_DEVICE_FLAGS_PCI; } Group::~Group() { Logger logger = logging.get("kernel:vfio"); logger->debug("Clean up group {} with fd {}", this->index, this->fd); /* Release memory and close fds */ devices.clear(); if (fd < 0) logger->debug("Destructing group that has not been attached"); else { int ret = ioctl(fd, VFIO_GROUP_UNSET_CONTAINER); if (ret != 0) logger->error("Cannot unset container for group fd {}", fd); ret = close(fd); if (ret != 0) logger->error("Cannot close group fd {}", fd); } } std::unique_ptr Group::attach(Container &container, int groupIndex) { Logger logger = logging.get("kernel:vfio"); std::unique_ptr group { new Group(groupIndex) }; group->container = &container; /* Open group fd */ std::stringstream groupPath; groupPath << VFIO_PATH << (container.isIommuEnabled() ? "" : "noiommu-") << groupIndex; logger->debug("path: {}", groupPath.str().c_str()); group->fd = open(groupPath.str().c_str(), O_RDWR); if (group->fd < 0) { logger->error("Failed to open VFIO group {}", group->index); return nullptr; } logger->debug("VFIO group {} (fd {}) has path {}", groupIndex, group->fd, groupPath.str()); /* Claim group ownership */ int ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container.getFd()); if (ret < 0) { logger->error("Failed to attach VFIO group {} to container fd {} (error {})", group->index, container.getFd(), ret); return nullptr; } /* Set IOMMU type */ int iommu_type = container.isIommuEnabled() ? VFIO_TYPE1_IOMMU : VFIO_NOIOMMU_IOMMU; ret = ioctl(container.getFd(), VFIO_SET_IOMMU, iommu_type); if (ret < 0) { logger->error("Failed to set IOMMU type of container: {}", ret); return nullptr; } /* Check group viability and features */ group->status.argsz = sizeof(group->status); ret = ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group->status); if (ret < 0) { logger->error("Failed to get VFIO group status"); return nullptr; } if (!(group->status.flags & VFIO_GROUP_FLAGS_VIABLE)) { logger->error("VFIO group is not available: bind all devices to the VFIO driver!"); return nullptr; } return group; } } /* namespace villas */