1
0
Fork 0
mirror of https://git.rwth-aachen.de/acs/public/villas/node/ synced 2025-03-09 00:00:00 +01:00

Merge branch 'feature/gpu' into 'develop'

Add basic GPU/CUDA integration as a shared library

See merge request acs/public/villas/VILLASfpga-code!9
This commit is contained in:
Steffen Vogel 2018-05-16 11:15:36 +02:00
commit 929ed2f393
33 changed files with 1577 additions and 145 deletions

6
fpga/.gitmodules vendored
View file

@ -4,3 +4,9 @@
[submodule "thirdparty/libxil"]
path = thirdparty/libxil
url = https://git.rwth-aachen.de/acs/public/villas/libxil.git
[submodule "lib/gpu/gdrcopy"]
path = lib/gpu/gdrcopy
url = https://github.com/daniel-k/gdrcopy.git
[submodule "thirdparty/udmabuf"]
path = thirdparty/udmabuf
url = https://github.com/ikwzm/udmabuf

View file

@ -120,30 +120,30 @@
"M_AXI_MM2S": {
"pcie_0_axi_pcie_0": {
"BAR0": {
"baseaddr": 2147483648,
"baseaddr": 0,
"highaddr": 4294967295,
"size": 2147483648
"size": 4294967296
}
}
},
"M_AXI_S2MM": {
"pcie_0_axi_pcie_0": {
"BAR0": {
"baseaddr": 2147483648,
"baseaddr": 0,
"highaddr": 4294967295,
"size": 2147483648
"size": 4294967296
}
}
}
},
"ports": [
{
"role": "initiator",
"role": "master",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
"name": "MM2S"
},
{
"role": "target",
"role": "slave",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:1",
"name": "S2MM"
}
@ -159,30 +159,30 @@
"M_AXI_MM2S": {
"pcie_0_axi_pcie_0": {
"BAR0": {
"baseaddr": 2147483648,
"baseaddr": 0,
"highaddr": 4294967295,
"size": 2147483648
"size": 4294967296
}
}
},
"M_AXI_S2MM": {
"pcie_0_axi_pcie_0": {
"BAR0": {
"baseaddr": 2147483648,
"baseaddr": 0,
"highaddr": 4294967295,
"size": 2147483648
"size": 4294967296
}
}
}
},
"ports": [
{
"role": "initiator",
"role": "master",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
"name": "MM2S"
},
{
"role": "target",
"role": "slave",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:6",
"name": "S2MM"
}
@ -214,22 +214,22 @@
"vlnv": "xilinx.com:ip:axis_switch:1.1",
"ports": [
{
"role": "initiator",
"role": "master",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
"name": "M03_AXIS"
},
{
"role": "target",
"role": "slave",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:3",
"name": "S03_AXIS"
},
{
"role": "initiator",
"role": "master",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
"name": "M04_AXIS"
},
{
"role": "target",
"role": "slave",
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:4",
"name": "S04_AXIS"
}
@ -237,7 +237,7 @@
"num_ports": 7
},
"hier_0_hls_dft_0": {
"vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.0",
"vlnv": "acs.eonerc.rwth-aachen.de:hls:hls_dft:1.1",
"ports": [
{
"role": "master",
@ -249,7 +249,10 @@
"target": "hier_0_axis_interconnect_0_axis_interconnect_0_xbar:5",
"name": "input_r"
}
]
],
"irqs": {
"interrupt": "pcie_0_axi_pcie_intc_0:1"
}
},
"hier_0_rtds_axis_0": {
"vlnv": "acs.eonerc.rwth-aachen.de:user:rtds_axis:1.0",
@ -358,6 +361,19 @@
}
}
}
},
"axi_bars": {
"BAR0": {
"translation": 0,
"baseaddr": 0,
"highaddr": 4294967295,
"size": 4294967296
}
},
"pcie_bars": {
"BAR0": {
"translation": 0
}
}
},
"pcie_0_axi_pcie_intc_0": {

View file

@ -5,6 +5,7 @@
#include <memory>
#include <sstream>
#include <string>
#include <fstream>
#include <stdexcept>
#include <algorithm>
@ -58,6 +59,12 @@ public:
operator==(const Edge& other)
{ return this->id == other.id; }
Vertex::Identifier getVertexTo() const
{ return to; }
Vertex::Identifier getVertexFrom() const
{ return from; }
private:
Identifier id;
Vertex::Identifier from;
@ -81,7 +88,7 @@ public:
std::shared_ptr<VertexType> getVertex(VertexIdentifier vertexId) const
{
if(vertexId < 0 or vertexId >= lastVertexId)
if(vertexId >= lastVertexId)
throw std::invalid_argument("vertex doesn't exist");
// cannot use [] operator, because creates non-existing elements
@ -92,7 +99,10 @@ public:
template<class UnaryPredicate>
VertexIdentifier findVertex(UnaryPredicate p)
{
for(auto& [vertexId, vertex] : vertices) {
for(auto& v : vertices) {
auto& vertexId = v.first;
auto& vertex = v.second;
if(p(vertex)) {
return vertexId;
}
@ -103,7 +113,7 @@ public:
std::shared_ptr<EdgeType> getEdge(EdgeIdentifier edgeId) const
{
if(edgeId < 0 or edgeId >= lastEdgeId)
if(edgeId >= lastEdgeId)
throw std::invalid_argument("edge doesn't exist");
// cannot use [] operator, because creates non-existing elements
@ -177,7 +187,9 @@ public:
// delete every edge that start or ends at this vertex
auto it = edges.begin();
while(it != edges.end()) {
auto& [edgeId, edge] = *it;
auto& edgeId = it->first;
auto& edge = it->second;
bool removeEdge = false;
if(edge->to == vertexId) {
@ -207,9 +219,17 @@ public:
vertexGetEdges(VertexIdentifier vertexId) const
{ return getVertex(vertexId)->edges; }
using check_path_fn = std::function<bool(const Path&)>;
static bool
checkPath(const Path&)
{ return true; }
bool getPath(VertexIdentifier fromVertexId,
VertexIdentifier toVertexId,
Path& path)
Path& path,
check_path_fn pathCheckFunc = checkPath)
{
if(fromVertexId == toVertexId) {
// arrived at the destination
@ -239,7 +259,8 @@ public:
path.push_back(edgeId);
// recursive, depth-first search
if(getPath(edgeOfFromVertex->to, toVertexId, path)) {
if(getPath(edgeOfFromVertex->to, toVertexId, path, pathCheckFunc) and
pathCheckFunc(path)) {
// path found, we're done
return true;
} else {
@ -252,11 +273,11 @@ public:
return false;
}
void dump()
void dump(const std::string& fileName = "")
{
logger->info("Vertices:");
for(auto& [vertexId, vertex] : vertices) {
(void) vertexId;
for(auto& v : vertices) {
auto& vertex = v.second;
// format connected vertices into a list
std::stringstream ssEdges;
@ -267,11 +288,29 @@ public:
logger->info(" {} connected to: {}", *vertex, ssEdges.str());
}
std::fstream s(fileName, s.out | s.trunc);
if(s.is_open()) {
s << "digraph memgraph {" << std::endl;
}
logger->info("Edges:");
for(auto& [edgeId, edge] : edges) {
(void) edgeId;
for(auto& e : edges) {
auto& edge = e.second;
logger->info(" {}: {} -> {}", *edge, edge->from, edge->to);
if(s.is_open()) {
auto from = getVertex(edge->from);
auto to = getVertex(edge->to);
s << std::dec;
s << " \"" << *from << "\" -> \"" << *to << "\""
<< " [label=\"" << *edge << "\"];" << std::endl;
}
}
if(s.is_open()) {
s << "}" << std::endl;
s.close();
}
}

View file

@ -98,8 +98,9 @@ public: // TODO: make this private
std::string name; /**< The name of the FPGA card */
struct pci *pci;
struct pci* pci;
struct pci_device filter; /**< Filter for PCI device. */
struct pci_device* pdev; /**< PCI device handle */
/// The VFIO container that this card is part of
std::shared_ptr<VfioContainer> vfioContainer;

View file

@ -193,6 +193,10 @@ protected:
InterruptController*
getInterruptController(const std::string& interruptName) const;
MemoryManager::AddressSpaceId
getMasterAddrSpaceByInterface(const std::string& masterInterfaceName) const
{ return busMasterInterfaces.at(masterInterfaceName); }
protected:
struct IrqPort {
int num;

View file

@ -55,7 +55,9 @@ public:
bool readComplete()
{ return hasScatterGather() ? readCompleteSG() : readCompleteSimple(); }
bool pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
bool memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len);
bool makeAccesibleFromVA(const MemoryBlock& mem);
inline bool
hasScatterGather() const
@ -72,6 +74,8 @@ private:
bool writeCompleteSimple();
bool readCompleteSimple();
bool isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface);
private:
static constexpr char registerMemory[] = "Reg";

View file

@ -52,6 +52,19 @@ public:
private:
static constexpr char axiInterface[] = "M_AXI";
static constexpr char pcieMemory[] = "BAR0";
struct AxiBar {
uintptr_t base;
size_t size;
uintptr_t translation;
};
struct PciBar {
uintptr_t translation;
};
std::map<std::string, AxiBar> axiToPcieTranslations;
std::map<std::string, PciBar> pcieToAxiTranslations;
};
@ -64,6 +77,8 @@ public:
getCompatibleVlnvString()
{ return "xilinx.com:ip:axi_pcie:"; }
bool configureJson(IpCore& ip, json_t *json_ip);
IpCore* create()
{ return new AxiPciExpressBridge; }

View file

@ -9,6 +9,7 @@
#pragma once
#include <stdint.h>
#include "list.h"
#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f)
@ -33,6 +34,13 @@ struct pci_device {
} slot; /**< Bus, Device, Function (BDF) */
};
struct pci_region {
int num;
uintptr_t start;
uintptr_t end;
unsigned long long flags;
};
struct pci {
struct list devices; /**< List of available PCI devices in the system (struct pci_device) */
};
@ -66,6 +74,8 @@ int pci_attach_driver(const struct pci_device *d, const char *driver);
/** Return the IOMMU group of this PCI device or -1 if the device is not in a group. */
int pci_get_iommu_group(const struct pci_device *d);
size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions);
#ifdef __cplusplus
}
#endif

View file

@ -18,7 +18,8 @@
#include <linux/vfio.h>
#include <sys/mman.h>
#define VFIO_DEV(x) "/dev/vfio/" x
#define VFIO_PATH "/dev/vfio/"
#define VFIO_DEV VFIO_PATH "vfio"
/* Forward declarations */
struct pci_device;
@ -90,7 +91,7 @@ public:
~VfioGroup();
static std::unique_ptr<VfioGroup>
attach(int containerFd, int groupIndex);
attach(VfioContainer& container, int groupIndex);
private:
/// VFIO group file descriptor
@ -135,6 +136,12 @@ public:
/** munmap() a region which has been mapped by vfio_map_region() */
bool memoryUnmap(uintptr_t phys, size_t length);
bool isIommuEnabled() const
{ return this->hasIommu; }
const int& getFd() const
{ return fd; }
private:
VfioGroup& getOrAttachGroup(int index);
@ -143,6 +150,7 @@ private:
int version;
int extensions;
uint64_t iova_next; /**< Next free IOVA address */
bool hasIommu;
/// All groups bound to this container
std::list<std::unique_ptr<VfioGroup>> groups;

View file

@ -116,6 +116,8 @@ public:
free = [&](MemoryBlock* mem) {
logger->warn("no free callback defined for addr space {}, not freeing",
mem->getAddrSpaceId());
removeMemoryBlock(*mem);
};
}
@ -128,6 +130,21 @@ public:
{
const size_t size = num * sizeof(T);
auto mem = allocateBlock(size);
// Check if the allocated memory is really accessible by writing to the
// allocated memory and reading back. Exponentially increase offset to
// speed up testing.
MemoryAccessor<volatile uint8_t> byteAccessor(*mem);
size_t idx = 0;
for(int i = 0; idx < mem->getSize(); i++, idx = (1 << i)) {
auto val = static_cast<uint8_t>(i);
byteAccessor[idx] = val;
if(byteAccessor[idx] != val) {
logger->error("Cannot access allocated memory");
throw std::bad_alloc();
}
}
return MemoryAccessor<T>(std::move(mem));
}
@ -179,6 +196,9 @@ public:
size_t getAvailableMemory() const
{ return memorySize - nextFreeAddress; }
size_t getSize() const
{ return memorySize; }
std::string getName() const;
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
@ -225,4 +245,41 @@ private:
static HostRamAllocator allocator;
};
class HostDmaRam {
private:
static std::string
getUdmaBufName(int num);
static std::string
getUdmaBufBasePath(int num);
static size_t
getUdmaBufBufSize(int num);
static uintptr_t
getUdmaBufPhysAddr(int num);
public:
class HostDmaRamAllocator : public LinearAllocator {
public:
HostDmaRamAllocator(int num);
virtual ~HostDmaRamAllocator();
std::string getName() const
{ return getUdmaBufName(num); }
private:
int num;
};
static HostDmaRamAllocator&
getAllocator(int num = 0);
private:
static std::map<int, std::unique_ptr<HostDmaRamAllocator>> allocators;
};
} // namespace villas

View file

@ -3,6 +3,7 @@
#include <cstdint>
#include <string>
#include <map>
#include <stdexcept>
#include <unistd.h>
#include "log.hpp"
@ -73,7 +74,12 @@ private:
// This is a singleton, so private constructor ...
MemoryManager() :
memoryGraph("MemoryGraph"),
logger(loggerGetOrCreate("MemoryManager")) {}
logger(loggerGetOrCreate("MemoryManager"))
{
pathCheckFunc = [&](const MemoryGraph::Path& path) {
return this->pathCheck(path);
};
}
// ... and no copying or assigning
MemoryManager(const MemoryManager&) = delete;
@ -109,7 +115,7 @@ private:
return stream << static_cast<const Edge&>(mapping) << " = "
<< mapping.name
<< std::hex
<< "(src=0x" << mapping.src
<< " (src=0x" << mapping.src
<< ", dest=0x" << mapping.dest
<< ", size=0x" << mapping.size
<< ")";
@ -144,6 +150,8 @@ public:
using AddressSpaceId = MemoryGraph::VertexIdentifier;
using MappingId = MemoryGraph::EdgeIdentifier;
struct InvalidTranslation : public std::exception {};
/// Get singleton instance
static MemoryManager&
get();
@ -152,6 +160,10 @@ public:
getProcessAddressSpace()
{ return getOrCreateAddressSpace("villas-fpga"); }
AddressSpaceId
getPciAddressSpace()
{ return getOrCreateAddressSpace("PCIe"); }
AddressSpaceId
getProcessAddressSpaceMemoryBlock(const std::string& memoryBlock)
{ return getOrCreateAddressSpace(getSlaveAddrSpaceName("villas-fpga", memoryBlock)); }
@ -184,6 +196,9 @@ public:
AddressSpaceId
findAddressSpace(const std::string& name);
std::list<AddressSpaceId>
findPath(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);
MemoryTranslation
getTranslation(AddressSpaceId fromAddrSpaceId, AddressSpaceId toAddrSpaceId);
@ -203,6 +218,9 @@ public:
dump()
{ memoryGraph.dump(); }
void
dumpToFile(const std::string& fileName)
{ memoryGraph.dump(fileName); }
private:
/// Convert a Mapping to MemoryTranslation for calculations
@ -210,6 +228,8 @@ private:
getTranslationFromMapping(const Mapping& mapping)
{ return MemoryTranslation(mapping.src, mapping.dest, mapping.size); }
bool
pathCheck(const MemoryGraph::Path& path);
private:
/// Directed graph that stores address spaces and memory mappings
@ -221,6 +241,8 @@ private:
/// Logger for universal access in this class
SpdLogger logger;
MemoryGraph::check_path_fn pathCheckFunc;
/// Static pointer to global instance, because this is a singleton
static MemoryManager* instance;
};

View file

@ -40,6 +40,7 @@ public:
Unknown,
FpgaIp,
FpgaCard,
Gpu
};
Plugin(Type type, const std::string& name);

View file

@ -10,6 +10,15 @@ namespace utils {
std::vector<std::string>
tokenize(std::string s, std::string delimiter);
template<typename T>
void
assertExcept(bool condition, const T& exception)
{
if(not condition)
throw exception;
}
} // namespace utils
} // namespace villas

View file

@ -36,6 +36,16 @@ add_library(villas-fpga SHARED ${SOURCES})
target_link_libraries(villas-fpga PUBLIC villas-common)
# GPU library is optional, check for CUDA presence
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
add_subdirectory(gpu)
target_link_libraries(villas-fpga PUBLIC villas-gpu)
else()
message("No CUDA support, not building GPU library")
endif()
target_compile_definitions(villas-fpga PRIVATE
BUILDID=\"abc\"
_GNU_SOURCE

View file

@ -165,6 +165,11 @@ PCIeCard::lookupIp(const Vlnv& vlnv) const
bool
PCIeCard::mapMemoryBlock(const MemoryBlock& block)
{
if(not vfioContainer->isIommuEnabled()) {
logger->warn("VFIO mapping not supported without IOMMU");
return false;
}
auto& mm = MemoryManager::get();
const auto& addrSpaceId = block.getAddrSpaceId();
@ -175,7 +180,6 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
logger->debug("Create VFIO mapping for {}", addrSpaceId);
}
auto translationFromProcess = mm.getTranslationFromProcess(addrSpaceId);
uintptr_t processBaseAddr = translationFromProcess.getLocalAddr(0);
uintptr_t iovaAddr = vfioContainer->memoryMap(processBaseAddr,
@ -188,10 +192,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
return false;
}
mm.createMapping(iovaAddr, 0, block.getSize(),
"vfio",
"VFIO-D2H",
this->addrSpaceIdDeviceToHost,
addrSpaceId);
@ -203,11 +205,8 @@ PCIeCard::mapMemoryBlock(const MemoryBlock& block)
bool
fpga::PCIeCard::init()
PCIeCard::init()
{
struct pci_device *pdev;
auto& mm = MemoryManager::get();
logger = getLogger();
logger->info("Initializing FPGA card {}", name);
@ -223,41 +222,12 @@ fpga::PCIeCard::init()
VfioDevice& device = vfioContainer->attachDevice(pdev);
this->vfioDevice = &device;
/* Enable memory access and PCI bus mastering for DMA */
if (not device.pciEnable()) {
logger->error("Failed to enable PCI device");
return false;
}
/* Map PCIe BAR */
const void* bar0_mapped = vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
if (bar0_mapped == MAP_FAILED) {
logger->error("Failed to mmap() BAR0");
return false;
}
// determine size of BAR0 region
const size_t bar0_size = vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
/* Link mapped BAR0 to global memory graph */
// get the address space of the current application
const auto villasAddrSpace = mm.getProcessAddressSpace();
// get the address space for the PCIe proxy we use with VFIO
const auto cardPCIeAddrSpaceName = mm.getMasterAddrSpaceName(name, "PCIe");
// create a new address space for this FPGA card
addrSpaceIdHostToDevice = mm.getOrCreateAddressSpace(cardPCIeAddrSpaceName);
// create a mapping from our address space to the FPGA card via vfio
mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
0, bar0_size, "VFIO_map",
villasAddrSpace, addrSpaceIdHostToDevice);
/* Reset system? */
if (do_reset) {
/* Reset / detect PCI device */

View file

@ -1,6 +1,9 @@
#include <sys/mman.h>
#include <unistd.h>
#include <sstream>
#include <fstream>
#include "memory.hpp"
namespace villas {
@ -62,6 +65,8 @@ LinearAllocator::LinearAllocator(MemoryManager::AddressSpaceId memoryAddrSpaceId
mem->getSize(), mem->getOffset(), mem->getAddrSpaceId());
logger->warn("free() not implemented");
logger->debug("available memory: {:#x} bytes", getAvailableMemory());
removeMemoryBlock(*mem);
};
}
@ -133,7 +138,127 @@ HostRam::HostRamAllocator::HostRamAllocator() :
logger->warn("munmap() failed for {:#x} of size {:#x}",
mem->getOffset(), mem->getSize());
}
removeMemoryBlock(*mem);
};
}
std::map<int, std::unique_ptr<HostDmaRam::HostDmaRamAllocator>>
HostDmaRam::allocators;
HostDmaRam::HostDmaRamAllocator::HostDmaRamAllocator(int num) :
LinearAllocator(MemoryManager::get().getOrCreateAddressSpace(getUdmaBufName(num)), getUdmaBufBufSize(num)),
num(num)
{
auto& mm = MemoryManager::get();
logger = loggerGetOrCreate(getName());
if(getSize() == 0) {
logger->error("Zero-sized DMA buffer not supported, is the kernel module loaded?");
throw std::bad_alloc();
}
const uintptr_t base = getUdmaBufPhysAddr(num);
mm.createMapping(base, 0, getSize(), getName() + "-PCI",
mm.getPciAddressSpace(), getAddrSpaceId());
const auto bufPath = std::string("/dev/") + getUdmaBufName(num);
const int bufFd = open(bufPath.c_str(), O_RDWR | O_SYNC);
if(bufFd != -1) {
void* buf = mmap(nullptr, getSize(), PROT_READ|PROT_WRITE, MAP_SHARED, bufFd, 0);
close(bufFd);
if(buf != MAP_FAILED) {
mm.createMapping(reinterpret_cast<uintptr_t>(buf), 0, getSize(),
getName() + "-VA",
mm.getProcessAddressSpace(), getAddrSpaceId());
} else {
logger->warn("Cannot map {}", bufPath);
}
} else {
logger->warn("Cannot open {}", bufPath);
}
logger->info("Mapped {} of size {} bytes", bufPath, getSize());
}
HostDmaRam::HostDmaRamAllocator::~HostDmaRamAllocator()
{
auto& mm = MemoryManager::get();
void* baseVirt;
try {
auto translation = mm.getTranslationFromProcess(getAddrSpaceId());
baseVirt = reinterpret_cast<void*>(translation.getLocalAddr(0));
} catch(const std::out_of_range&) {
// not mapped, nothing to do
return;
}
logger->debug("Unmapping {}", getName());
// try to unmap it
if(::munmap(baseVirt, getSize()) != 0) {
logger->warn("munmap() failed for {:p} of size {:#x}",
baseVirt, getSize());
}
}
std::string
HostDmaRam::getUdmaBufName(int num)
{
std::stringstream name;
name << "udmabuf" << num;
return name.str();
}
std::string
HostDmaRam::getUdmaBufBasePath(int num)
{
std::stringstream path;
path << "/sys/class/udmabuf/udmabuf" << num << "/";
return path.str();
}
size_t
HostDmaRam::getUdmaBufBufSize(int num)
{
std::fstream s(getUdmaBufBasePath(num) + "size", s.in);
if(s.is_open()) {
std::string line;
if(std::getline(s, line)) {
return std::strtoul(line.c_str(), nullptr, 10);
}
}
return 0;
}
uintptr_t
HostDmaRam::getUdmaBufPhysAddr(int num)
{
std::fstream s(getUdmaBufBasePath(num) + "phys_addr", s.in);
if(s.is_open()) {
std::string line;
if(std::getline(s, line)) {
return std::strtoul(line.c_str(), nullptr, 16);
}
}
return UINTPTR_MAX;
}
HostDmaRam::HostDmaRamAllocator&HostDmaRam::getAllocator(int num)
{
auto& allocator = allocators[num];
if(not allocator) {
allocator = std::make_unique<HostDmaRamAllocator>(num);
}
return *allocator;
}
} // namespace villas

View file

@ -2,8 +2,11 @@
#include <limits>
#include <cstdint>
#include <villas/utils.hpp>
#include "memory_manager.hpp"
using namespace villas::utils;
namespace villas {
MemoryManager*
@ -70,17 +73,44 @@ MemoryManager::findAddressSpace(const std::string& name)
});
}
std::list<MemoryManager::AddressSpaceId>
MemoryManager::findPath(MemoryManager::AddressSpaceId fromAddrSpaceId,
MemoryManager::AddressSpaceId toAddrSpaceId)
{
std::list<AddressSpaceId> path;
auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
// find a path through the memory graph
MemoryGraph::Path pathGraph;
if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, pathGraph, pathCheckFunc)) {
logger->debug("No translation found from ({}) to ({})",
*fromAddrSpace, *toAddrSpace);
throw std::out_of_range("no translation found");
}
for(auto& mappingId : pathGraph) {
auto mapping = memoryGraph.getEdge(mappingId);
path.push_back(mapping->getVertexTo());
}
return path;
}
MemoryTranslation
MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
MemoryManager::AddressSpaceId toAddrSpaceId)
{
// find a path through the memory graph
MemoryGraph::Path path;
if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path)) {
if(not memoryGraph.getPath(fromAddrSpaceId, toAddrSpaceId, path, pathCheckFunc)) {
auto fromAddrSpace = memoryGraph.getVertex(fromAddrSpaceId);
auto toAddrSpace = memoryGraph.getVertex(toAddrSpaceId);
logger->error("No translation found from ({}) to ({})",
logger->debug("No translation found from ({}) to ({})",
*fromAddrSpace, *toAddrSpace);
throw std::out_of_range("no translation found");
@ -98,6 +128,26 @@ MemoryManager::getTranslation(MemoryManager::AddressSpaceId fromAddrSpaceId,
return translation;
}
bool
MemoryManager::pathCheck(const MemoryGraph::Path& path)
{
// start with an identity mapping
MemoryTranslation translation(0, 0, SIZE_MAX);
// Try to add all mappings together to a common translation. If this fails
// there is a non-overlapping window
for(auto& mappingId : path) {
auto mapping = memoryGraph.getEdge(mappingId);
try {
translation += getTranslationFromMapping(*mapping);
} catch(const InvalidTranslation&) {
return false;
}
}
return true;
}
uintptr_t
MemoryTranslation::getLocalAddr(uintptr_t addrInForeignAddrSpace) const
{
@ -124,9 +174,18 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
const uintptr_t this_dst_high = this->dst + this->size;
const uintptr_t other_src_high = other.src + other.size;
logger->debug("this->src: {:#x}", this->src);
logger->debug("this->dst: {:#x}", this->dst);
logger->debug("this->size: {:#x}", this->size);
logger->debug("other.src: {:#x}", other.src);
logger->debug("other.dst: {:#x}", other.dst);
logger->debug("other.size: {:#x}", other.size);
logger->debug("this_dst_high: {:#x}", this_dst_high);
logger->debug("other_src_high: {:#x}", other_src_high);
// make sure there is a common memory area
assert(other.src < this_dst_high);
assert(this->dst < other_src_high);
assertExcept(other.src < this_dst_high, MemoryManager::InvalidTranslation());
assertExcept(this->dst < other_src_high, MemoryManager::InvalidTranslation());
const uintptr_t hi = std::max(this_dst_high, other_src_high);
const uintptr_t lo = std::min(this->dst, other.src);
@ -135,33 +194,39 @@ MemoryTranslation::operator+=(const MemoryTranslation& other)
? (this_dst_high - other_src_high)
: (other_src_high - this_dst_high);
const uintptr_t diff_lo = (this->dst > other.src)
const bool otherSrcIsSmaller = this->dst > other.src;
const uintptr_t diff_lo = (otherSrcIsSmaller)
? (this->dst - other.src)
: (other.src - this->dst);
const size_t size = (hi - lo) - diff_hi - diff_lo;
logger->debug("hi: {:#x}", hi);
logger->debug("lo: {:#x}", lo);
logger->debug("diff_hi: {:#x}", diff_hi);
logger->debug("diff_lo: {:#x}", diff_lo);
logger->debug("this->src: 0x{:x}", this->src);
logger->debug("this->dst: 0x{:x}", this->dst);
logger->debug("this->size: 0x{:x}", this->size);
logger->debug("other.src: 0x{:x}", other.src);
logger->debug("other.dst: 0x{:x}", other.dst);
logger->debug("other.size: 0x{:x}", other.size);
logger->debug("this_dst_high: 0x{:x}", this_dst_high);
logger->debug("other_src_high: 0x{:x}", other_src_high);
logger->debug("hi: 0x{:x}", hi);
logger->debug("lo: 0x{:x}", lo);
logger->debug("diff_hi: 0x{:x}", diff_hi);
logger->debug("diff_hi: 0x{:x}", diff_lo);
logger->debug("size: 0x{:x}", size);
// new size of aperture, can only stay or shrink
this->size = (hi - lo) - diff_hi - diff_lo;
this->src += other.src;
this->dst += other.dst;
this->size = size;
// new translation will come out other's destination (by default)
this->dst = other.dst;
logger->debug("result src: 0x{:x}", this->src);
logger->debug("result dst: 0x{:x}", this->dst);
logger->debug("result size: 0x{:x}", this->size);
// the source stays the same and can only increase with merged translations
this->src = this->src;
if(otherSrcIsSmaller) {
// other mapping starts at lower addresses, so we actually arrive at
// higher addresses
this->dst += diff_lo;
} else {
// other mapping starts at higher addresses than this, so we have to
// increase the start
// NOTE: for addresses equality, this just adds 0
this->src += diff_lo;
}
logger->debug("result src: {:#x}", this->src);
logger->debug("result dst: {:#x}", this->dst);
logger->debug("result size: {:#x}", this->size);
return *this;
}

View file

@ -0,0 +1,26 @@
cmake_minimum_required(VERSION 3.8)
project(villas-gpu
VERSION 1.0
DESCRIPTION "VILLASgpu"
LANGUAGES C CXX CUDA)
add_subdirectory(gdrcopy)
add_library(villas-gpu SHARED gpu.cpp kernels.cu)
target_compile_options(villas-gpu PRIVATE -g)
set_source_files_properties(gpu.cpp PROPERTIES
LANGUAGE CUDA)
target_include_directories(villas-gpu PRIVATE /opt/cuda/include)
target_link_libraries(villas-gpu
PRIVATE villas-common gdrapi cuda)
target_include_directories(villas-gpu
PUBLIC
${CMAKE_CURRENT_LIST_DIR}/include
PRIVATE
${CMAKE_CURRENT_LIST_DIR})

1
fpga/lib/gpu/gdrcopy Submodule

@ -0,0 +1 @@
Subproject commit 0441daa447b80260c4e11096f03e88f7be08bfa2

474
fpga/lib/gpu/gpu.cpp Normal file
View file

@ -0,0 +1,474 @@
#include <cstdio>
#include <cstdint>
#include <sys/mman.h>
#include <memory>
#include <algorithm>
#include <villas/gpu.hpp>
#include <villas/log.hpp>
#include <villas/kernel/pci.h>
#include <memory_manager.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
#include <gdrapi.h>
#include "kernels.hpp"
namespace villas {
namespace gpu {
static GpuFactory gpuFactory;
GpuAllocator::GpuAllocator(Gpu& gpu) :
BaseAllocator(gpu.masterPciEAddrSpaceId),
gpu(gpu)
{
free = [&](MemoryBlock* mem) {
cudaSetDevice(gpu.gpuId);
if(cudaFree(reinterpret_cast<void*>(mem->getOffset())) != cudaSuccess) {
logger->warn("cudaFree() failed for {:#x} of size {:#x}",
mem->getOffset(), mem->getSize());
}
removeMemoryBlock(*mem);
};
}
std::string
villas::gpu::GpuAllocator::getName() const
{
std::stringstream name;
name << "GpuAlloc" << getAddrSpaceId();
return name.str();
}
GpuFactory::GpuFactory() :
Plugin(Plugin::Type::Gpu, "GPU")
{
logger = loggerGetOrCreate("GpuFactory");
}
// required to be defined here for PIMPL to compile
Gpu::~Gpu()
{
auto& mm = MemoryManager::get();
mm.removeAddressSpace(masterPciEAddrSpaceId);
}
// we use PIMPL in order to hide gdrcopy types from the public header
class Gpu::impl {
public:
gdr_t gdr;
struct pci_device pdev;
};
std::string Gpu::getName() const
{
cudaDeviceProp deviceProp;
if(cudaGetDeviceProperties(&deviceProp, gpuId) != cudaSuccess) {
// logger not yet availabe
loggerGetOrCreate("Gpu")->error("Cannot retrieve properties for GPU {}", gpuId);
throw std::exception();
}
std::stringstream name;
name << "gpu" << gpuId << "(" << deviceProp.name << ")";
return name.str();
}
bool Gpu::registerIoMemory(const MemoryBlock& mem)
{
auto& mm = MemoryManager::get();
const auto pciAddrSpaceId = mm.getPciAddressSpace();
// Check if we need to map anything at all, maybe it's already reachable
try {
// TODO: there might already be a path through the graph, but there's no
// overlapping window, so this will fail badly!
auto translation = mm.getTranslation(masterPciEAddrSpaceId,
mem.getAddrSpaceId());
if(translation.getSize() >= mem.getSize()) {
// there is already a sufficient path
logger->debug("Already mapped through another mapping");
return true;
} else {
logger->warn("There's already a mapping, but too small");
}
} catch(const std::out_of_range&) {
// not yet reachable, that's okay, proceed
}
// In order to register IO memory with CUDA, it has to be mapped to the VA
// space of the current process (requirement of CUDA API). Check this now.
MemoryManager::AddressSpaceId mappedBaseAddrSpaceId;
try {
auto path = mm.findPath(mm.getProcessAddressSpace(), mem.getAddrSpaceId());
// first node in path is the mapped memory space whose virtual address
// we need to hand to CUDA
mappedBaseAddrSpaceId = path.front();
} catch (const std::out_of_range&) {
logger->error("Memory not reachable from process, but required by CUDA");
return false;
}
// determine the base address of the mapped memory region needed by CUDA
const auto translationProcess = mm.getTranslationFromProcess(mappedBaseAddrSpaceId);
const uintptr_t baseAddrForProcess = translationProcess.getLocalAddr(0);
// Now check that the memory is also reachable via PCIe bus, otherwise GPU
// has no means to access it.
uintptr_t baseAddrOnPci;
size_t sizeOnPci;
try {
auto translationPci = mm.getTranslation(pciAddrSpaceId,
mappedBaseAddrSpaceId);
baseAddrOnPci = translationPci.getLocalAddr(0);
sizeOnPci = translationPci.getSize();
} catch(const std::out_of_range&) {
logger->error("Memory is not reachable via PCIe bus");
return false;
}
if(sizeOnPci < mem.getSize()) {
logger->warn("VA mapping of IO memory is too small: {:#x} instead of {:#x} bytes",
sizeOnPci, mem.getSize());
logger->warn("If something later on fails or behaves strangely, this might be the cause!");
}
cudaSetDevice(gpuId);
auto baseAddrVA = reinterpret_cast<void*>(baseAddrForProcess);
if(cudaHostRegister(baseAddrVA, sizeOnPci, cudaHostRegisterIoMemory) != cudaSuccess) {
logger->error("Cannot register IO memory for block {}", mem.getAddrSpaceId());
return false;
}
void* devicePointer = nullptr;
if(cudaHostGetDevicePointer(&devicePointer, baseAddrVA, 0) != cudaSuccess) {
logger->error("Cannot retrieve device pointer for IO memory");
return false;
}
mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), baseAddrOnPci,
sizeOnPci, "CudaIoMem", masterPciEAddrSpaceId, pciAddrSpaceId);
return true;
}
bool
Gpu::registerHostMemory(const MemoryBlock& mem)
{
auto& mm = MemoryManager::get();
auto translation = mm.getTranslationFromProcess(mem.getAddrSpaceId());
auto localBase = reinterpret_cast<void*>(translation.getLocalAddr(0));
int ret = cudaHostRegister(localBase, mem.getSize(), 0);
if(ret != cudaSuccess) {
logger->error("Cannot register memory block {} addr={:p} size={:#x} to CUDA: ret={}",
mem.getAddrSpaceId(), localBase, mem.getSize(), ret);
return false;
}
void* devicePointer = nullptr;
ret = cudaHostGetDevicePointer(&devicePointer, localBase, 0);
if(ret != cudaSuccess) {
logger->error("Cannot retrieve device pointer for IO memory: ret={}", ret);
return false;
}
mm.createMapping(reinterpret_cast<uintptr_t>(devicePointer), 0, mem.getSize(),
"CudaHostMem", masterPciEAddrSpaceId, mem.getAddrSpaceId());
return true;
}
bool Gpu::makeAccessibleToPCIeAndVA(const MemoryBlock& mem)
{
if(pImpl->gdr == nullptr) {
logger->warn("GDRcopy not available");
return false;
}
auto& mm = MemoryManager::get();
try {
auto path = mm.findPath(masterPciEAddrSpaceId, mem.getAddrSpaceId());
// if first hop is the PCIe bus, we know that memory is off-GPU
if(path.front() == mm.getPciAddressSpace()) {
throw std::out_of_range("Memory block is outside of this GPU");
}
} catch (const std::out_of_range&) {
logger->error("Trying to map non-GPU memory block");
return false;
}
logger->debug("retrieve complete device pointer from point of view of GPU");
// retrieve complete device pointer from point of view of GPU
auto translation = mm.getTranslation(masterPciEAddrSpaceId,
mem.getAddrSpaceId());
CUdeviceptr devptr = translation.getLocalAddr(0);
int ret;
// required to set this flag before mapping
unsigned int enable = 1;
ret = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, devptr);
if(ret != CUDA_SUCCESS) {
logger->error("Cannot set pointer attributes on memory block {}: {}",
mem.getAddrSpaceId(), ret);
return false;
}
gdr_mh_t mh;
ret = gdr_pin_buffer(pImpl->gdr, devptr, mem.getSize(), 0, 0, &mh);
if(ret != 0) {
logger->error("Cannot pin memory block {} via gdrcopy: {}",
mem.getAddrSpaceId(), ret);
return false;
}
void* bar = nullptr;
ret = gdr_map(pImpl->gdr, mh, &bar, mem.getSize());
if(ret != 0) {
logger->error("Cannot map memory block {} via gdrcopy: {}",
mem.getAddrSpaceId(), ret);
return false;
}
gdr_info_t info;
ret = gdr_get_info(pImpl->gdr, mh, &info);
if(ret != 0) {
logger->error("Cannot get info for mapping of memory block {}: {}",
mem.getAddrSpaceId(), ret);
return false;
}
const uintptr_t offset = info.va - devptr;
const uintptr_t userPtr = reinterpret_cast<uintptr_t>(bar) + offset;
logger->debug("BAR ptr: {:p}", bar);
logger->debug("info.va: {:#x}", info.va);
logger->debug("info.mapped_size: {:#x}", info.mapped_size);
logger->debug("info.page_size: {:#x}", info.page_size);
logger->debug("offset: {:#x}", offset);
logger->debug("user pointer: {:#x}", userPtr);
// mapping to acceses memory block from process
mm.createMapping(userPtr, 0, info.mapped_size, "GDRcopy",
mm.getProcessAddressSpace(), mem.getAddrSpaceId());
// retrieve bus address
uint64_t addr[8];
ret = gdr_map_dma(pImpl->gdr, mh, 3, 0, 0, addr, 8);
for(int i = 0; i < ret; i++) {
logger->debug("DMA addr[{}]: {:#x}", i, addr[i]);
}
if(ret != 1) {
logger->error("Only one DMA address per block supported at the moment");
return false;
}
// mapping to access memory block from peer devices via PCIe
mm.createMapping(addr[0], 0, mem.getSize(), "GDRcopyDMA",
mm.getPciAddressSpace(), mem.getAddrSpaceId());
return true;
}
bool
Gpu::makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem)
{
// Check which kind of memory this is and where it resides
// There are two possibilities:
// - Host memory not managed by CUDA
// - IO memory somewhere on the PCIe bus
auto& mm = MemoryManager::get();
bool isIoMemory = false;
try {
auto path = mm.findPath(mm.getPciAddressSpace(), mem.getAddrSpaceId());
isIoMemory = true;
} catch(const std::out_of_range&) {
// not reachable via PCI -> not IO memory
}
if(isIoMemory) {
logger->debug("Memory block {} is assumed to be IO memory",
mem.getAddrSpaceId());
return registerIoMemory(mem);
} else {
logger->debug("Memory block {} is assumed to be non-CUDA host memory",
mem.getAddrSpaceId());
return registerHostMemory(mem);
}
}
void Gpu::memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
{
auto& mm = MemoryManager::get();
auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
src.getAddrSpaceId());
const void* src_buf = reinterpret_cast<void*>(src_translation.getLocalAddr(0));
auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
dst.getAddrSpaceId());
void* dst_buf = reinterpret_cast<void*>(dst_translation.getLocalAddr(0));
cudaSetDevice(gpuId);
cudaMemcpy(dst_buf, src_buf, size, cudaMemcpyDefault);
}
void Gpu::memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size)
{
auto& mm = MemoryManager::get();
auto src_translation = mm.getTranslation(masterPciEAddrSpaceId,
src.getAddrSpaceId());
auto src_buf = reinterpret_cast<uint8_t*>(src_translation.getLocalAddr(0));
auto dst_translation = mm.getTranslation(masterPciEAddrSpaceId,
dst.getAddrSpaceId());
auto dst_buf = reinterpret_cast<uint8_t*>(dst_translation.getLocalAddr(0));
cudaSetDevice(gpuId);
kernel_memcpy<<<1, 1>>>(dst_buf, src_buf, size);
cudaDeviceSynchronize();
}
std::unique_ptr<villas::MemoryBlock, villas::MemoryBlock::deallocator_fn>
GpuAllocator::allocateBlock(size_t size)
{
cudaSetDevice(gpu.gpuId);
void* addr;
if(cudaSuccess != cudaMalloc(&addr, size)) {
logger->error("cudaMalloc(..., size={}) failed", size);
throw std::bad_alloc();
}
auto& mm = MemoryManager::get();
// assemble name for this block
std::stringstream name;
name << std::showbase << std::hex << reinterpret_cast<uintptr_t>(addr);
auto blockName = mm.getSlaveAddrSpaceName(getName(), name.str());
auto blockAddrSpaceId = mm.getOrCreateAddressSpace(blockName);
const auto localAddr = reinterpret_cast<uintptr_t>(addr);
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
mem(new MemoryBlock(localAddr, size, blockAddrSpaceId), this->free);
insertMemoryBlock(*mem);
gpu.makeAccessibleToPCIeAndVA(*mem);
return mem;
}
Gpu::Gpu(int gpuId) :
pImpl{std::make_unique<impl>()},
gpuId(gpuId)
{
logger = loggerGetOrCreate(getName());
pImpl->gdr = gdr_open();
if(pImpl->gdr == nullptr) {
logger->warn("No GDRcopy support enabled, cannot open /dev/gdrdrv");
}
}
bool Gpu::init()
{
auto& mm = MemoryManager::get();
const auto gpuPciEAddrSpaceName = mm.getMasterAddrSpaceName(getName(), "PCIe");
masterPciEAddrSpaceId = mm.getOrCreateAddressSpace(gpuPciEAddrSpaceName);
allocator = std::make_unique<GpuAllocator>(*this);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, gpuId);
pImpl->pdev.slot = {
deviceProp.pciDomainID,
deviceProp.pciBusID,
deviceProp.pciDeviceID,
0};
struct pci_region* pci_regions = nullptr;
const size_t pci_num_regions = pci_get_regions(&pImpl->pdev, &pci_regions);
for(size_t i = 0; i < pci_num_regions; i++) {
const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
logger->info("BAR{}: bus addr={:#x} size={:#x}",
pci_regions[i].num, pci_regions[i].start, region_size);
char name[] = "BARx";
name[3] = '0' + pci_regions[i].num;
auto gpuBarXAddrSpaceName = mm.getSlaveAddrSpaceName(getName(), name);
auto gpuBarXAddrSpaceId = mm.getOrCreateAddressSpace(gpuBarXAddrSpaceName);
mm.createMapping(pci_regions[i].start, 0, region_size,
std::string("PCI-") + name,
mm.getPciAddressSpace(), gpuBarXAddrSpaceId);
}
free(pci_regions);
return true;
}
std::list<std::unique_ptr<Gpu>>
GpuFactory::make()
{
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
std::list<std::unique_ptr<Gpu>> gpuList;
for(int gpuId = 0; gpuId < deviceCount; gpuId++) {
if(cudaSetDevice(gpuId) != cudaSuccess) {
logger->warn("Cannot activate GPU {}", gpuId);
continue;
}
auto gpu = std::make_unique<Gpu>(gpuId);
if(not gpu->init()) {
logger->warn("Cannot initialize GPU {}", gpuId);
continue;
}
gpuList.emplace_back(std::move(gpu));
}
logger->info("Initialized {} GPUs", gpuList.size());
for(auto& gpu : gpuList) {
logger->debug(" - {}", gpu->getName());
}
return gpuList;
}
} // namespace villas
} // namespace gpu

View file

@ -0,0 +1,87 @@
#pragma once
#include <sstream>
#include <plugin.hpp>
#include <memory_manager.hpp>
#include <memory.hpp>
#include <villas/log.hpp>
namespace villas {
namespace gpu {
class GpuAllocator;
class Gpu {
friend GpuAllocator;
public:
Gpu(int gpuId);
~Gpu();
bool init();
std::string getName() const;
GpuAllocator& getAllocator() const
{ return *allocator; }
bool makeAccessibleToPCIeAndVA(const MemoryBlock& mem);
/// Make some memory block accssible for this GPU
bool makeAccessibleFromPCIeOrHostRam(const MemoryBlock& mem);
void memcpySync(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
void memcpyKernel(const MemoryBlock& src, const MemoryBlock& dst, size_t size);
private:
bool registerIoMemory(const MemoryBlock& mem);
bool registerHostMemory(const MemoryBlock& mem);
private:
class impl;
std::unique_ptr<impl> pImpl;
// master, will be used to derived slave addr spaces for allocation
MemoryManager::AddressSpaceId masterPciEAddrSpaceId;
MemoryManager::AddressSpaceId slaveMemoryAddrSpaceId;
SpdLogger logger;
int gpuId;
std::unique_ptr<GpuAllocator> allocator;
};
class GpuAllocator : public BaseAllocator<GpuAllocator> {
public:
GpuAllocator(Gpu& gpu);
std::string getName() const;
std::unique_ptr<MemoryBlock, MemoryBlock::deallocator_fn>
allocateBlock(size_t size);
private:
Gpu& gpu;
};
class GpuFactory : public Plugin {
public:
GpuFactory();
std::list<std::unique_ptr<Gpu>>
make();
void run(void*);
private:
SpdLogger logger;
};
} // namespace villas
} // namespace gpu

42
fpga/lib/gpu/kernels.cu Normal file
View file

@ -0,0 +1,42 @@
#include <stdio.h>
#include <villas/gpu.hpp>
#include "kernels.hpp"
#include "cuda_runtime.h"
#include <cuda.h>
namespace villas {
namespace gpu {
__global__ void
kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter)
{
printf("[gpu] hello!\n");
printf("[gpu] mailbox: %p\n", mailbox);
printf("[kernel] started\n");
while(1) {
if (*mailbox == 1) {
*mailbox = 0;
printf("[gpu] counter = %d\n", *counter);
break;
}
}
printf("[gpu] quit\n");
}
__global__ void
kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length)
{
while(length > 0) {
*dst++ = *src++;
length--;
}
}
} // namespace villas
} // namespace gpu

16
fpga/lib/gpu/kernels.hpp Normal file
View file

@ -0,0 +1,16 @@
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
namespace villas {
namespace gpu {
__global__ void
kernel_mailbox(volatile uint32_t *mailbox, volatile uint32_t* counter);
__global__ void
kernel_memcpy(volatile uint8_t* dst, volatile uint8_t* src, size_t length);
} // namespace villas
} // namespace gpu

View file

@ -223,8 +223,8 @@ IpCoreFactory::make(PCIeCard* card, json_t *json_ips)
json_t* json_block;
json_object_foreach(json_instance, block_name, json_block) {
unsigned int base, high, size;
int ret = json_unpack(json_block, "{ s: i, s: i, s: i }",
json_int_t base, high, size;
int ret = json_unpack(json_block, "{ s: I, s: I, s: I }",
"baseaddr", &base,
"highaddr", &high,
"size", &size);

View file

@ -132,8 +132,14 @@ Dma::reset()
bool
Dma::pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
Dma::memcpy(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
{
if(len == 0)
return true;
if(not connectLoopback())
return false;
if(this->read(dst, len) == 0)
return false;
@ -153,17 +159,14 @@ Dma::pingPong(const MemoryBlock& src, const MemoryBlock& dst, size_t len)
size_t
Dma::write(const MemoryBlock& mem, size_t len)
{
// make sure memory is reachable
if(not card->mapMemoryBlock(mem)) {
logger->error("Memory not accessible by DMA");
return 0;
}
auto& mm = MemoryManager::get();
// user has to make sure that memory is accessible, otherwise this will throw
auto translation = mm.getTranslation(busMasterInterfaces[mm2sInterface],
mem.getAddrSpaceId());
const void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
logger->debug("Write to address: {:p}", buf);
return hasScatterGather() ? writeSG(buf, len) : writeSimple(buf, len);
}
@ -171,17 +174,14 @@ Dma::write(const MemoryBlock& mem, size_t len)
size_t
Dma::read(const MemoryBlock& mem, size_t len)
{
// make sure memory is reachable
if(not card->mapMemoryBlock(mem)) {
logger->error("Memory not accessible by DMA");
return 0;
}
auto& mm = MemoryManager::get();
// user has to make sure that memory is accessible, otherwise this will throw
auto translation = mm.getTranslation(busMasterInterfaces[s2mmInterface],
mem.getAddrSpaceId());
void* buf = reinterpret_cast<void*>(translation.getLocalAddr(0));
logger->debug("Read from address: {:p}", buf);
return hasScatterGather() ? readSG(buf, len) : readSimple(buf, len);
}
@ -350,6 +350,48 @@ Dma::readCompleteSimple()
}
bool
Dma::makeAccesibleFromVA(const MemoryBlock& mem)
{
// only symmetric mapping supported currently
if(isMemoryBlockAccesible(mem, s2mmInterface) and
isMemoryBlockAccesible(mem, mm2sInterface)) {
return true;
}
// try mapping via FPGA-card (VFIO)
if(not card->mapMemoryBlock(mem)) {
logger->error("Memory not accessible by DMA");
return false;
}
// sanity-check if mapping worked, this shouldn't be neccessary
if(not isMemoryBlockAccesible(mem, s2mmInterface) or
not isMemoryBlockAccesible(mem, mm2sInterface)) {
logger->error("Mapping memory via card didn't work, but reported success?!");
return false;
}
return true;
}
bool
Dma::isMemoryBlockAccesible(const MemoryBlock& mem, const std::string& interface)
{
auto& mm = MemoryManager::get();
try {
mm.findPath(getMasterAddrSpaceByInterface(interface), mem.getAddrSpaceId());
} catch(const std::out_of_range&) {
// not (yet) accessible
return false;
}
return true;
}
} // namespace ip
} // namespace fpga
} // namespace villas

View file

@ -42,14 +42,23 @@ AxiPciExpressBridge::init()
// Throw an exception if the is no bus master interface and thus no
// address space we can use for translation -> error
const MemoryManager::AddressSpaceId myAddrSpaceid =
busMasterInterfaces.at(axiInterface);
card->addrSpaceIdHostToDevice = busMasterInterfaces.at(axiInterface);
// Create an identity mapping from the FPGA card to this IP as an entry
// point to all other IPs in the FPGA, because Vivado will generate a
// memory view for this bridge that can see all others.
MemoryManager::get().createMapping(0x00, 0x00, SIZE_MAX, "PCIeBridge",
card->addrSpaceIdHostToDevice, myAddrSpaceid);
/* Map PCIe BAR0 via VFIO */
const void* bar0_mapped = card->vfioDevice->regionMap(VFIO_PCI_BAR0_REGION_INDEX);
if (bar0_mapped == MAP_FAILED) {
logger->error("Failed to mmap() BAR0");
return false;
}
// determine size of BAR0 region
const size_t bar0_size = card->vfioDevice->regionGetSize(VFIO_PCI_BAR0_REGION_INDEX);
// create a mapping from process address space to the FPGA card via vfio
mm.createMapping(reinterpret_cast<uintptr_t>(bar0_mapped),
0, bar0_size, "VFIO-H2D",
mm.getProcessAddressSpace(),
card->addrSpaceIdHostToDevice);
/* Make PCIe (IOVA) address space available to FPGA via BAR0 */
@ -63,6 +72,102 @@ AxiPciExpressBridge::init()
card->addrSpaceIdDeviceToHost =
mm.getOrCreateAddressSpace(addrSpaceNameDeviceToHost);
auto pciAddrSpaceId = mm.getPciAddressSpace();
struct pci_region* pci_regions = nullptr;
size_t num_regions = pci_get_regions(card->pdev, &pci_regions);
for(size_t i = 0; i < num_regions; i++) {
const size_t region_size = pci_regions[i].end - pci_regions[i].start + 1;
char barName[] = "BARx";
barName[3] = '0' + pci_regions[i].num;
auto pciBar = pcieToAxiTranslations.at(barName);
logger->info("PCI-BAR{}: bus addr={:#x} size={:#x}",
pci_regions[i].num, pci_regions[i].start, region_size);
logger->info("PCI-BAR{}: AXI translation offset {:#x}",
i, pciBar.translation);
mm.createMapping(pci_regions[i].start, pciBar.translation, region_size,
std::string("PCI-") + barName,
pciAddrSpaceId, card->addrSpaceIdHostToDevice);
}
if(pci_regions != nullptr) {
logger->debug("freeing pci regions");
free(pci_regions);
}
for(auto& [barName, axiBar] : axiToPcieTranslations) {
logger->info("AXI-{}: bus addr={:#x} size={:#x}",
barName, axiBar.base, axiBar.size);
logger->info("AXI-{}: PCI translation offset: {:#x}",
barName, axiBar.translation);
auto barXAddrSpaceName = mm.getSlaveAddrSpaceName(getInstanceName(), barName);
auto barXAddrSpaceId = mm.getOrCreateAddressSpace(barXAddrSpaceName);
// base is already incorporated into mapping of each IP by Vivado, so
// the mapping src has to be 0
mm.createMapping(0, axiBar.translation, axiBar.size,
std::string("AXI-") + barName,
barXAddrSpaceId, pciAddrSpaceId);
}
return true;
}
bool
AxiPciExpressBridgeFactory::configureJson(IpCore& ip, json_t* json_ip)
{
auto logger = getLogger();
auto& pcie = reinterpret_cast<AxiPciExpressBridge&>(ip);
for(auto barType : std::list<std::string>{"axi_bars", "pcie_bars"}) {
json_t* json_bars = json_object_get(json_ip, barType.c_str());
if(not json_is_object(json_bars)) {
return false;
}
json_t* json_bar;
const char* bar_name;
json_object_foreach(json_bars, bar_name, json_bar) {
unsigned int translation;
int ret = json_unpack(json_bar, "{ s: i }", "translation", &translation);
if(ret != 0) {
logger->error("Cannot parse {}/{}", barType, bar_name);
return false;
}
if(barType == "axi_bars") {
json_int_t base, high, size;
int ret = json_unpack(json_bar, "{ s: I, s: I, s: I }",
"baseaddr", &base,
"highaddr", &high,
"size", &size);
if(ret != 0) {
logger->error("Cannot parse {}/{}", barType, bar_name);
return false;
}
pcie.axiToPcieTranslations[bar_name] = {
.base = static_cast<uintptr_t>(base),
.size = static_cast<size_t>(size),
.translation = translation
};
} else {
pcie.pcieToAxiTranslations[bar_name] = {
.translation = translation
};
}
}
}
return true;
}

View file

@ -254,10 +254,78 @@ struct pci_device * pci_lookup_device(struct pci *p, struct pci_device *f)
return list_search(&p->devices, (cmp_cb_t) pci_device_compare, (void *) f);
}
size_t pci_get_regions(const struct pci_device *d, struct pci_region** regions)
{
FILE* f;
char sysfs[1024];
assert(regions != NULL);
snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/resource",
SYSFS_PATH, d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
f = fopen(sysfs, "r");
if (!f)
serror("Failed to open resource mapping %s", sysfs);
struct pci_region _regions[8];
struct pci_region* cur_region = _regions;
size_t valid_regions = 0;
ssize_t bytesRead;
char* line = NULL;
size_t len = 0;
int region = 0;
// cap to 8 regions, just because we don't know how many may exist
while(region < 8 && (bytesRead = getline(&line, &len, f)) != -1) {
unsigned long long tokens[3];
char* s = line;
for(int i = 0; i < 3; i++) {
char* end;
tokens[i] = strtoull(s, &end, 16);
if(s == end) {
printf("Error parsing line %d of %s\n", region + 1, sysfs);
tokens[0] = tokens[1] = 0; // mark invalid
break;
}
s = end;
}
free(line);
// required for getline() to allocate a new buffer on the next iteration
line = NULL;
len = 0;
if(tokens[0] != tokens[1]) {
// this is a valid region
cur_region->num = region;
cur_region->start = tokens[0];
cur_region->end = tokens[1];
cur_region->flags = tokens[2];
cur_region++;
valid_regions++;
}
region++;
}
if(valid_regions > 0) {
const size_t len = valid_regions * sizeof (struct pci_region);
*regions = malloc(len);
memcpy(*regions, _regions, len);
}
return valid_regions;
}
int pci_get_driver(const struct pci_device *d, char *buf, size_t buflen)
{
int ret;
char sysfs[1024], syml[1024];
memset(syml, 0, sizeof(syml));
snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/driver", SYSFS_PATH,
d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);
@ -305,6 +373,7 @@ int pci_get_iommu_group(const struct pci_device *d)
{
int ret;
char *group, link[1024], sysfs[1024];
memset(link, 0, sizeof(link));
snprintf(sysfs, sizeof(sysfs), "%s/bus/pci/devices/%04x:%02x:%02x.%x/iommu_group", SYSFS_PATH,
d->slot.domain, d->slot.bus, d->slot.device, d->slot.function);

View file

@ -87,7 +87,7 @@ VfioContainer::VfioContainer()
}
/* Open VFIO API */
fd = open(VFIO_DEV("vfio"), O_RDWR);
fd = open(VFIO_DEV, O_RDWR);
if (fd < 0) {
logger->error("Failed to open VFIO container");
throw std::exception();
@ -108,12 +108,25 @@ VfioContainer::VfioContainer()
logger->error("Failed to get VFIO extensions");
throw std::exception();
}
else if (ret > 0)
else if (ret > 0) {
extensions |= (1 << i);
}
}
logger->debug("Version: {:#x}", version);
hasIommu = false;
if(not (extensions & (1 << VFIO_NOIOMMU_IOMMU))) {
if(not (extensions & (1 << VFIO_TYPE1_IOMMU))) {
logger->error("No supported IOMMU extension found");
throw std::exception();
} else {
hasIommu = true;
}
}
logger->debug("Version: {:#x}", version);
logger->debug("Extensions: {:#x}", extensions);
logger->debug("IOMMU: {}", hasIommu ? "yes" : "no");
}
@ -289,7 +302,7 @@ VfioContainer::attachDevice(const pci_device* pdev)
}
/* Get IOMMU group of device */
int index = pci_get_iommu_group(pdev);
int index = isIommuEnabled() ? pci_get_iommu_group(pdev) : 0;
if (index < 0) {
logger->error("Failed to get IOMMU group of device");
throw std::exception();
@ -319,6 +332,11 @@ VfioContainer::memoryMap(uintptr_t virt, uintptr_t phys, size_t length)
{
int ret;
if(not hasIommu) {
logger->error("DMA mapping not supported without IOMMU");
return UINTPTR_MAX;
}
if (length & 0xFFF) {
length += 0x1000;
length &= ~0xFFF;
@ -363,6 +381,10 @@ VfioContainer::memoryUnmap(uintptr_t phys, size_t length)
{
int ret;
if(not hasIommu) {
return true;
}
struct vfio_iommu_type1_dma_unmap dmaUnmap;
dmaUnmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
dmaUnmap.flags = 0;
@ -390,7 +412,7 @@ VfioContainer::getOrAttachGroup(int index)
}
// group not yet part of this container, so acquire ownership
auto group = VfioGroup::attach(fd, index);
auto group = VfioGroup::attach(*this, index);
if(not group) {
logger->error("Failed to attach to IOMMU group: {}", index);
throw std::exception();
@ -516,6 +538,7 @@ VfioDevice::pciHotReset()
const size_t reset_infolen = sizeof(struct vfio_pci_hot_reset_info) +
sizeof(struct vfio_pci_dependent_device) * 64;
auto reset_info = reinterpret_cast<struct vfio_pci_hot_reset_info*>
(calloc(1, reset_infolen));
@ -540,6 +563,8 @@ VfioDevice::pciHotReset()
}
}
free(reset_info);
const size_t resetlen = sizeof(struct vfio_pci_hot_reset) +
sizeof(int32_t) * 1;
auto reset = reinterpret_cast<struct vfio_pci_hot_reset*>
@ -549,10 +574,15 @@ VfioDevice::pciHotReset()
reset->count = 1;
reset->group_fds[0] = this->group.fd;
const bool success = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset) == 0;
int ret = ioctl(this->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
const bool success = (ret == 0);
free(reset);
free(reset_info);
if(not success and not group.container->isIommuEnabled()) {
logger->info("PCI hot reset failed, but this is expected without IOMMU");
return true;
}
return success;
}
@ -713,13 +743,18 @@ VfioGroup::~VfioGroup()
std::unique_ptr<VfioGroup>
VfioGroup::attach(int containerFd, int groupIndex)
VfioGroup::attach(VfioContainer& container, int groupIndex)
{
std::unique_ptr<VfioGroup> group { new VfioGroup(groupIndex) };
group->container = &container;
/* Open group fd */
std::stringstream groupPath;
groupPath << VFIO_DEV("") << groupIndex;
groupPath << VFIO_PATH
<< (container.isIommuEnabled() ? "" : "noiommu-")
<< groupIndex;
group->fd = open(groupPath.str().c_str(), O_RDWR);
if (group->fd < 0) {
logger->error("Failed to open VFIO group {}", group->index);
@ -729,18 +764,20 @@ VfioGroup::attach(int containerFd, int groupIndex)
logger->debug("VFIO group {} (fd {}) has path {}",
groupIndex, group->fd, groupPath.str());
int ret;
/* Claim group ownership */
ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &containerFd);
int ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container.getFd());
if (ret < 0) {
logger->error("Failed to attach VFIO group {} to container fd {} (error {})",
group->index, containerFd, ret);
group->index, container.getFd(), ret);
return nullptr;
}
/* Set IOMMU type */
ret = ioctl(containerFd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
int iommu_type = container.isIommuEnabled() ?
VFIO_TYPE1_IOMMU :
VFIO_NOIOMMU_IOMMU;
ret = ioctl(container.getFd(), VFIO_SET_IOMMU, iommu_type);
if (ret < 0) {
logger->error("Failed to set IOMMU type of container: {}", ret);
return nullptr;

View file

@ -255,4 +255,27 @@ for bram in brams:
if instance in ips:
ips[instance]['size'] = int(size)
pcies = root.xpath('.//MODULE[@MODTYPE="axi_pcie"]')
for pcie in pcies:
instance = pcie.get('INSTANCE')
axi_bars = ips[instance].setdefault('axi_bars', {})
pcie_bars = ips[instance].setdefault('pcie_bars', {})
for from_bar, to_bar, from_bars in (('AXIBAR', 'PCIEBAR', axi_bars), ('PCIEBAR', 'AXIBAR', pcie_bars)):
from_bar_num = int(pcie.find('.//PARAMETER[@NAME="C_{}_NUM"]'.format(from_bar)).get('VALUE'))
for i in range(0, from_bar_num):
from_bar_to_bar_offset = int(pcie.find('.//PARAMETER[@NAME="C_{}2{}_{}"]'.format(from_bar, to_bar, i)).get('VALUE'), 16)
from_bars['BAR{}'.format(i)] = { 'translation': from_bar_to_bar_offset }
if from_bar == 'AXIBAR':
axi_bar_lo = int(pcie.find('.//PARAMETER[@NAME="C_{}_{}"]'.format(from_bar, i)).get('VALUE'), 16)
axi_bar_hi = int(pcie.find('.//PARAMETER[@NAME="C_{}_HIGHADDR_{}"]'.format(from_bar, i)).get('VALUE'), 16)
axi_bar_size = axi_bar_hi - axi_bar_lo + 1
axi_bar = from_bars['BAR{}'.format(i)]
axi_bar['baseaddr'] = axi_bar_lo
axi_bar['highaddr'] = axi_bar_hi
axi_bar['size'] = axi_bar_size
print(json.dumps(ips, indent=2))

View file

@ -12,6 +12,10 @@ set(SOURCES
graph.cpp
)
if(CMAKE_CUDA_COMPILER)
list(APPEND SOURCES gpu.cpp)
endif()
add_executable(unit-tests ${SOURCES})
find_package(Criterion REQUIRED)

View file

@ -26,38 +26,52 @@ Test(fpga, dma, .description = "DMA")
auto dma = reinterpret_cast<villas::fpga::ip::Dma&>(*ip);
if(not dma.connectLoopback()) {
if(not dma.loopbackPossible()) {
logger->info("Loopback test not possible for {}", *ip);
continue;
}
count++;
if(not dma.loopbackPossible()) {
logger->info("Loopback test not possible for {}", *ip);
continue;
}
// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
// limitation
size_t len = 4 * (1 << 10);
// find a block RAM IP to write to
auto bramIp = state.cards.front()->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"));
auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(bramIp);
cr_assert_not_null(bram, "Couldn't find BRAM");
// Simple DMA can only transfer up to 4 kb due to PCIe page size burst
// limitation
size_t len = 4 * (1 << 10);
/* Allocate memory to use with DMA */
auto src = villas::HostRam::getAllocator().allocate<char>(len);
auto dst = bram->getAllocator().allocate<char>(len);
auto src = villas::HostDmaRam::getAllocator().allocate<char>(len);
auto dst = villas::HostDmaRam::getAllocator().allocate<char>(len);
/* ... only works with IOMMU enabled currently */
// auto src = bram->getAllocator().allocate<char>(len);
// auto dst = bram->getAllocator().allocate<char>(len);
/* ... only works with IOMMU enabled currently */
// auto src = villas::HostRam::getAllocator().allocate<char>(len);
// auto dst = villas::HostRam::getAllocator().allocate<char>(len);
/* Make sure memory is accessible for DMA */
cr_assert(dma.makeAccesibleFromVA(src.getMemoryBlock()),
"Source memory not accessible for DMA");
cr_assert(dma.makeAccesibleFromVA(dst.getMemoryBlock()),
"Destination memory not accessible for DMA");
/* Get new random data */
const size_t lenRandom = read_random(&src, len);
cr_assert(len == lenRandom, "Failed to get random data");
/* Start transfer */
cr_assert(dma.pingPong(src.getMemoryBlock(), dst.getMemoryBlock(), len),
cr_assert(dma.memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len),
"DMA ping pong failed");
/* Compare data */
cr_assert(memcmp(&src, &dst, len) == 0, "Data not equal");

129
fpga/tests/gpu.cpp Normal file
View file

@ -0,0 +1,129 @@
#include <criterion/criterion.h>
#include <map>
#include <string>
#include <villas/log.hpp>
#include <villas/fpga/card.hpp>
#include <villas/fpga/ips/dma.hpp>
#include <villas/fpga/ips/bram.hpp>
#include <villas/utils.h>
#include "global.hpp"
#include <villas/memory.hpp>
#include <villas/gpu.hpp>
Test(fpga, gpu_dma, .description = "GPU DMA tests")
{
auto logger = loggerGetOrCreate("unittest:dma");
auto& card = state.cards.front();
villas::Plugin* plugin = villas::Plugin::lookup(villas::Plugin::Type::Gpu, "");
auto gpuPlugin = dynamic_cast<villas::gpu::GpuFactory*>(plugin);
cr_assert_not_null(gpuPlugin, "No GPU plugin found");
auto gpus = gpuPlugin->make();
cr_assert(gpus.size() > 0, "No GPUs found");
// just get first cpu
auto& gpu = gpus.front();
size_t count = 0;
for(auto& ip : card->ips) {
// skip non-dma IPs
if(*ip != villas::fpga::Vlnv("xilinx.com:ip:axi_bram_ctrl:"))
continue;
logger->info("Testing {}", *ip);
auto bram = reinterpret_cast<villas::fpga::ip::Bram*>(ip.get());
cr_assert_not_null(bram, "Couldn't find BRAM");
count++;
size_t len = 4 * (1 << 10);
/* Allocate memory to use with DMA */
auto bram0 = bram->getAllocator().allocate<char>(len);
auto bram1 = bram->getAllocator().allocate<char>(len);
gpu->makeAccessibleFromPCIeOrHostRam(bram0.getMemoryBlock());
gpu->makeAccessibleFromPCIeOrHostRam(bram1.getMemoryBlock());
auto hostRam0 = villas::HostRam::getAllocator().allocate<char>(len);
auto hostRam1 = villas::HostRam::getAllocator().allocate<char>(len);
gpu->makeAccessibleFromPCIeOrHostRam(hostRam0.getMemoryBlock());
gpu->makeAccessibleFromPCIeOrHostRam(hostRam1.getMemoryBlock());
auto dmaRam0 = villas::HostDmaRam::getAllocator().allocate<char>(len);
auto dmaRam1 = villas::HostDmaRam::getAllocator().allocate<char>(len);
gpu->makeAccessibleFromPCIeOrHostRam(dmaRam0.getMemoryBlock());
gpu->makeAccessibleFromPCIeOrHostRam(dmaRam1.getMemoryBlock());
auto gpuMem0 = gpu->getAllocator().allocate<char>(64 << 10);
auto gpuMem1 = gpu->getAllocator().allocate<char>(64 << 10);
gpu->makeAccessibleToPCIeAndVA(gpuMem0.getMemoryBlock());
gpu->makeAccessibleToPCIeAndVA(gpuMem1.getMemoryBlock());
// auto& src = bram0;
// auto& dst = bram1;
// auto& src = hostRam0;
// auto& dst = hostRam1;
auto& src = dmaRam0;
// auto& dst = dmaRam1;
// auto& src = gpuMem0;
auto& dst = gpuMem1;
std::list<std::pair<std::string, std::function<void()>>> memcpyFuncs = {
{"cudaMemcpy", [&]() {gpu->memcpySync(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
{"CUDA kernel", [&]() {gpu->memcpyKernel(src.getMemoryBlock(), dst.getMemoryBlock(), len);}},
};
auto dmaIp = card->lookupIp(villas::fpga::Vlnv("xilinx.com:ip:axi_dma:"));
auto dma = dynamic_cast<villas::fpga::ip::Dma*>(dmaIp);
if(dma != nullptr and dma->connectLoopback()) {
memcpyFuncs.push_back({
"DMA memcpy", [&]() {
if(not dma->makeAccesibleFromVA(src.getMemoryBlock()) or
not dma->makeAccesibleFromVA(dst.getMemoryBlock())) {
return;
}
dma->memcpy(src.getMemoryBlock(), dst.getMemoryBlock(), len);
}});
}
for(auto& [name, memcpyFunc] : memcpyFuncs) {
logger->info("Testing {}", name);
/* Get new random data */
const size_t lenRandom = read_random(&src, len);
cr_assert(len == lenRandom, "Failed to get random data");
memcpyFunc();
const bool success = memcmp(&src, &dst, len) == 0;
logger->info(" {}", success ?
TXT_GREEN("Passed") :
TXT_RED("Failed"));
}
villas::MemoryManager::get().dump();
}
cr_assert(count > 0, "No BRAM found");
}

1
fpga/thirdparty/udmabuf vendored Submodule

@ -0,0 +1 @@
Subproject commit 65762ca3333cb230e3b80ecf4cb5e605390474f2