diff --git a/Makefile b/Makefile index 4c561a678..fa75918bc 100644 --- a/Makefile +++ b/Makefile @@ -85,6 +85,12 @@ endif # PKGS += libwebsockets jansson #endif +## Add support for LAPACK / BLAS benchmarks / solvers +ifeq ($(shell pkg-config blas lapack; echo $$?),0) + PKGS += blas lapack + BENCH_OBJS += fpga-bench-overruns.o +endif + # Enable OPAL-RT Asynchronous Process support (will result in 32bit binary!!!) ifdef WITH_OPAL ifneq (,$(wildcard thirdparty/opal/include/AsyncApi.h)) diff --git a/config.h b/config.h index 9d6424367..abde0bc3e 100644 --- a/config.h +++ b/config.h @@ -43,6 +43,17 @@ /* Required kernel version */ #define KERNEL_VERSION_MAJ 3 #define KERNEL_VERSION_MIN 6 + +/* Some hard-coded configuration for the FPGA benchmarks */ +#define BENCH_DM 3 +// 1 FIFO +// 2 DMA SG +// 3 DMA Simple + +#define BENCH_RUNS 3000000 +#define BENCH_WARMUP 100 +#define BENCH_DM_EXP_MIN 0 +#define BENCH_DM_EXP_MAX 20 /** Coefficients for simple FIR-LowPass: * F_s = 1kHz, F_pass = 100 Hz, F_block = 300 diff --git a/src/fpga-bench-overruns.c b/src/fpga-bench-overruns.c new file mode 100644 index 000000000..f161c7696 --- /dev/null +++ b/src/fpga-bench-overruns.c @@ -0,0 +1,147 @@ +/** Benchmarks for VILLASfpga: LAPACK & BLAS + * + * @author Steffen Vogel + * @copyright 2015-2016, Steffen Vogel + * This file is part of VILLASnode. All Rights Reserved. Proprietary and confidential. + * Unauthorized copying of this file, via any medium is strictly prohibited. + **********************************************************************************/ + +#include +#include + +#include +#include + +#include "config.h" + +/* Declared in fpga-bench.c */ +extern int intc_flags; +extern struct utsname uts; + +/* LAPACK & BLAS Fortran prototypes */ +extern int dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); +extern int dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); +extern int dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); + +static int lapack_generate_workload(int N, double *C) +{ + double *A = alloc(N * N * sizeof(double)); + + srand(time(NULL)); + + for (int i = 0; i < N * N; i++) + A[i] = 100 * (double) rand() / RAND_MAX + 1; + + char transA = 'T'; + char transB = 'N'; + double alpha = 1; + double beta = 1; + + /* C = A' * A, to get an invertible matrix */ + dgemm_(&transA, &transB, &N, &N, &N, &alpha, A, &N, A, &N, &beta, C, &N); + + free(A); + + return 0; +} + +static int lapack_workload(int N, double *A) +{ + int info = 0; + int lworkspace = N; + int ipiv[N]; + double workspace[N]; + + dgetrf_(&N, &N, A, &N, ipiv, &info); + if (info > 0) + error("Failed to pivot matrix"); + + dgetri_(&N, A, &N, ipiv, workspace, &lworkspace, &info); + if (info > 0) + error("Failed to LU factorized matrix"); + + return 0; +} + +int fpga_benchmark_overruns(struct fpga *f) +{ + struct ip *rtds, *dm; + + dm = list_lookup(&f->ips, "dma_1"); + rtds = list_lookup(&f->ips, "rtds_axis_0"); + if (!rtds || !f->intc) + return -1; + + int ret; + float period = 50e-6; + int runs = 1.0 / period; + int overruns; + + info("runs = %u", runs); + + switch_connect(f->sw, dm, rtds); + switch_connect(f->sw, rtds, dm); + + intc_enable(f->intc, (1 << (dm->irq + 1 )), intc_flags); + + /* Dump results */ + char fn[256]; + snprintf(fn, sizeof(fn), "results/overruns_lu_rtds_axis_%s_%s.dat", intc_flags & INTC_POLLING ? "polling" : "irq", uts.release); + FILE *g = fopen(fn, "w"); + fprintf(g, "# period = %f\n", period); + fprintf(g, "# runs = %u\n", runs); + + struct dma_mem mem; + ret = dma_alloc(dm, &mem, 0x1000, 0); + if (ret) + error("Failed to allocate DMA memory"); + + uint32_t *data_rx = (uint32_t *) mem.base_virt; + uint32_t *data_tx = (uint32_t *) mem.base_virt + 0x200; + uint64_t total, start, stop; + for (int p = 3; p < 45; p++) { + double *A = alloc(p*p*sizeof(double)); + + lapack_generate_workload(p, A); + + overruns = 0; + total = 0; + + for (int i = 0; i < 2000; i++) { + dma_read(dm, mem.base_phys, 0x200); + dma_read_complete(dm, NULL, NULL); + } + + for (int i = 0; i < runs + BENCH_WARMUP; i++) { + dma_read(dm, mem.base_phys, 0x200); + + start = rdtscp(); + lapack_workload(p, A); + stop = rdtscp(); + + dma_read_complete(dm, NULL, NULL); + + /* Send data to rtds */ + data_tx[0] = i; + dma_write(dm, mem.base_phys + 0x200, 64 * sizeof(data_tx[0])); + + if (i < BENCH_WARMUP) + continue; + + if (i - data_rx[0] > 2) + overruns++; + total += stop - start; + } + + free(A); + + info("iter = %u clks = %ju overruns = %u", p, total / runs, overruns); + fprintf(g, "%u %ju %u\n", p, total / runs, overruns); + + if (overruns >= runs) + break; + } + + fclose(g); + return 0; +} diff --git a/src/fpga-bench.c b/src/fpga-bench.c index 314d1570d..9da551742 100644 --- a/src/fpga-bench.c +++ b/src/fpga-bench.c @@ -24,26 +24,17 @@ #include "config.h" #include "config-fpga.h" -/* Some hard-coded configuration for the benchmarks */ - -#define BENCH_DM 3 -// 1 FIFO -// 2 DMA SG -// 3 DMA Simple - -#define BENCH_RUNS 3000000 -#define BENCH_WARMUP 100 -#define BENCH_DM_EXP_MIN 0 -#define BENCH_DM_EXP_MAX 20 - int fpga_benchmark_datamover(struct fpga *f); int fpga_benchmark_jitter(struct fpga *f); int fpga_benchmark_memcpy(struct fpga *f); -int fpga_benchmark_overruns(struct fpga *f); int fpga_benchmark_latency(struct fpga *f); -static int intc_flags = 0; -static struct utsname uts; +#if defined(WITH_BLAS) && defined(WITH_LAPACK) +int fpga_benchmark_overruns(struct fpga *f); +#endif + +int intc_flags = 0; +struct utsname uts; int fpga_benchmarks(int argc, char *argv[], struct fpga *f) { @@ -55,7 +46,9 @@ int fpga_benchmarks(int argc, char *argv[], struct fpga *f) { "datamover", fpga_benchmark_datamover }, { "jitter", fpga_benchmark_jitter }, { "memcpy", fpga_benchmark_memcpy }, +#if defined(WITH_BLAS) && defined(WITH_LAPACK) { "overruns", fpga_benchmark_overruns }, +#endif { "latency", fpga_benchmark_latency } }; @@ -91,133 +84,6 @@ again: ret = bench->func(f); return -1; } -extern int dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); -extern int dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); -extern int dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); - -static int lapack_generate_workload(int N, double *C) -{ - double *A = alloc(N * N * sizeof(double)); - - srand(time(NULL)); - - for (int i = 0; i < N * N; i++) - A[i] = 100 * (double) rand() / RAND_MAX + 1; - - char transA = 'T'; - char transB = 'N'; - double alpha = 1; - double beta = 1; - - /* C = A' * A, to get an invertible matrix */ - dgemm_(&transA, &transB, &N, &N, &N, &alpha, A, &N, A, &N, &beta, C, &N); - - free(A); - - return 0; -} - -static int lapack_workload(int N, double *A) -{ - int info = 0; - int lworkspace = N; - int ipiv[N]; - double workspace[N]; - - dgetrf_(&N, &N, A, &N, ipiv, &info); - if (info > 0) - error("Failed to pivot matrix"); - - dgetri_(&N, A, &N, ipiv, workspace, &lworkspace, &info); - if (info > 0) - error("Failed to LU factorized matrix"); - - return 0; -} - -int fpga_benchmark_overruns(struct fpga *f) -{ - struct ip *rtds, *dm; - - dm = list_lookup(&f->ips, "dma_1"); - rtds = list_lookup(&f->ips, "rtds_axis_0"); - if (!rtds || !f->intc) - return -1; - - int ret; - float period = 50e-6; - int runs = 1.0 / period; - int overruns; - - info("runs = %u", runs); - - switch_connect(f->sw, dm, rtds); - switch_connect(f->sw, rtds, dm); - - intc_enable(f->intc, (1 << (dm->irq + 1 )), intc_flags); - - /* Dump results */ - char fn[256]; - snprintf(fn, sizeof(fn), "results/overruns_lu_rtds_axis_%s_%s.dat", intc_flags & INTC_POLLING ? "polling" : "irq", uts.release); - FILE *g = fopen(fn, "w"); - fprintf(g, "# period = %f\n", period); - fprintf(g, "# runs = %u\n", runs); - - struct dma_mem mem; - ret = dma_alloc(dm, &mem, 0x1000, 0); - if (ret) - error("Failed to allocate DMA memory"); - - uint32_t *data_rx = (uint32_t *) mem.base_virt; - uint32_t *data_tx = (uint32_t *) mem.base_virt + 0x200; - uint64_t total, start, stop; - for (int p = 3; p < 45; p++) { - double *A = alloc(p*p*sizeof(double)); - - lapack_generate_workload(p, A); - - overruns = 0; - total = 0; - - for (int i = 0; i < 2000; i++) { - dma_read(dm, mem.base_phys, 0x200); - dma_read_complete(dm, NULL, NULL); - } - - for (int i = 0; i < runs + BENCH_WARMUP; i++) { - dma_read(dm, mem.base_phys, 0x200); - - start = rdtscp(); - lapack_workload(p, A); - stop = rdtscp(); - - dma_read_complete(dm, NULL, NULL); - - /* Send data to rtds */ - data_tx[0] = i; - dma_write(dm, mem.base_phys + 0x200, 64 * sizeof(data_tx[0])); - - if (i < BENCH_WARMUP) - continue; - - if (i - data_rx[0] > 2) - overruns++; - total += stop - start; - } - - free(A); - - info("iter = %u clks = %ju overruns = %u", p, total / runs, overruns); - fprintf(g, "%u %ju %u\n", p, total / runs, overruns); - - if (overruns >= runs) - break; - } - - fclose(g); - return 0; -} - int fpga_benchmark_jitter(struct fpga *f) { int ret;