metalsvm/arch/x86/mm/svm.c

379 lines
10 KiB
C

/*
* Copyright 2011 Stefan Lankes, Chair for Operating Systems,
* RWTH Aachen University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file is part of MetalSVM.
*/
#include <metalsvm/stddef.h>
#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/mmu.h>
#include <metalsvm/tasks.h>
#include <metalsvm/page.h>
#include <metalsvm/errno.h>
#include <asm/irqflags.h>
#include <asm/processor.h>
#ifdef CONFIG_ROCKCREEK
#include <asm/RCCE.h>
#include <asm/RCCE_lib.h>
#include <asm/iRCCE.h>
#include <asm/SCC_API.h>
#include <asm/icc.h>
#include <asm/svm.h>
// Number of pages covered by the SVM bookkeeping tables (four times RCCE_SHM_SIZE_MAX)
#define SHARED_PAGES ((4*RCCE_SHM_SIZE_MAX) >> PAGE_SHIFT)
// Size in bytes of the owner table (one uint8_t per shared page), rounded up to a multiple of PAGE_SIZE
#define OWNER_SIZE ((SHARED_PAGES * sizeof(uint8_t) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
// Its return value is stored in shmbegin by svm_init()
t_vcharp RC_SHM_BUFFER_START();
/*
* This array describes the owner of a specific page.
* Entry i holds the rank of the core that currently owns shared page i.
* Only the owner of a page is able to change the possession.
* => No lock is needed.
*
* The table itself is allocated and mapped by svm_init().
*/
static volatile uint8_t* page_owner = NULL;
// helper array to convert a physical to a virtual address;
// indexed by (phyaddr - shmbegin) >> PAGE_SHIFT, svm_free() resets entries to 0
static size_t phys2virt[SHARED_PAGES] = {[0 ... SHARED_PAGES-1] = 0};
// value returned by RC_SHM_BUFFER_START(), set in svm_init(); page ids are computed relative to it
static size_t shmbegin = 0;
// per-rank counters printed by svm_statistics():
// emit[ue]      - pages handed over to rank ue (svm_emit_page)
// request[rank] - ownership requests sent to rank (svm_access_request)
// forward[rank] - requests passed on to rank because this core was not the owner (svm_emit_page)
static uint32_t emit[RCCE_MAXNP] = {[0 ... RCCE_MAXNP-1] = 0};
static uint32_t request[RCCE_MAXNP] = {[0 ... RCCE_MAXNP-1] = 0};
static uint32_t forward[RCCE_MAXNP] = {[0 ... RCCE_MAXNP-1] = 0};
// accumulated rdtsc() deltas: whole svm_access_request() calls that sent a request,
// whole svm_emit_page() calls, and the time spent in icc_wait(SVM_RESP)
static uint64_t request_ticks = 0;
static uint64_t emit_ticks = 0;
static uint64_t wait_ticks = 0;
// longest and shortest single wait for a response; min_wait starts at the
// largest uint64_t value so that the first measurement replaces it
static uint64_t max_wait = 0;
static uint64_t min_wait = (uint64_t) -1;
/*
 * Initialize the shared virtual memory (SVM) layer.
 *
 * Stores the value of RC_SHM_BUFFER_START() in shmbegin, allocates
 * OWNER_SIZE bytes via RCCE_shmalloc() for the page owner table and maps
 * them with MAP_KERNEL_SPACE|MAP_NO_CACHE. The core with RCCE_IAM == 0
 * clears the table (=> core 0 owns every page by default). Afterwards all
 * cores meet at an RCCE barrier.
 *
 * Returns 0 on success and -ENOMEM if the owner table could not be
 * allocated, is not page aligned or could not be mapped. On the latter two
 * failures the buffer is handed back via RCCE_shfree().
 */
int svm_init(void)
{
	size_t phyaddr;
	uint32_t flags;

	// iRCCE is not thread safe => disable interrupts
	flags = irq_nested_disable();
	shmbegin = (size_t)RC_SHM_BUFFER_START();
	phyaddr = (size_t) RCCE_shmalloc(OWNER_SIZE);
	irq_nested_enable(flags);

	if (BUILTIN_EXPECT(!phyaddr, 0))
		return -ENOMEM;

	if (BUILTIN_EXPECT(phyaddr & 0xFFF, 0)) {
		kprintf("RCCE_shmalloc returns not a page aligned physiacl address: 0x%x\n", phyaddr);
		// the buffer is unusable for us, but it must not stay allocated
		goto out_free;
	}

	kprintf("Shared memory starts at the physical address 0x%x\n", shmbegin);

	page_owner = (uint8_t*) map_region(0, phyaddr, OWNER_SIZE >> PAGE_SHIFT, MAP_KERNEL_SPACE|MAP_NO_CACHE);
	if (BUILTIN_EXPECT(!page_owner, 0))
		goto out_free;

	// per default is core 0 owner
	if (!RCCE_IAM)
		memset((void*)page_owner, 0x00, OWNER_SIZE);

	// iRCCE is not thread safe => disable interrupts
	flags = irq_nested_disable();
	RCCE_barrier(&RCCE_COMM_WORLD);
	irq_nested_enable(flags);

	return 0;

out_free:
	// common failure exit: release the owner table
	flags = irq_nested_disable();
	RCCE_shfree((t_vcharp) phyaddr);
	irq_nested_enable(flags);

	return -ENOMEM;
}
/*
 * Request the ownership of the shared page which contains addr.
 *
 * This function is called by the page fault handler
 * => the interrupt flag is already cleared.
 *
 * If another core owns the page, a SVM_REQUEST mail carrying our rank and
 * the physical address is sent to the owner, followed by an interrupt.
 * The function then waits for SVM_RESP and finally changes the local
 * permissions of the page to VMA_READ|VMA_WRITE|VMA_CACHEABLE.
 *
 * Returns -EINVAL if the physical address is outside of the checked shared
 * memory range, 0 if this core already owns the page, otherwise the result
 * of change_page_permissions().
 *
 * NOTE(review): the upper bound uses RCCE_SHM_SIZE_MAX, whereas the
 * bookkeeping tables cover 4*RCCE_SHM_SIZE_MAX (see SHARED_PAGES) - confirm
 * that this is intended.
 */
int svm_access_request(size_t addr)
{
	const uint64_t entry_tsc = rdtsc();
	const size_t phyaddr = virt_to_phys(addr);
	uint8_t payload[iRCCE_MAIL_HEADER_PAYLOAD];
	size_t* msg = (size_t*) payload;
	uint64_t wait_begin, waited;
	uint32_t pageid;
	int owner;
	int ret;

	if ((phyaddr < shmbegin) || (phyaddr >= shmbegin + RCCE_SHM_SIZE_MAX))
		return -EINVAL;

	pageid = (phyaddr-shmbegin) >> PAGE_SHIFT;
	owner = page_owner[pageid];

	// nothing to do, the page already belongs to this core
	if (owner == RCCE_IAM)
		return 0;

	// message layout: [0] = requesting rank, [1] = physical address of the page
	msg[0] = RCCE_IAM;
	msg[1] = phyaddr;

	/* send ping request and wake up the owner */
	iRCCE_mail_send(2*sizeof(size_t), SVM_REQUEST, 0, payload, owner);
	icc_send_gic_irq(owner);
	request[owner]++;

	// wait for response and measure how long it takes
	wait_begin = rdtsc();
	icc_wait(SVM_RESP);
	waited = rdtsc() - wait_begin;

	wait_ticks += waited;
	if (waited < min_wait)
		min_wait = waited;
	if (waited > max_wait)
		max_wait = waited;

	ret = change_page_permissions(addr, addr+PAGE_SIZE, VMA_READ|VMA_WRITE|VMA_CACHEABLE);

	request_ticks += rdtsc() - entry_tsc;

	return ret;
}
// Number of bytes the workaround path of svm_malloc() has already handed out
// per memory controller; used as the offset of the next allocation.
static atomic_int32_t size_counter = ATOMIC_INIT(0);

/*
 * Allocate a region of shared virtual memory.
 *
 * size        - requested size in bytes; rounded up to whole pages (the
 *               workaround path rounds up to a multiple of four pages)
 * consistency - must contain SVM_STRONG or SVM_LAZYRELEASE
 *
 * Returns the virtual address of the region, or NULL if no supported
 * consistency model was requested or an allocation failed.
 *
 * Workaround path (#if 1): the region is split into four equally sized
 * quarters. The first quarter is placed relative to the chunk returned by
 * the first RCCE_shmalloc() call, the others at shmbegin + n*0x1000000
 * (commented as MC1..MC3 in the original code). Within each of these
 * windows the quarter is placed at the offset kept in size_counter. Every
 * mapped page is recorded in phys2virt.
 *
 * On every core except rank 0 a SVM_STRONG region is mapped with
 * MAP_NO_ACCESS.
 *
 * NOTE(review): only the first quarter of the first allocation is reserved
 * via RCCE_shmalloc(); all other physical ranges are used without telling
 * the RCCE allocator - presumably acceptable for the workaround, confirm.
 * NOTE(review): the result of map_region() is not checked in this path.
 */
void* svm_malloc(size_t size, uint32_t consistency)
{
	size_t phyaddr, viraddr, i;
	uint32_t flags;
	uint32_t map_flags = MAP_KERNEL_SPACE|MAP_MPE;

	if (consistency & SVM_STRONG)
		map_flags |= MAP_SVM_STRONG;
	else if (consistency & SVM_LAZYRELEASE)
		map_flags |= MAP_SVM_LAZYRELEASE;
	else
		return NULL;

	// currently, we allocate memory in page size granulation
	size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);

#if 1 // Workaround for our MARC paper
	static uint32_t last = 0;
	size_t quarter, offset;
	uint32_t mc;

	// Each of the four quarters has to consist of whole pages. Otherwise
	// (size/4) >> PAGE_SHIFT maps too few pages (none at all for a single
	// page) and the quarters start at unaligned addresses.
	size = (size + 4*PAGE_SIZE - 1) & ~(4*PAGE_SIZE - 1);
	quarter = size / 4;

	// iRCCE is not thread safe => disable interrupts
	flags = irq_nested_disable();

	kprintf("Entering shmmalloc: size 0x%x, owner_size 0x%x\n", size, OWNER_SIZE);

	if (RCCE_IAM && (consistency & SVM_STRONG))
		map_flags |= MAP_NO_ACCESS;

	viraddr = vm_alloc(size >> PAGE_SHIFT, map_flags);
	kprintf("vm_alloc returns 0x%x\n", viraddr);
	// assumes vm_alloc() reports a failure with 0 (like map_region()) - TODO confirm
	if (BUILTIN_EXPECT(!viraddr, 0)) {
		irq_nested_enable(flags);
		return NULL;
	}

	// offset of this allocation within every window
	offset = atomic_int32_read(&size_counter);

	// the first successful call reserves the base chunk for the first quarter
	if (!last) {
		phyaddr = (size_t) RCCE_shmalloc(quarter);
		if (BUILTIN_EXPECT(!phyaddr, 0)) {
			// NOTE(review): the virtual range obtained from vm_alloc()
			// is not released here, no matching release routine is
			// visible in this file
			irq_nested_enable(flags);
			return NULL;
		}
		last = phyaddr;
	}

	for(mc=0; mc<4; mc++) {
		size_t vaddr = viraddr + mc * quarter;

		if (mc == 0) {
			// Use the accumulated offset like the other windows do.
			// "last + size/4" of the current request would let a third
			// allocation overlap the second one.
			phyaddr = last + offset;
		} else {
			phyaddr = shmbegin + mc * 0x1000000 + offset;
		}

		map_region(vaddr, phyaddr, quarter >> PAGE_SHIFT, map_flags|MAP_REMAP);
		for(i=0; i<quarter; i+=PAGE_SIZE)
			phys2virt[(phyaddr + i - shmbegin) >> PAGE_SHIFT] = vaddr + i;

		kprintf("svmmalloc on MC%u: phyaddr 0x%x, viraddr 0x%x, size 0x%x\n", mc, phyaddr, vaddr, size);
	}

	atomic_int32_add(&size_counter, quarter);
	irq_nested_enable(flags);

	kprintf("shmmalloc returns 0x%x\n", viraddr);

	return (void*) viraddr;
#else
	// iRCCE is not thread safe => disable interrupts
	flags = irq_nested_disable();
	phyaddr = (size_t) RCCE_shmalloc(size);
	if (RCCE_IAM && (consistency & SVM_STRONG))
		map_flags |= MAP_NO_ACCESS;
	irq_nested_enable(flags);

	if (BUILTIN_EXPECT(!phyaddr, 0))
		return NULL;
	if (BUILTIN_EXPECT(phyaddr & 0xFFF, 0)) {
		kprintf("RCCE_shmalloc returns not a page aligned physiacl address: 0x%x\n", phyaddr);
		return NULL;
	}

	viraddr = map_region(0, phyaddr, size >> PAGE_SHIFT, map_flags);
	for(i=0; i<size; i+=PAGE_SIZE)
		phys2virt[(phyaddr + i - shmbegin) >> PAGE_SHIFT] = viraddr + i;

	kprintf("svmmalloc: phyaddr 0x%x, viraddr 0x%x, size 0x%x\n", phyaddr, viraddr, size);

	return (void*) viraddr;
#endif
}
void svm_free(void* addr, size_t size)
{
size_t phyaddr, i;
uint32_t flags;
if (BUILTIN_EXPECT(!addr || !size, 0))
return;
phyaddr = virt_to_phys((size_t) addr);
// currently, we allocate memory in page size granulation
size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
kprintf("svmfree: phyaddr 0x%x, viraddr 0x%x, size 0x%x\n", phyaddr, addr, size);
unmap_region((size_t) addr, size >> PAGE_SHIFT);
for(i=0; i<size; i+=PAGE_SIZE)
phys2virt[(phyaddr + i - shmbegin) >> PAGE_SHIFT] = 0;
// iRCCE is not thread save => disable interrupts
flags = irq_nested_disable();
RCCE_shfree((t_vcharp) phyaddr);
irq_nested_enable(flags);
}
/*
* This function is called by icc_mail_check.
* => Interrupt flag is alread cleared.
*/
int svm_emit_page(size_t phyaddr, int ue)
{
uint64_t start = rdtsc();
uint32_t pageid;
int remote_rank;
//kprintf("Try to emit page 0x%x to %d\n", phyaddr, ue);
if (phyaddr < shmbegin)
return -EINVAL;
if (phyaddr >= shmbegin + RCCE_SHM_SIZE_MAX)
return -EINVAL;
pageid = (phyaddr-shmbegin) >> PAGE_SHIFT;
remote_rank = page_owner[pageid];
if (remote_rank != RCCE_IAM) {
// Core is nor owner => forward request to new owner
uint8_t payload[iRCCE_MAIL_HEADER_PAYLOAD];
kprintf("Ups, core %d is not owner of page 0x%x\n", RCCE_IAM, phyaddr);
((size_t*) payload)[0] = ue;
((size_t*) payload)[1] = phyaddr;
/* send ping request */
iRCCE_mail_send(2*sizeof(size_t), SVM_REQUEST, 0, payload, remote_rank);
/* send interrupt */
icc_send_gic_irq(remote_rank);
forward[remote_rank]++;
} else {
size_t viraddr;
svm_flush();
// send response back to ue
// ue is polling for the response => no irq is needed
iRCCE_mail_send(0, SVM_RESP, 0, NULL, ue);
emit[ue]++;
viraddr = phys2virt[pageid];
page_owner[pageid] = ue;
change_page_permissions(viraddr, viraddr+PAGE_SIZE, VMA_NOACCESS|VMA_READ|VMA_CACHEABLE);
}
emit_ticks += rdtsc() - start;
return 0;
}
/*
* Variant of svm_flush() which is only compiled if SVM_WB is defined.
* It writes to RCCE_fool_write_combine_buffer and then calls flush_cache().
* The #error directive below currently aborts every build with SVM_WB.
*/
#ifdef SVM_WB
void svm_flush(void)
{
// only used by the disabled (#if 0) code below
int z, tmp;
// need to write to another line to make sure the write combine buffer gets flushed
*(int *)RCCE_fool_write_combine_buffer = 1;
flush_cache();
// this configuration is deliberately rejected at compile time
#error Currently not supported
#if 0
// try to flush L2 cache
// (disabled: clears the GLCFG_XFLSHNN_BIT bit in the core's GLCFG0/GLCFG1
// register and spins until the bit is set again)
z = Z_PID(RC_COREID[my_ue]);
tmp=ReadConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1));
tmp &= ~(1 << GLCFG_XFLSHNN_BIT);
SetConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1), tmp);
while(!(ReadConfigReg(CRB_OWN + (z==0 ? GLCFG0 : GLCFG1)) & (1 << GLCFG_XFLSHNN_BIT))) {
NOP8;
}
#endif
}
#endif
/*
 * Barrier over all cores of RCCE_COMM_WORLD.
 *
 * If type equals SVM_LAZYRELEASE, svm_flush() and svm_invalidate() are
 * called before the core enters the barrier. Always returns 0.
 *
 * Unlike svm_init(), this function does not disable the interrupts around
 * RCCE_barrier().
 * NOTE(review): presumably required to keep serving SVM requests of other
 * cores while waiting - confirm before changing it.
 */
int svm_barrier(uint32_t type)
{
	const int lazy_release = (type == SVM_LAZYRELEASE);

	if (lazy_release) {
		svm_flush();
		svm_invalidate();
	}

	RCCE_barrier(&RCCE_COMM_WORLD);

	return 0;
}
/*
 * Print the SVM statistics of this core via kprintf/kputs:
 * one line per counter array (emit, request, forward) with one column per
 * rank, followed by the tick counters. Always returns 0.
 */
int svm_statistics(void)
{
	const uint32_t* counter;

	kprintf("emit\t:");
	for(counter=emit; counter!=emit+RCCE_MAXNP; counter++) {
		kprintf("\t%u", *counter);
	}

	kprintf("\nrequest\t:");
	for(counter=request; counter!=request+RCCE_MAXNP; counter++) {
		kprintf("\t%u", *counter);
	}

	kprintf("\nforward\t:");
	for(counter=forward; counter!=forward+RCCE_MAXNP; counter++) {
		kprintf("\t%u", *counter);
	}
	kputs("\n");

	// accumulated time stamp counter deltas
	kprintf("request ticks: %llu\n", request_ticks);
	kprintf("wait ticks: %llu\n", wait_ticks);
	kprintf("emit ticks: %llu\n", emit_ticks);
	kprintf("max wait: %llu\n", max_wait);
	kprintf("min wait: %llu\n", min_wait);

	return 0;
}
#endif