metalsvm/arch/x86/mm/page64.c

/*
* Copyright 2012 Stefan Lankes, Chair for Operating Systems,
* RWTH Aachen University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file is part of MetalSVM.
*/
#include <metalsvm/stddef.h>
#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/mmu.h>
#include <metalsvm/vma.h>
#include <metalsvm/string.h>
#include <metalsvm/page.h>
#include <metalsvm/spinlock.h>
#include <metalsvm/processor.h>
#include <metalsvm/tasks.h>
#include <metalsvm/errno.h>
#include <asm/irq.h>
#include <asm/multiboot.h>
#include <asm/apic.h>
/*
* Virtual Memory Layout of the standard configuration
* (1 GB kernel space)
*
 * 0x000000000000 - 0x0000000FFFFF: reserved for IO devices (1MB)
* 0x000000100000 - 0x00000DEADFFF: Kernel (size depends on the configuration) (221MB)
* 0x00000DEAE000 - 0x00003FFFFFFF: Kernel heap
* 0xFF8000000000 - 0xFFFFFFFFFFFF: Paging structures are mapped in this region (max 512GB)
*/
/*
 * Note that linker symbols are not variables: no memory is allocated to
 * maintain a value; rather, their address is their value.
*/
extern const void kernel_start;
extern const void kernel_end;
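// e.g. the kernel image's bounds are taken as (size_t) &kernel_start and (size_t) &kernel_end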
// boot task's page map and page map lock
extern page_map_t boot_pml4;
static spinlock_t kslock = SPINLOCK_INIT;
page_map_t* get_boot_page_map(void)
{
return &boot_pml4;
}
/** @brief Copy a single page frame
*
* @param src virtual address of source page frame
 * @return physical address of the copied page frame
*/
static size_t copy_page_frame(size_t *src)
{
kprintf("copy_page_frame(%p)\n", src);
#if 1 // TODO: untested
size_t phyaddr, viraddr;
// allocate and map an empty page
phyaddr = get_page();
if (BUILTIN_EXPECT(!phyaddr, 0))
return 0;
	viraddr = vma_alloc(PAGE_SIZE, VMA_HEAP);
	if (BUILTIN_EXPECT(!viraddr, 0)) {
		put_page(phyaddr); // do not leak the frame on the error path
		return 0;
	}

	viraddr = map_region(viraddr, phyaddr, 1, MAP_KERNEL_SPACE);
	if (BUILTIN_EXPECT(!viraddr, 0)) {
		put_page(phyaddr);
		return 0;
	}
// copy the whole page
	memcpy((void*) viraddr, (void*) src, PAGE_SIZE); // strncpy would stop copying at the first NUL byte
// unmap and free page
unmap_region(viraddr, 1);
vma_free(viraddr, viraddr+PAGE_SIZE);
return phyaddr;
#else
kprintf("TODO: copy_page_frame(%lx)\n", source);
return 0;
#endif
}
static inline size_t canonicalize(size_t addr)
{
	// sign-extend bit 47 so that bits 63:48 mirror it (x86-64 canonical form)
	if (addr & (1UL<<47))
		return addr | ~((1UL<<48) - 1);
	else
		return addr & ((1UL<<48) - 1);
}
static inline int map_to_level(size_t addr)
{
if (addr >= PAGE_PML4)
return 4;
else if (addr >= PAGE_PDPT)
return 3;
else if (addr >= PAGE_PGD)
return 2;
else if (addr >= PAGE_PGT)
return 1;
else
return -EINVAL;
}
static inline const char * map_to_lvlname(size_t addr)
{
	const char* names[] = {"(none)", "PGT", "PGD", "PDPT", "PML4"};
	int level = map_to_level(addr);

	// map_to_level() returns -EINVAL for addresses outside the paging region
	return (level < 0) ? names[0] : names[level];
}
static inline size_t map_to_virt(size_t addr)
{
return canonicalize(addr << (map_to_level(addr) * PAGE_MAP_SHIFT));
}
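/*
 * Worked example for the recursive mapping (a sketch, assuming the usual
 * self-referencing PML4 slot behind PAGE_PML4): the PTE that maps a virtual
 * address V is itself addressable at PAGE_PGT | ((V >> 9) & ~0x07), one
 * shift by PAGE_MAP_SHIFT (9) per paging level. map_to_virt() inverts this:
 * shifting an entry's address left by 9 bits per remaining level and
 * sign-extending bit 47 recovers the address V that the entry controls.
 */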
/*
* Copy page maps using recursion
*
* @param src pointer to virtual address of source page tables
* @param dest pointer to virtual address of destination page tables
* @param copy flags determining what is copied (see #define COPY_*)
* @return number of new allocated page frames (for tables only)
*/
static int copy_page_map(page_map_t *src, page_map_t *dest, int copy)
{
page_map_t* next_src, * next_dest;
int ret = 0;
uint32_t i;
for(i=0; i<PAGE_MAP_ENTRIES; i++) {
if (!(src->entries[i] & PG_PRESENT))
// skip empty entries
dest->entries[i] = 0;
else if (src->entries[i] & PG_USER) {
size_t phys;
kprintf("d:%p (%s: 0x%012lx) -> %p\n", &src->entries[i], map_to_lvlname((size_t) &src->entries[i]), map_to_virt((size_t) &src->entries[i]), &dest->entries[i]);
		// deep copy user tables, recursing down to the PGT level
		if ((size_t) src >= PAGE_PGD) {
			phys = get_page();
			if (BUILTIN_EXPECT(!phys, 0))
				return -ENOMEM;

			dest->entries[i] = phys|(src->entries[i] & ~PAGE_MASK);

			// within the recursive mapping, shifting an entry's own
			// address left by 9 bits yields the table it points to
			next_src = (page_map_t*) ((size_t) &src->entries[i] << 9);
			next_dest = (page_map_t*) ((size_t) &dest->entries[i] << 9);

			ret += 1 + copy_page_map(next_src, next_dest, copy);
		}
		// at the PGT level: deep copy the page frame itself, reading it
		// through the virtual address that its PTE maps
		else {
			if (copy) {
				phys = copy_page_frame((size_t*) map_to_virt((size_t) &src->entries[i]));
				dest->entries[i] = phys|(src->entries[i] & ~PAGE_MASK);
			} else
				dest->entries[i] = 0; // do not inherit user frames without copying them

			kprintf("c: %p (%lx)\n", &src->entries[i], src->entries[i]);
		}
}
// shallow copy kernel only tables
else {
kprintf("s:%p (%s: 0x%012lx) -> %p\n", &src->entries[i], map_to_lvlname((size_t) &src->entries[i]), map_to_virt((size_t) &src->entries[i]), &dest->entries[i]);
dest->entries[i] = src->entries[i];
}
}
kputs("r\n");
return ret;
}
int create_page_map(task_t* task, int copy)
{
size_t phys;
	int ret; // copy_page_map() may return a negative error code
// fixed mapping for paging structures
page_map_t *current = (page_map_t*) PAGE_PML4;
page_map_t *new = (page_map_t*) (PAGE_PML4 - 0x1000);
// get new pml4 table
phys = get_page();
if (!phys) return -ENOMEM;
current->entries[PAGE_MAP_ENTRIES-2] = phys|KERN_TABLE;
new->entries[PAGE_MAP_ENTRIES-1] = phys|KERN_TABLE;
tlb_flush(); // ouch :(
spinlock_lock(&kslock);
ret = copy_page_map(current, new, copy);
spinlock_unlock(&kslock);
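	// restore the recursive self-reference in the new PML4 (the copy simply
	// duplicated the boot table's entry) before removing the temporary slot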
new->entries[PAGE_MAP_ENTRIES-1] = phys|KERN_TABLE;
current->entries[PAGE_MAP_ENTRIES-2] = 0;
task->page_map = (page_map_t*) phys;
kprintf("create_page_map: allocated %u page tables\n", ret);
return ret;
}
int drop_page_map(void)
{
#if 1
kprintf("TODO: test drop_page_map()\n");
return -EINVAL; // TODO
#else
task_t* task = per_core(current_task);
page_map_t* pml4, * pdpt, * pgd, * pgt;
size_t phys;
uint32_t i, j, k, l;
pml4 = task->page_map;
if (BUILTIN_EXPECT(pml4 == &boot_pml4, 0))
return -EINVAL;
spinlock_lock(&task->page_lock);
// delete all user pages and tables
	// tables are dereferenced via their physical addresses, which are
	// identical to their virtual addresses in kernel space (see unmap_region)
	for(i=0; i<PAGE_MAP_ENTRIES; i++) { // pml4
		if (pml4->entries[i] & PG_USER) {
			pdpt = (page_map_t*) (pml4->entries[i] & PAGE_MASK);
			for(j=0; j<PAGE_MAP_ENTRIES; j++) { // pdpt
				if (pdpt->entries[j] & PG_USER) {
					pgd = (page_map_t*) (pdpt->entries[j] & PAGE_MASK);
					for(k=0; k<PAGE_MAP_ENTRIES; k++) { // pgd
						if (pgd->entries[k] & PG_USER) {
							pgt = (page_map_t*) (pgd->entries[k] & PAGE_MASK);
							for(l=0; l<PAGE_MAP_ENTRIES; l++) { // pgt
								if (pgt->entries[l] & PG_USER)
									put_page(pgt->entries[l] & PAGE_MASK);
							}
							// TODO: put pgt
						}
					}
					// TODO: put pgd
				}
			}
			// TODO: put pdpt
		}
	}
put_page(virt_to_phys((size_t) pml4));
task->page_map = NULL;
spinlock_unlock(&task->page_lock);
return 0;
#endif
}
size_t virt_to_phys(size_t viraddr)
{
task_t* task = per_core(current_task);
size_t phyaddr;
size_t* pte;
spinlock_irqsave_lock(&task->page_lock);
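	// through the recursive mapping, the PTE of viraddr is addressable at PAGE_PGT | (viraddr >> 9)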
	pte = (size_t *) (PAGE_PGT | ((viraddr >> 9) & ~0x07UL));
	phyaddr = (*pte & PAGE_MASK) | (viraddr & ~PAGE_MASK);
spinlock_irqsave_unlock(&task->page_lock);
return phyaddr;
}
size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags)
{
task_t* task = per_core(current_task);
size_t i, ret;
if (BUILTIN_EXPECT(!task || !task->page_map, 0))
return 0;
	if (!viraddr) {
		kputs("map_region: deprecated vma_alloc() call from within map_region\n");
		viraddr = vma_alloc(npages*PAGE_SIZE, VMA_HEAP);
		if (BUILTIN_EXPECT(!viraddr, 0)) {
			kputs("map_region: found no valid virtual address\n");
			// do not jump to "out" here: neither lock has been taken yet
			return 0;
		}
	}
// correct alignment
phyaddr &= PAGE_MASK;
viraddr &= PAGE_MASK;
ret = viraddr;
if (flags & MAP_KERNEL_SPACE)
spinlock_lock(&kslock);
else
spinlock_irqsave_lock(&task->page_lock);
kprintf("map_region: map %u pages from 0x%lx to 0x%lx with flags: 0x%x\n", npages, viraddr, phyaddr, flags);
for(i=0; i<npages; i++, viraddr+=PAGE_SIZE, phyaddr+=PAGE_SIZE) {
// page table entry
size_t* pte = (size_t *) (PAGE_PGT|(viraddr >> 9));
if (*pte && !(flags & MAP_REMAP)) {
kprintf("map_region: 0x%lx is already mapped\n", viraddr);
ret = 0;
goto out;
}
if (flags & MAP_USER_SPACE)
*pte = phyaddr|USER_PAGE;
else
*pte = phyaddr|KERN_PAGE;
if (flags & MAP_NO_CACHE)
*pte |= PG_PCD;
if (flags & MAP_NO_ACCESS)
*pte &= ~PG_PRESENT;
if (flags & MAP_WT)
*pte |= PG_PWT;
if (flags & MAP_USER_SPACE)
atomic_int32_inc(&task->user_usage);
tlb_flush_one_page(viraddr);
}
out:
if (flags & MAP_KERNEL_SPACE)
spinlock_unlock(&kslock);
else
spinlock_irqsave_unlock(&task->page_lock);
return ret;
}
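/*
 * Usage sketch (hypothetical; MMIO_PHYS_ADDR is a placeholder, not a symbol
 * defined in MetalSVM): map one uncached page of device memory into the
 * kernel address space.
 *
 *   size_t vir = vma_alloc(PAGE_SIZE, VMA_HEAP);
 *   if (vir && !map_region(vir, MMIO_PHYS_ADDR, 1, MAP_KERNEL_SPACE|MAP_NO_CACHE))
 *       vma_free(vir, vir+PAGE_SIZE); // mapping failed, release the region
 */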
int change_page_permissions(size_t start, size_t end, uint32_t flags)
{
#if 0
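	// note: this disabled block still uses the 32-bit two-level paging layout
	// (10/10/12 bit address split) and has to be ported to the 64-bit page maps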
uint32_t index1, index2, newflags;
size_t viraddr = start & PAGE_MASK;
size_t phyaddr;
page_map_t* pgt;
page_map_t* pgd;
task_t* task = per_core(current_task);
pgd = per_core(current_task)->page_map;
if (BUILTIN_EXPECT(!pgd, 0))
return -EINVAL;
spinlock_lock(&task->page_lock);
while (viraddr < end)
{
index1 = viraddr >> 22;
index2 = (viraddr >> 12) & 0x3FF;
while ((viraddr < end) && (index2 < 1024)) {
			pgt = (page_map_t*) ((KERNEL_SPACE - 1024*PAGE_SIZE + index1*PAGE_SIZE) & PAGE_MASK);
if (pgt && pgt->entries[index2]) {
phyaddr = pgt->entries[index2] & PAGE_MASK;
newflags = pgt->entries[index2] & 0xFFF; // get old flags
if (!(newflags & PG_SVM_INIT)) {
if ((newflags & PG_SVM_STRONG) && !(newflags & PG_PRESENT) && (flags & (VMA_READ|VMA_WRITE) && !(flags & VMA_NOACCESS)))
newflags |= PG_PRESENT;
else if ((newflags & PG_SVM_STRONG) && (newflags & PG_PRESENT) && (flags & VMA_NOACCESS))
newflags &= ~PG_PRESENT;
}
// update flags
if (!(flags & VMA_WRITE)) {
newflags &= ~PG_RW;
} else {
newflags |= PG_RW;
}
pgt->entries[index2] = (newflags & 0xFFF) | (phyaddr & PAGE_MASK);
tlb_flush_one_page(viraddr);
}
index2++;
viraddr += PAGE_SIZE;
}
}
spinlock_unlock(&task->page_lock);
#endif
return -EINVAL;
}
int unmap_region(size_t viraddr, uint32_t npages)
{
	task_t* task = per_core(current_task);
	page_map_t* pdpt, * pgd, * pgt;
	size_t i;
	uint16_t index_pml4, index_pdpt;
	uint16_t index_pgd, index_pgt;
	// remember which lock is taken: viraddr is advanced by the loop below
	int kernel_region = (viraddr <= KERNEL_SPACE);

	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return -EINVAL;

	if (kernel_region)
		spinlock_lock(&kslock);
	else
		spinlock_irqsave_lock(&task->page_lock);
i = 0;
while(i<npages)
{
index_pml4 = (viraddr >> 39) & 0x1FF;
index_pdpt = (viraddr >> 30) & 0x1FF;
index_pgd = (viraddr >> 21) & 0x1FF;
index_pgt = (viraddr >> 12) & 0x1FF;
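		// the four 9-bit indices select one entry per paging level of the 48-bit address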
		// currently, we allocate pages only in kernel space,
		// so the physical address of a page table is identical to its virtual address
pdpt = (page_map_t*) (task->page_map->entries[index_pml4] & PAGE_MASK);
if (!pdpt) {
viraddr += (size_t) PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_SIZE;
i += PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES;
continue;
}
pgd = (page_map_t*) (pdpt->entries[index_pdpt] & PAGE_MASK);
if (!pgd) {
viraddr += PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES*PAGE_SIZE;
i += PAGE_MAP_ENTRIES*PAGE_MAP_ENTRIES;
continue;
}
pgt = (page_map_t*) (pgd->entries[index_pgd] & PAGE_MASK);
if (!pgt) {
viraddr += PAGE_MAP_ENTRIES*PAGE_SIZE;
i += PAGE_MAP_ENTRIES;
continue;
}
		if (pgt->entries[index_pgt])
			pgt->entries[index_pgt] &= ~PG_PRESENT;

		// account for and flush the page before advancing to the next one
		if (viraddr > KERNEL_SPACE)
			atomic_int32_dec(&task->user_usage);
		tlb_flush_one_page(viraddr);

		viraddr += PAGE_SIZE;
		i++;
}
	if (kernel_region)
		spinlock_unlock(&kslock);
	else
		spinlock_irqsave_unlock(&task->page_lock);
return 0;
}
static void pagefault_handler(struct state *s)
{
task_t* task = per_core(current_task);
size_t viraddr = read_cr2();
size_t phyaddr;
#if 0
if ((viraddr >= task->start_heap) && (viraddr <= task->end_heap) && (viraddr > KERNEL_SPACE)) {
viraddr = viraddr & PAGE_MASK;
phyaddr = get_page();
if (BUILTIN_EXPECT(!phyaddr, 0))
goto oom;
if (map_region(viraddr, phyaddr, 1, MAP_USER_SPACE) == viraddr) {
memset((void*) viraddr, 0x00, PAGE_SIZE);
return;
}
kprintf("Could not map 0x%x at 0x%x\n", phyaddr, viraddr);
put_page(phyaddr);
}
/*
* handle missing paging structures for userspace
* all kernel space paging structures have been initialized in entry64.asm
*/
else if (viraddr >= PAGE_PGT) {
kprintf("map_region: missing paging structure at: 0x%lx (%s)\n", viraddr, map_to_lvlname(viraddr));
phyaddr = get_page();
if (BUILTIN_EXPECT(!phyaddr, 0))
goto oom;
// TODO: initialize with zeros
// TODO: check that we are in userspace
// get pointer to parent page level entry
size_t *entry = (size_t *) ((int64_t) viraddr >> 9 & ~0x07);
// update entry
*entry = phyaddr|USER_TABLE;
return;
}
#endif
kprintf("PAGE FAULT: Task %u got page fault at %p (irq %llu, cs:rip 0x%llx:0x%llx)\n", task->id, viraddr, s->int_no, s->cs, s->rip);
kprintf("Register state: rax = 0x%llx, rbx = 0x%llx, rcx = 0x%llx, rdx = 0x%llx, rdi = 0x%llx, rsi = 0x%llx, rbp = 0x%llx, rsp = 0x%llx\n",
s->rax, s->rbx, s->rcx, s->rdx, s->rdi, s->rsi, s->rbp, s->rsp);
irq_enable();
abort();
oom:
kputs("map_region: out of memory\n");
irq_enable();
abort();
}
int arch_paging_init(void)
{
uint32_t i, npages;
// replace default pagefault handler
irq_uninstall_handler(14);
irq_install_handler(14, pagefault_handler);
/*
 * In long mode the kernel is already mapped into kernel space (see entry64.asm).
 * This includes .data, .bss, .text, the VGA memory and the multiboot & multiprocessing (APIC) structures.
*/
#if MAX_CORES > 1
// reserve page for smp boot code
if (!map_region(SMP_SETUP_ADDR, SMP_SETUP_ADDR, 1, MAP_KERNEL_SPACE|MAP_NO_CACHE)) {
kputs("could not reserve page for smp boot code\n");
return -ENOMEM;
}
#endif
#ifdef CONFIG_MULTIBOOT
#if 0
// map reserved memory regions into the kernel space
if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MEM_MAP)) {
multiboot_memory_map_t* mmap = (multiboot_memory_map_t*) mb_info->mmap_addr;
multiboot_memory_map_t* mmap_end = (void*) ((size_t) mb_info->mmap_addr + mb_info->mmap_length);
while (mmap < mmap_end) {
if (mmap->type != MULTIBOOT_MEMORY_AVAILABLE) {
npages = mmap->len / PAGE_SIZE;
if ((mmap->addr+mmap->len) % PAGE_SIZE)
npages++;
map_region(mmap->addr, mmap->addr, npages, MAP_KERNEL_SPACE|MAP_NO_CACHE);
}
mmap++;
}
}
#endif
/*
* Modules like the init ram disk are already loaded.
* Therefore, we map these modules into the kernel space.
*/
if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MODS)) {
multiboot_module_t* mmodule = (multiboot_module_t*) ((size_t) mb_info->mods_addr);
npages = mb_info->mods_count * sizeof(multiboot_module_t) >> PAGE_SHIFT;
if (mb_info->mods_count * sizeof(multiboot_module_t) & (PAGE_SIZE-1))
npages++;
map_region((size_t) (mb_info->mods_addr), (size_t) (mb_info->mods_addr), npages, MAP_REMAP|MAP_KERNEL_SPACE);
for(i=0; i<mb_info->mods_count; i++, mmodule++) {
// map physical address to the same virtual address
npages = (mmodule->mod_end - mmodule->mod_start) >> PAGE_SHIFT;
if (mmodule->mod_end & (PAGE_SIZE-1))
npages++;
kprintf("Map module %s at 0x%x (%u pages)\n", (char*) mmodule->cmdline, mmodule->mod_start, npages);
map_region((size_t) (mmodule->mod_start), (size_t) (mmodule->mod_start), npages, MAP_REMAP|MAP_KERNEL_SPACE);
}
}
#endif
// we turned on paging => now, we are able to register our task
register_task();
// APIC registers into the kernel address space
map_apic();
return 0;
}