metalsvm/arch/x86/mm/page64.c


/*
* Copyright 2012 Stefan Lankes, Chair for Operating Systems,
* RWTH Aachen University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file is part of MetalSVM.
*/
#include <metalsvm/stddef.h>
#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/memory.h>
#include <metalsvm/vma.h>
#include <metalsvm/string.h>
#include <metalsvm/page.h>
#include <metalsvm/spinlock.h>
#include <metalsvm/processor.h>
#include <metalsvm/tasks.h>
#include <metalsvm/errno.h>
#include <asm/irq.h>
#include <asm/multiboot.h>
#include <asm/apic.h>
/*
* Virtual Memory Layout of the standard configuration
* (1 GB kernel space)
*
* 0x000000000000 - 0x0000000FFFFF: reserved for IO devices (1MB)
* 0x000000100000 - 0x00000DEADFFF: Kernel (size depends on the configuration) (221MB)
* 0x00000DEAE000 - 0x00003FFFFFFF: Kernel heap
* 0xFF0000000000 - 0xFF7FFFFFFFFF: Paging structures for copying a page map (max 512GB)
* 0xFF8000000000 - 0xFFFFFFFFFFFF: Paging structures are mapped in this region (max 512GB)
*/
/// Boot task's page map
extern page_map_t boot_pml4;
/// Kernel space page map lock
static spinlock_t kslock = SPINLOCK_INIT;
/** @brief Get the corresponding page map entry to a given virtual address */
static inline page_entry_t* virt_to_entry(size_t addr, int level)
{
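// setting all bits above VIRT_BITS and shifting right by PAGE_MAP_BITS per level
// moves the address into the self-mapped paging structures (top PML4 slot);
// masking the low three bits aligns the result to an 8-byte entry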
return (page_entry_t*) ((((ssize_t) addr | (-1L << VIRT_BITS)) >> ((level+1) * PAGE_MAP_BITS)) & ~0x7);
}
/** @brief Get the corresponding virtual address to a page map entry */
static inline size_t entry_to_virt(page_entry_t* entry, int level)
{
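// inverse of virt_to_entry: shift the entry's position back up by
// PAGE_MAP_BITS per level and sign-extend to a canonical address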
return VIRT_SEXT((size_t) entry << ((level+1) * PAGE_MAP_BITS));
}
/** @brief Converts a virtual address to a physical
*
* A non-mapped virtual address causes a page fault!
*
* @param viraddr Virtual address to convert
* @return physical address
*/
inline size_t virt_to_phys(size_t viraddr)
{
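// look up the PGT entry for viraddr through the self-mapping (PAGE_MAP_PGT)
// and combine its frame address with the offset within the page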
page_entry_t* entry = (page_entry_t*) (PAGE_MAP_PGT | (viraddr >> PAGE_MAP_BITS));
return (*entry & ~PAGE_FLAGS_MASK) | (viraddr & ~PAGE_MASK);
}
/** @brief Update page table bits (PG_*) by using arch independent flags (MAP_*) */
static inline size_t page_bits(int flags)
{
size_t bits = PG_PRESENT|PG_RW|PG_GLOBAL|PG_XD;
if (flags & MAP_NO_ACCESS) bits &= ~PG_PRESENT;
if (flags & MAP_READ_ONLY) bits &= ~PG_RW;
if (flags & MAP_CODE) bits &= ~PG_XD;
if (flags & MAP_USER_SPACE) bits &= ~PG_GLOBAL;
if (flags & MAP_USER_SPACE) bits |= PG_USER;
if (flags & MAP_WT) bits |= PG_PWT;
if (flags & MAP_NO_CACHE) bits |= PG_PCD;
if (flags & MAP_MPE) bits |= PG_MPE;
if (flags & MAP_SVM_INIT) bits |= PG_SVM_INIT;
if (flags & MAP_SVM_LAZYRELEASE) bits |= PG_SVM_LAZYRELEASE;
if (flags & MAP_SVM_STRONG) bits |= PG_SVM_STRONG;
return bits;
}
page_map_t* get_boot_page_map(void)
{
return &boot_pml4;
}
/** @brief Recursive traversal through the page map tree
*
* @param start First virtual address whose page map entries the callbacks are invoked on
* @param end Exclusive end address of the range
* @param pre Callback which is called for every page map entry (pre-order traversal)
* @param post Callback which is called for every page map entry (post-order traversal)
*/
int page_iterate(size_t start, size_t end, page_cb_t pre, page_cb_t post)
{
page_entry_t* entry[PAGE_MAP_LEVELS];
page_entry_t* last[PAGE_MAP_LEVELS];
if (BUILTIN_EXPECT(start >= end, 0))
return -EINVAL;
// setup subtree boundaries
int i;
for (i=0; i<PAGE_MAP_LEVELS; i++) {
entry[i] = virt_to_entry(start, i);
last[i] = virt_to_entry(end - 1, i);
}
// nested iterator function (sees the scope of parent)
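// (nested functions are a GCC extension; entry[] and last[] of the
// enclosing scope are updated in place while iterating)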
int iterate(int level) {
int ret;
while (entry[level] <= last[level]) {
if (pre) { // call pre-order callback if available
ret = pre(entry[level], level);
if (BUILTIN_EXPECT(ret < 0, 0))
return ret;
}
// recurse if
// - we are not in the PGT
// - and the lower-level page table is present
// - and the current entry does not map a huge page
if (level && (*entry[level] & PG_PRESENT) && !(*entry[level] & PG_PSE)) {
ret = iterate(level-1);
if (BUILTIN_EXPECT(ret < 0, 0))
return ret;
}
// otherwise, skip this subtree and fast-forward the lower-level entry pointers
else {
size_t next = (size_t) (entry[level]+1);
for (i=0; i<level; i++)
entry[i] = (page_entry_t*) (next << (PAGE_MAP_BITS*(level-i)));
}
if (post) { // call post-order callback if available
ret = post(entry[level], level);
if (BUILTIN_EXPECT(ret < 0, 0))
return ret;
}
// return if we've reached the end of table
entry[level]++;
if (((size_t) entry[level] & ~PAGE_MASK) == 0x000) // TODO
return 0;
}
return 0;
}
// we start at the highest order table (PML4 or PGD)
return iterate(PAGE_MAP_LEVELS-1);
}
void page_dump(size_t from, size_t to)
{
task_t* task = per_core(current_task);
size_t flags = 0;
size_t start = 0;
void print(size_t start, size_t end, size_t flags) {
size_t size = end - start;
kprintf("%#018lx-%#018lx %#14x %c%c%c%c%c%c\n", start, end, size,
(flags & PG_XD) ? '-' : 'x',
(flags & PG_GLOBAL) ? 'g' : '-',
(flags & PG_DIRTY) ? 'd' : '-',
(flags & PG_ACCESSED) ? 'a' : '-',
(flags & PG_USER) ? 'u' : '-',
(flags & PG_RW) ? 'w' : '-'
);
}
int cb(page_entry_t* entry, int level) {
size_t end;
if (*entry & PG_PRESENT) {
if (!level || (*entry & PG_PSE)) {
if (!flags) {
flags = *entry & PAGE_FLAGS_MASK;
start = entry_to_virt(entry, level);
}
else if (flags != (*entry & PAGE_FLAGS_MASK)) {
end = entry_to_virt(entry, level);
print(start, end, flags);
start = end;
flags = *entry & PAGE_FLAGS_MASK;
}
}
}
else if (flags) {
end = entry_to_virt(entry, level);
print(start, end, flags);
flags = 0;
}
return 0;
}
// lock tables
spinlock_lock(&kslock);
spinlock_irqsave_lock(&task->page_lock);
kprintf("%-18s-%18s %14s %-6s\n", "start", "end", "size", "flags"); // header
page_iterate(from, to, cb, NULL);
// unlock tables
spinlock_unlock(&kslock);
spinlock_irqsave_unlock(&task->page_lock);
// workaround to print last mapping
if (flags)
print(start, PAGE_FLOOR(to), flags);
}
void page_stats(size_t from, size_t to, int reset)
{
task_t* task = per_core(current_task);
int i, stats[13] = { 0 };
const char* labels[] = { [0] = "present", "writable", "user accessible", "write through", "cache disabled", // IA-32 "legacy" bits
"accessed", "dirty", "huge pages", "global", "svm", "svm lazy", "svm init",
[12] = "exec disabled" // IA-32e / PAE bits
};
int cb(page_entry_t* entry, int level) {
if (*entry & PG_PRESENT) {
if (!level || (*entry & PG_PSE)) {
// increment stat counters
int i;
for (i=0; i<12; i++) { // IA-32 "legacy" bits
if (*entry & (1 << i))
stats[i]++;
}
for (i=0; i<1; i++) { // IA-32e / PAE bits
if (*entry & (1UL << (63-i)))
stats[i+PAGE_BITS]++;
}
}
// reset accessed and dirty bits
if (reset) {
*entry &= ~(PG_ACCESSED|PG_DIRTY);
tlb_flush_one_page(entry_to_virt(entry, level)); // see IA32 Vol3 4.8
}
}
return 0;
}
// lock tables
spinlock_lock(&kslock);
spinlock_irqsave_lock(&task->page_lock);
page_iterate(from, to, cb, NULL);
// unlock tables
spinlock_unlock(&kslock);
spinlock_irqsave_unlock(&task->page_lock);
kprintf("total pages:\n");
for (i=0; i<13; i++)
kprintf(" - %s:%*lu\n", labels[i], 25-strlen(labels[i]), stats[i]);
}
int copy_page_map(task_t* new_task, int copy)
{
task_t* cur_task = per_core(current_task);
size_t phyaddr;
size_t ret;
int cb(page_entry_t* src, int level) {
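// the new page map is temporarily hooked in one PML4 slot below the
// self-mapping (see below), i.e. 512 GB = 2^39 bytes lower, which
// corresponds to an offset of 1<<36 page entries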
page_entry_t* dest = src - (1L<<36); // TODO
if (*src & PG_PRESENT) {
if (*src & PG_USER) {
if (copy) { // deep copy page frame
size_t phyaddr = get_page();
if (BUILTIN_EXPECT(!phyaddr, 0))
return -ENOMEM;
atomic_int32_inc(&cur_task->user_usage);
copy_page(phyaddr, *src & ~PAGE_FLAGS_MASK);
*dest = phyaddr | (*src & PAGE_FLAGS_MASK);
}
}
else // shallow copy kernel table
*dest = *src;
}
return 0;
}
// fixed mapping for paging structures
page_map_t *current = (page_map_t*) PAGE_MAP_PML4;
page_map_t *new = palloc(PAGE_SIZE, 0);
if (BUILTIN_EXPECT(!new, 0))
return -ENOMEM;
phyaddr = virt_to_phys((size_t) new);
// lock tables
spinlock_lock(&kslock);
spinlock_irqsave_lock(&cur_task->page_lock);
// map new table
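// hooking the new PML4 into the second-to-last PML4 slot makes its paging
// structures accessible at 0xFF0000000000 (see the layout comment above)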
current->entries[PAGE_MAP_ENTRIES-2] = phyaddr | PG_TABLE;
tlb_flush(); // ouch :(
// setup self reference for new table
new->entries[PAGE_MAP_ENTRIES-1] = phyaddr | PG_TABLE;
ret = page_iterate(0, PAGE_MAP_PGT - (1L<<39), cb, NULL); // TODO: check boundaries
// unlock tables
spinlock_irqsave_unlock(&cur_task->page_lock);
spinlock_unlock(&kslock);
// unmap new tables
current->entries[PAGE_MAP_ENTRIES-2] = 0;
tlb_flush(); // ouch :(
new_task->page_map = new;
kprintf("copy_page_map: allocated %i page tables\n", ret); // TODO: remove
return ret;
}
int drop_page_map(void)
{
task_t* task = per_core(current_task);
int cb(page_entry_t* entry, int level) {
if (*entry & PG_USER) {
kprintf("drop_page_map:cb: entry = %p, level = %u\n", entry, level); // TODO: remove
if (put_page(*entry & ~PAGE_FLAGS_MASK))
atomic_int32_dec(&task->user_usage);
}
return 0;
}
kprintf("drop_page_map: task = %u\n", task->id); // TODO: remove
// check assertions
if (BUILTIN_EXPECT(!task || !task->page_map, 0))
return -EINVAL;
if (BUILTIN_EXPECT(task->page_map == get_boot_page_map(), 0))
return -EINVAL;
// lock tables
spinlock_irqsave_lock(&task->page_lock);
page_iterate(0, PAGE_MAP_PGT, NULL, cb);
pfree(task->page_map, PAGE_SIZE);
// unlock tables
spinlock_irqsave_unlock(&task->page_lock);
return 0;
}
static int set_page_flags(size_t viraddr, uint32_t npages, int flags)
{
task_t* task = per_core(current_task);
size_t bits = page_bits(flags);
size_t start = viraddr;
size_t end = start + npages * PAGE_SIZE;
int cb(page_entry_t* entry, int level) {
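// intermediate levels only need PG_USER for user mappings;
// the full flag bits are written into the PGT entry itself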
if (level) {
if (flags & MAP_USER_SPACE)
*entry |= PG_USER;
}
else
*entry = (*entry & ~PAGE_FLAGS_MASK) | bits;
tlb_flush_one_page(entry_to_virt(entry, level));
return 0;
}
// check assertions
if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
return 0;
if (BUILTIN_EXPECT(!task || !task->page_map, 0))
return 0;
// lock tables
if (viraddr < KERNEL_SPACE)
spinlock_lock(&kslock);
else
spinlock_irqsave_lock(&task->page_lock);
int ret = page_iterate(start, end, cb, NULL);
// unlock tables
if (viraddr < KERNEL_SPACE)
spinlock_unlock(&kslock);
else
spinlock_irqsave_unlock(&task->page_lock);
return ret;
}
size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags)
{
task_t* task = per_core(current_task);
if (!viraddr) {
int vma_flags = VMA_HEAP;
if (flags & MAP_USER_SPACE)
vma_flags |= VMA_USER;
viraddr = vma_alloc(npages * PAGE_SIZE, vma_flags);
}
size_t bits = page_bits(flags);
size_t start = viraddr;
size_t end = start + npages * PAGE_SIZE;
int cb(page_entry_t* entry, int level) {
if (level) { // PGD, PDPT, PML4..
if (*entry & PG_PRESENT) {
if (flags & MAP_USER_SPACE) {
/*
* We are changing page map entries which cover
* the kernel. So before altering them we need to
* make a private copy for the task
*/
if (!(*entry & PG_USER)) {
size_t phyaddr = get_page();
if (BUILTIN_EXPECT(!phyaddr, 0))
return -ENOMEM;
atomic_int32_inc(&task->user_usage);
copy_page(phyaddr, *entry & ~PAGE_FLAGS_MASK);
*entry = phyaddr | (*entry & PAGE_FLAGS_MASK) | PG_USER;
/*
* We just need to flush the table itself.
* TLB entries for the kernel remain valid
* because we've not changed them.
*/
tlb_flush_one_page(entry_to_virt(entry, 0));
}
}
}
else {
size_t phyaddr = get_page();
if (BUILTIN_EXPECT(!phyaddr, 0))
return -ENOMEM;
atomic_int32_inc(&task->user_usage);
*entry = phyaddr | bits;
}
}
else { // PGT
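// write the final mapping (phyaddr advances by one frame per page);
// an already present entry is only overwritten if MAP_REMAP is set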
if ((*entry & PG_PRESENT) && !(flags & MAP_REMAP))
return -EINVAL;
*entry = phyaddr | bits;
if (flags & MAP_USER_SPACE)
atomic_int32_inc(&task->user_usage);
if (flags & MAP_REMAP)
tlb_flush_one_page(entry_to_virt(entry, level));
phyaddr += PAGE_SIZE;
}
return 0;
}
kprintf("map_region: map %u pages from %#lx to %#lx with flags: %#x\n", npages, viraddr, phyaddr, flags); // TODO: remove
// check assertions
if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
return 0;
if (BUILTIN_EXPECT(!task || !task->page_map, 0))
return 0;
if (BUILTIN_EXPECT(!viraddr, 0))
return 0;
// lock tables
if (viraddr < KERNEL_SPACE)
spinlock_lock(&kslock);
else
spinlock_irqsave_lock(&task->page_lock);
int ret = page_iterate(start, end, cb, NULL);
// unlock tables
if (viraddr < KERNEL_SPACE)
spinlock_unlock(&kslock);
else
spinlock_irqsave_unlock(&task->page_lock);
return (ret == 0) ? viraddr : 0;
}
int unmap_region(size_t viraddr, uint32_t npages)
{
task_t* task = per_core(current_task);
size_t start = viraddr;
size_t end = start + npages * PAGE_SIZE;
kprintf("unmap_region: unmap %u pages from %#lx\n", npages, viraddr); // TODO: remove
int cb(page_entry_t* entry, int level) {
if (level) { // PGD, PDPT, PML4
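// entry_to_virt(entry, 0) yields the self-mapped address of the table
// this entry references; release it if none of its entries is still present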
page_map_t* map = (page_map_t*) entry_to_virt(entry, 0);
int used = 0;
int i;
for (i=0; i<PAGE_MAP_ENTRIES; i++) {
if (map->entries[i] & PG_PRESENT)
used++;
}
if (!used) {
*entry &= ~PG_PRESENT;
tlb_flush_one_page(entry_to_virt(entry, 0));
if (put_page(*entry & ~PAGE_FLAGS_MASK))
atomic_int32_dec(&task->user_usage);
}
}
else { // PGT
*entry = 0;
tlb_flush_one_page(entry_to_virt(entry, level));
if (viraddr >= KERNEL_SPACE)
atomic_int32_dec(&task->user_usage);
}
return 0;
}
// check assertions
if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
return 0;
if (BUILTIN_EXPECT(!task || !task->page_map, 0))
return 0;
// lock tables
if (viraddr < KERNEL_SPACE)
spinlock_lock(&kslock);
else
spinlock_irqsave_lock(&task->page_lock);
int ret = page_iterate(start, end, NULL, cb);
// unlock tables
if (viraddr < KERNEL_SPACE)
spinlock_unlock(&kslock);
else
spinlock_irqsave_unlock(&task->page_lock);
return ret;
}
static void pagefault_handler(struct state *s)
{
task_t* task = per_core(current_task);
size_t viraddr = read_cr2();
// on demand userspace heap mapping
if ((task->heap) && (viraddr >= task->heap->start) && (viraddr < task->heap->end)) {
viraddr &= PAGE_MASK;
size_t phyaddr = get_page();
if (BUILTIN_EXPECT(!phyaddr, 0)) {
kprintf("out of memory: task = %u\n", task->id);
goto default_handler;
}
viraddr = map_region(viraddr, phyaddr, 1, MAP_USER_SPACE);
if (BUILTIN_EXPECT(!viraddr, 0)) {
kprintf("map_region: could not map %#lx to %#lx, task = %u\n", viraddr, phyaddr, task->id);
put_page(phyaddr);
goto default_handler;
}
memset((void*) viraddr, 0x00, PAGE_SIZE); // fill with zeros
return;
}
default_handler:
kprintf("Page Fault Exception (%d) at cs:rip = %#x:%#lx, core = %u, task = %u, addr = %#lx, error = %#x [ %s %s %s %s %s ]\n"
"Register state: rflags = %#lx, rax = %#lx, rbx = %#lx, rcx = %#lx, rdx = %#lx, rdi = %#lx, rsi = %#lx, rbp = %#llx, rsp = %#lx\n",
s->int_no, s->cs, s->rip, CORE_ID, task->id, viraddr, s->error,
(s->error & 0x4) ? "user" : "supervisor",
(s->error & 0x10) ? "instruction" : "data",
(s->error & 0x2) ? "write" : ((s->error & 0x10) ? "fetch" : "read"),
(s->error & 0x1) ? "protection" : "not present",
(s->error & 0x8) ? "reserved bit" : "\b",
s->rflags, s->rax, s->rbx, s->rcx, s->rdx, s->rdi, s->rsi, s->rbp, s->rsp);
irq_enable();
abort();
}
int arch_paging_init(void)
{
uint32_t i, npages;
// replace default pagefault handler
irq_uninstall_handler(14);
irq_install_handler(14, pagefault_handler);
// setup recursive paging
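// the last PML4 entry references the PML4 itself, so all paging structures
// are reachable near the top of the virtual address space (see layout above)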
boot_pml4.entries[PAGE_MAP_ENTRIES-1] = (size_t) &boot_pml4 | PG_TABLE;
/*
* In long mode the kernel is already mapped into the kernel space (see entry64.asm).
* This includes .data, .bss, .text, VGA, and the multiboot & multiprocessing (APIC) structures.
*/
#if MAX_CORES > 1
// reserve page for smp boot code
if (!map_region(SMP_SETUP_ADDR, SMP_SETUP_ADDR, 1, MAP_NO_CACHE | MAP_REMAP)) {
kputs("could not reserve page for smp boot code\n");
return -ENOMEM;
}
#endif
#ifdef CONFIG_MULTIBOOT
#if 0
// map reserved memory regions into the kernel space
if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MEM_MAP)) {
multiboot_memory_map_t* mmap = (multiboot_memory_map_t*) mb_info->mmap_addr;
multiboot_memory_map_t* mmap_end = (void*) ((size_t) mb_info->mmap_addr + mb_info->mmap_length);
while (mmap < mmap_end) {
if (mmap->type != MULTIBOOT_MEMORY_AVAILABLE) {
npages = mmap->len / PAGE_SIZE;
if ((mmap->addr+mmap->len) % PAGE_SIZE)
npages++;
map_region(mmap->addr, mmap->addr, npages, MAP_NO_CACHE | MAP_REMAP);
}
mmap++;
}
}
#endif
/*
* Modules like the init ram disk are already loaded.
* Therefore, we map these modules into the kernel space.
*/
if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MODS)) {
multiboot_module_t* mmodule = (multiboot_module_t*) ((size_t) mb_info->mods_addr);
npages = PAGE_FLOOR(mb_info->mods_count*sizeof(multiboot_module_t)) >> PAGE_BITS;
map_region((size_t) mmodule, (size_t) mmodule, npages, MAP_REMAP);
for(i=0; i<mb_info->mods_count; i++, mmodule++) {
// map physical address to the same virtual address
npages = PAGE_FLOOR(mmodule->mod_end - mmodule->mod_start) >> PAGE_BITS;
kprintf("Map module %s at %#x (%u pages)\n", (char*)(size_t) mmodule->cmdline, mmodule->mod_start, npages);
map_region((size_t) (mmodule->mod_start), (size_t) (mmodule->mod_start), npages, MAP_REMAP);
}
}
#endif
// we turned on paging => now, we are able to register our task
register_task();
// APIC registers into the kernel address space
map_apic();
return 0;
}