/*
 * Copyright 2014 Steffen Vogel, Chair for Operating Systems,
 *                               RWTH Aachen University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is part of MetalSVM.
 */

#include <metalsvm/stddef.h>
#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/string.h>
#include <metalsvm/errno.h>
#include <metalsvm/processor.h>
#include <metalsvm/spinlock.h>
#include <metalsvm/tasks.h>
#include <metalsvm/vma.h>
#include <metalsvm/mmu.h>

#include <asm/irq.h>
#include <asm/multiboot.h>
#include <asm/apic.h>
#include <asm/page.h>

/**
 * @author Steffen Vogel
 */

/*
 * Virtual Memory Layout of the standard configuration
 * (1 GB kernel space)
 *
 * 0x0000000000000000 - 0x00000000000FFFFF:	reserved for IO devices (1MB)
 * 0x0000000000100000 - 0x00000000008C2000:	Kernel (~8MB)
 * 0x00000000008c3000 - 0x0000000000973000:	Init Ramdisk (~2MB)
 *
 * 0x0001000000000000 - 0xffff000000000000:	Memory hole (48 bit VAS limitation)
 *
 * 0xFFFFFE8000000000 - 0xFFFFFEFFFFFFFFFF:	Page map dest for copy_page_map() (512GB)
 * 0xFFFFFF0000000000 - 0xFFFFFF7FFFFFFFFF:	Page map source for copy_page_map() (512GB)
 * 0xFFFFFF8000000000 - 0xFFFFFFFFFFFFFFFF:	Self-referenced page maps of the current task (512GB)
 */

/// Boot task's page map (set up by entryXX.asm)
extern page_entry_t boot_pml4[PAGE_MAP_ENTRIES];

/// Kernel space page map lock
static spinlock_t kslock = SPINLOCK_INIT;

/// Mapping of self referenced page map (at the end of the VAS)
// TODO: find a more generic initialization
#ifdef CONFIG_X86_32
static page_entry_t* const current_map = (page_entry_t*) (1 * PAGE_MAP_PGD);
static page_entry_t* const src_map     = (page_entry_t*) (2 * PAGE_MAP_PGD);
static page_entry_t* const dest_map    = (page_entry_t*) (3 * PAGE_MAP_PGD);
#elif defined(CONFIG_X86_64)
static page_entry_t* const current_map = (page_entry_t*) (1 * PAGE_MAP_PML4);
static page_entry_t* const src_map     = (page_entry_t*) (2 * PAGE_MAP_PML4);
static page_entry_t* const dest_map    = (page_entry_t*) (3 * PAGE_MAP_PML4);
#endif

#ifdef CONFIG_X86_32
static page_entry_t boot_pgd[PAGE_MAP_ENTRIES];
#endif

page_entry_t* get_boot_page_map(void)
{
#ifdef CONFIG_X86_32
	return boot_pgd;
#elif defined(CONFIG_X86_64)
	return boot_pml4;
#endif
}

/** @brief Dump the page mapping of the current task, merging ranges with equal flags. */
void page_dump(size_t mask)
{
	task_t* task = per_core(current_task);

	mask |= PG_PRESENT;

	size_t flags = 0;
	size_t start = 0;
	size_t end;

	void print(size_t start, size_t end, size_t flags) {
		size_t size = end - start;

		kprintf("%#018lx-%#018lx %#14lx %c%c%c%c%c%c\n", start, end, size,
			(mask & flags & PG_XD) ? '-' : 'x',
			(mask & flags & PG_GLOBAL) ? 'g' : '-',
			(mask & flags & PG_DIRTY) ? 'd' : '-',
			(mask & flags & PG_ACCESSED) ? 'a' : '-',
			(mask & flags & PG_USER) ? 'u' : '-',
			(mask & flags & PG_RW) ? 'w' : '-'
		);
	}

	void traverse(int level, page_entry_t* entry) {
		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
		for (; entry != stop; entry++) {
			if (*entry & PG_PRESENT) {
				if (level && !(*entry & PG_PSE))
					// do "pre-order" traversal
					// TODO: handle "inheritance" of page table flags (see get_page_flags())
					traverse(level-1, get_child_entry(entry));
				else {
					if (!flags) {
						flags = *entry & ~PAGE_MASK & mask;
						start = entry_to_virt(entry, level);
					}
					else if (flags != (*entry & ~PAGE_MASK & mask)) {
						end = entry_to_virt(entry, level);
						print(start, end, flags);

						flags = *entry & ~PAGE_MASK & mask;
						start = end;
					}
				}
			}
			else if (flags) {
				end = entry_to_virt(entry, level);
				print(start, end, flags);
				flags = 0;
			}
		}
	}

	// lock tables
	spinlock_lock(&kslock);
	spinlock_irqsave_lock(&task->page_lock);

	kprintf("%-18s-%18s %14s %-6s\n", "start", "end", "size", "flags"); // header

	traverse(PAGE_MAP_LEVELS-1, current_map);

	if (flags) // workaround to print last mapping
		print(start, 0L, flags);

	// unlock tables
	spinlock_irqsave_unlock(&task->page_lock);
	spinlock_unlock(&kslock);
}
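
/** @brief Print statistics about the page flags of the current address space.
 *
 * Walks the page map through the self-referenced mapping and counts how
 * often each flag bit is set. If reset is nonzero, the accessed and dirty
 * bits are cleared during the walk.
 *
 * @param reset clear PG_ACCESSED and PG_DIRTY while traversing
 */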
void page_stats(int reset)
{
	task_t* task = per_core(current_task);

	int i, stats[13] = { 0 };
	const char* labels[] = { [0] = "present", "writable", "user accessible",
		"write through", "cache disabled", // IA-32 "legacy" bits
		"accessed", "dirty", "huge pages", "global", "svm", "svm lazy", "svm init",
		[12] = "exec disabled" // IA-32e / PAE bits
	};

	void traverse(int level, page_entry_t* entry) {
		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
		for (; entry != stop; entry++) {
			if (*entry & PG_PRESENT) {
				if (level && !(*entry & PG_PSE))
					traverse(level-1, get_child_entry(entry));
				else {
					// increment stat counters
					int i;
					for (i=0; i<12; i++) { // IA-32 "legacy" bits
						if (*entry & (1 << i))
							stats[i]++;
					}
#ifdef CONFIG_X86_64
					for (i=0; i<1; i++) { // IA-32e / PAE bits
						if (*entry & ((size_t) 1 << (63-i)))
							stats[i+PAGE_BITS]++;
					}
#endif
					if (reset) { // reset accessed and dirty bits
						*entry &= ~(PG_ACCESSED|PG_DIRTY);
						tlb_flush_one_page(entry_to_virt(entry, level)); // see IA32 Vol3 4.8
					}
				}
			}
		}
	}

	// lock tables
	spinlock_lock(&kslock);
	spinlock_irqsave_lock(&task->page_lock);

	traverse(PAGE_MAP_LEVELS-1, current_map);

	// unlock tables
	spinlock_irqsave_unlock(&task->page_lock);
	spinlock_unlock(&kslock);

	kprintf("total pages:\n");
	for (i=0; i<13; i++)
		kprintf(" - %s:%*lu\n", labels[i], 25-strlen(labels[i]), stats[i]);
}

/** @brief Create a copy of the current (or boot) page map for a new task. */
int copy_page_map(task_t* new_task, int copy)
{
	task_t* cur_task = per_core(current_task);

	int traverse(int level, page_entry_t* src, page_entry_t* dest) {
		page_entry_t* stop = src + PAGE_MAP_ENTRIES;
		for (; src != stop; src++, dest++) {
			if (*src & PG_PRESENT) {
				if (*src & PG_USER) {
					// deep copy page frame
					size_t phyaddr = get_page();
					if (BUILTIN_EXPECT(!phyaddr, 0))
						return -ENOMEM;

					atomic_int32_inc(&cur_task->user_usage);

					copy_page(phyaddr, *src & PAGE_MASK);
					*dest = phyaddr | (*src & ~PAGE_MASK);

					// do "pre-order" traversal
					if (level && !(*src & PG_PSE)) {
						int ret = traverse(level-1, get_child_entry(src),
								get_child_entry(dest));
						if (ret < 0)
							return ret;
					}
				}
				else // shallow copy kernel table
					*dest = *src;
			}
			else // table does not exist
				*dest = 0;
		}

		return 0;
	}

	page_entry_t* src_virt = (copy) ? cur_task->page_map : get_boot_page_map();
	page_entry_t* dest_virt = (page_entry_t*) palloc(PAGE_SIZE, MAP_KERNEL_SPACE);
	if (BUILTIN_EXPECT(!dest_virt, 0))
		return -ENOMEM;

	size_t src_phys = virt_to_phys((size_t) src_virt);
	size_t dest_phys = virt_to_phys((size_t) dest_virt);

	// lock tables
	spinlock_lock(&kslock);
	spinlock_irqsave_lock(&cur_task->page_lock);

	kprintf("copy_page_map: copy = %u, src = %p (%p, %p), dest = %p (%p, %p)\n",
		copy, src_virt, src_phys, src_map, dest_virt, dest_phys, dest_map); // TODO: remove

	// temporarily map src and dest tables
	current_map[PAGE_MAP_ENTRIES-2] = (src_phys & PAGE_MASK) | (PG_TABLE & ~PG_RW); // source is read-only!
	current_map[PAGE_MAP_ENTRIES-3] = (dest_phys & PAGE_MASK) | PG_TABLE;
	//tlb_flush(); // ouch :(

	int ret = traverse(PAGE_MAP_LEVELS-1, src_map, dest_map);

	// setup self reference for new table
	dest_map[PAGE_MAP_ENTRIES-1] = dest_phys | PG_TABLE;

	// unmap temporary tables
	current_map[PAGE_MAP_ENTRIES-2] = 0;
	current_map[PAGE_MAP_ENTRIES-3] = 0;
	dest_map[PAGE_MAP_ENTRIES-2] = 0;
	dest_map[PAGE_MAP_ENTRIES-3] = 0;
	tlb_flush(); // ouch :(

	// unlock tables
	spinlock_irqsave_unlock(&cur_task->page_lock);
	spinlock_unlock(&kslock);

	new_task->page_map = dest_virt;

	return ret;
}
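
/** @brief Release the user page frames of the current task.
 *
 * Traverses the current page map in "post-order", puts every page frame
 * marked as a user page and finally switches the task back to the boot
 * page map.
 *
 * @return 0 on success, -EINVAL for an invalid task or the boot page map
 */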
int drop_page_map(void)
{
	task_t* task = per_core(current_task);

	void traverse(int level, page_entry_t* entry) {
		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
		for (; entry != stop; entry++) {
			if (*entry & PG_PRESENT) {
				// do "post-order" traversal
				if (level && !(*entry & PG_PSE))
					traverse(level-1, get_child_entry(entry));

				if (*entry & PG_USER) {
					kprintf("drop_page_map: entry = %p. level = %u\n", entry, level);

					if (put_page(*entry & PAGE_MASK))
						atomic_int32_dec(&task->user_usage);
				}
			}
		}
	}

	kprintf("drop_page_map: task = %u\n", task->id); // TODO: remove

	// check assertions
	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return -EINVAL;
	if (BUILTIN_EXPECT(task->page_map == get_boot_page_map(), 0))
		return -EINVAL;

	// lock tables
	spinlock_irqsave_lock(&task->page_lock);

	kprintf("user_usage: %u (task = %u)\n", atomic_int32_read(&task->user_usage), task->id);

	traverse(PAGE_MAP_LEVELS-1, current_map);
	put_page((size_t) task->page_map);

	// we replace the page table
	task->page_map = get_boot_page_map();
	tlb_flush();

	// unlock tables
	spinlock_irqsave_unlock(&task->page_lock);

	return 0;
}

/** @brief Change the page flags of an already mapped region. */
int set_page_flags(size_t viraddr, uint32_t npages, int flags)
{
	task_t* task = per_core(current_task);

	page_entry_t* first[PAGE_MAP_LEVELS];
	page_entry_t* last[PAGE_MAP_LEVELS];

	size_t bits = page_bits(flags);
	size_t start = viraddr;
	size_t end = start + npages * PAGE_SIZE;

	void traverse(int level, page_entry_t* entry) {
		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
		for (; entry != stop; entry++) {
			if (entry < last[level] && entry >= first[level]) {
				if ((*entry & PG_PRESENT) && !(*entry & PG_PSE)) {
					if (level) {
						if (flags & MAP_USER_SPACE)
							*entry |= PG_USER;
#ifdef CONFIG_X86_64
						if (flags & MAP_CODE)
							*entry &= ~PG_XD;
#endif
						// do "pre-order" traversal
						traverse(level-1, get_child_entry(entry));
					}
					else
						*entry = (*entry & PAGE_MASK) | bits;

					tlb_flush_one_page(entry_to_virt(entry, level));
				}
			}
		}
	}

	// check assertions
	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return 0;

	// calc page tree boundaries
	int i;
	for (i=0; i<PAGE_MAP_LEVELS; i++) {
		first[i] = virt_to_entry(start, i);
		last[i]  = virt_to_entry(end - 1, i) + 1; // exclusive
	}

	// lock tables
	if (start < KERNEL_SPACE)
		spinlock_lock(&kslock);
	if (end >= KERNEL_SPACE)
		spinlock_irqsave_lock(&task->page_lock);

	traverse(PAGE_MAP_LEVELS-1, current_map);

	// unlock tables
	if (start < KERNEL_SPACE)
		spinlock_unlock(&kslock);
	if (end >= KERNEL_SPACE)
		spinlock_irqsave_unlock(&task->page_lock);

	return 0;
}
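
/** @brief Map a physical memory region at a given virtual address.
 *
 * Missing page tables are allocated on demand; tables which still cover
 * kernel mappings are copied to a private table before user flags are set.
 * If viraddr is zero, a region is allocated from the VMA allocator.
 *
 * @return the mapped virtual address, or 0 on failure
 */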
size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags)
{
	task_t* task = per_core(current_task);

	page_entry_t* first[PAGE_MAP_LEVELS];
	page_entry_t* last[PAGE_MAP_LEVELS];

	// TODO: this behaviour should be deprecated
	if (!viraddr) {
		int vma_flags = VMA_HEAP;
		if (flags & MAP_USER_SPACE)
			vma_flags |= VMA_USER;

		viraddr = vma_alloc(npages * PAGE_SIZE, vma_flags);
	}

	size_t bits = page_bits(flags);
	size_t start = viraddr;
	size_t end = start + npages * PAGE_SIZE;

	int traverse(int level, page_entry_t* entry) {
		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
		for (; entry != stop; entry++) {
			if (entry < last[level] && entry >= first[level]) {
				if (level) { // PGD, PDPT, PML4..
					if (*entry & PG_PRESENT) {
						if ((flags & MAP_USER_SPACE) && !(*entry & PG_USER)) {
							/* We are altering entries which cover
							 * the kernel. So before changing them we need to
							 * make a private copy for the task */
							size_t phyaddr = get_page();
							if (BUILTIN_EXPECT(!phyaddr, 0))
								return -ENOMEM;

							atomic_int32_inc(&task->user_usage);

							copy_page(phyaddr, *entry & PAGE_MASK);
							*entry = phyaddr | (*entry & ~PAGE_MASK);
							*entry &= ~PG_GLOBAL;
							*entry |= PG_USER;

							/* We just need to flush the table itself.
							 * TLB entries for the kernel remain valid
							 * because we've not changed them. */
							tlb_flush_one_page(entry_to_virt(entry, 0));
						}
					}
					else {
						/* There's no page map table available
						 * which covers the region. Therefore we will create a
						 * new table. */
						size_t phyaddr = get_page();
						if (BUILTIN_EXPECT(!phyaddr, 0))
							return -ENOMEM;

						if (flags & MAP_USER_SPACE)
							atomic_int32_inc(&task->user_usage);

						*entry = phyaddr | bits;

						memset(get_child_entry(entry), 0x00, PAGE_SIZE); // fill with zeros
					}

					// do "pre-order" traversal if no hugepage
					if (!(*entry & PG_PSE)) {
						int ret = traverse(level-1, get_child_entry(entry));
						if (ret < 0)
							return ret;
					}
				}
				else { // PGT
					if ((*entry & PG_PRESENT) && !(flags & MAP_REMAP))
						return -EINVAL;

					*entry = phyaddr | bits;

					if (flags & MAP_USER_SPACE)
						atomic_int32_inc(&task->user_usage);

					if (flags & MAP_REMAP)
						tlb_flush_one_page(entry_to_virt(entry, level));

					phyaddr += PAGE_SIZE;
				}
			}
		}

		return 0;
	}

	kprintf("map_region: map %u pages from %#lx to %#lx with flags: %#x\n",
		npages, viraddr, phyaddr, flags); // TODO: remove

	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return 0;

	// calc page tree boundaries
	int i;
	for (i=0; i<PAGE_MAP_LEVELS; i++) {
		first[i] = virt_to_entry(start, i);
		last[i]  = virt_to_entry(end - 1, i) + 1; // exclusive
	}

	// lock tables
	if (start < KERNEL_SPACE)
		spinlock_lock(&kslock);
	if (end >= KERNEL_SPACE)
		spinlock_irqsave_lock(&task->page_lock);

	int ret = traverse(PAGE_MAP_LEVELS-1, current_map);

	// unlock tables
	if (start < KERNEL_SPACE)
		spinlock_unlock(&kslock);
	if (end >= KERNEL_SPACE)
		spinlock_irqsave_unlock(&task->page_lock);

	return (ret) ? 0 : viraddr;
}
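
/** @brief Unmap a range of pages starting at viraddr.
 *
 * Clears the present bit of the affected page table entries and releases
 * child tables which become empty during the "post-order" traversal.
 */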
int unmap_region(size_t viraddr, uint32_t npages)
{
	task_t* task = per_core(current_task);

	page_entry_t* first[PAGE_MAP_LEVELS];
	page_entry_t* last[PAGE_MAP_LEVELS];

	size_t start = viraddr;
	size_t end = start + npages * PAGE_SIZE;

	kprintf("unmap_region: unmap %u pages from %#lx\n", npages, viraddr); // TODO: remove

	/** @return number of page table entries which are present */
	int traverse(int level, page_entry_t* entry) {
		int used = 0;

		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
		for (; entry != stop; entry++) {
			if (entry < last[level] && entry >= first[level]) {
				if (level) { // PGD, PDPT, PML4
					if ((*entry & PG_PRESENT) && !(*entry & PG_PSE)) {
						// do "post-order" traversal if table is present and no hugepage
						if (traverse(level-1, get_child_entry(entry)))
							used++;
						else { // child table is empty => delete it
							*entry &= ~PG_PRESENT;
							tlb_flush_one_page(entry_to_virt(entry, 0));

							if (*entry & PG_USER) {
								if (put_page(*entry & PAGE_MASK))
									atomic_int32_dec(&task->user_usage);
							}
						}
					}
				}
				else { // PGT
					*entry &= ~PG_PRESENT;
					tlb_flush_one_page(entry_to_virt(entry, level));

					if (*entry & PG_USER)
						atomic_int32_dec(&task->user_usage);
				}
			}
			else {
				if (*entry & PG_PRESENT)
					used++;
			}
		}

		return used;
	}

	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
		return 0;

	// calc page tree boundaries
	int i;
	for (i=0; i<PAGE_MAP_LEVELS; i++) {
		first[i] = virt_to_entry(start, i);
		last[i]  = virt_to_entry(end - 1, i) + 1; // exclusive
	}

	// lock tables
	if (start < KERNEL_SPACE)
		spinlock_lock(&kslock);
	if (end >= KERNEL_SPACE)
		spinlock_irqsave_lock(&task->page_lock);

	traverse(PAGE_MAP_LEVELS-1, current_map);

	// unlock tables
	if (start < KERNEL_SPACE)
		spinlock_unlock(&kslock);
	if (end >= KERNEL_SPACE)
		spinlock_irqsave_unlock(&task->page_lock);

	return 0;
}

/** @brief Page fault handler: demand-maps the user heap, otherwise dumps state and aborts. */
static void pagefault_handler(struct state *s)
{
	task_t* task = per_core(current_task);
	size_t viraddr = read_cr2();

	// on demand userspace heap mapping
	if ((task->heap) && (viraddr >= task->heap->start) && (viraddr < task->heap->end)) {
		viraddr &= PAGE_MASK;

		size_t phyaddr = get_page();
		if (BUILTIN_EXPECT(!phyaddr, 0)) {
			kprintf("out of memory: task = %u\n", task->id);
			goto default_handler;
		}

		viraddr = map_region(viraddr, phyaddr, 1, MAP_USER_SPACE);
		if (BUILTIN_EXPECT(!viraddr, 0)) {
			kprintf("map_region: could not map %#lx to %#lx, task = %u\n",
				viraddr, phyaddr, task->id);
			put_page(phyaddr);

			goto default_handler;
		}

		memset((void*) viraddr, 0x00, PAGE_SIZE); // fill with zeros

		return;
	}

default_handler:
	kprintf("Page Fault Exception (%d) at cs:ip = %#x:%#lx, core = %u, task = %u, addr = %#lx, error = %#x [ %s %s %s %s %s ]\n",
		s->int_no, s->cs,
#ifdef CONFIG_X86_32
		s->eip,
#elif defined(CONFIG_X86_64)
		s->rip,
#endif
		CORE_ID, task->id, viraddr, s->error,
		(s->error & 0x4) ? "user" : "supervisor",
		(s->error & 0x10) ? "instruction" : "data",
		(s->error & 0x2) ? "write" : ((s->error & 0x10) ? "fetch" : "read"),
		(s->error & 0x1) ? "protection" : "not present",
		(s->error & 0x8) ? "reserved bit" : "\b");

	// TODO: move this to something like print_registers()
#ifdef CONFIG_X86_32
	kprintf("Register state: eflags = %#lx, eax = %#lx, ebx = %#lx, ecx = %#lx, edx = %#lx, edi = %#lx, esi = %#lx, ebp = %#lx, esp = %#lx\n",
		s->eflags, s->eax, s->ebx, s->ecx, s->edx, s->edi, s->esi, s->ebp, s->esp);
#elif defined(CONFIG_X86_64)
	kprintf("Register state: rflags = %#lx, rax = %#lx, rbx = %#lx, rcx = %#lx, rdx = %#lx, rdi = %#lx, rsi = %#lx, rbp = %#lx, rsp = %#lx\n",
		s->rflags, s->rax, s->rbx, s->rcx, s->rdx, s->rdi, s->rsi, s->rbp, s->rsp);
#endif

	irq_enable();
	abort();
}
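
/** @brief Initialize paging on this architecture.
 *
 * Installs the page fault handler, sets up the recursive (self-referenced)
 * page map entry and maps the SMP boot code, the multiboot modules and the
 * APIC registers into the kernel space.
 */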
"reserved bit" : "\b"); // TODO: move this to something like print_registers() #ifdef CONFIG_X86_32 kprintf("Register state: eflags = %#lx, eax = %#lx, ebx = %#lx, ecx = %#lx, edx = %#lx, edi = %#lx, esi = %#lx, ebp = %#llx, esp = %#lx\n", s->eflags, s->eax, s->ebx, s->ecx, s->edx, s->edi, s->esi, s->ebp, s->esp); #elif defined(CONFIG_X86_64) kprintf("Register state: rflags = %#lx, rax = %#lx, rbx = %#lx, rcx = %#lx, rdx = %#lx, rdi = %#lx, rsi = %#lx, rbp = %#llx, rsp = %#lx\n", s->rflags, s->rax, s->rbx, s->rcx, s->rdx, s->rdi, s->rsi, s->rbp, s->rsp); #endif irq_enable(); abort(); } int arch_paging_init(void) { uint32_t i, npages; // replace default pagefault handler irq_uninstall_handler(14); irq_install_handler(14, pagefault_handler); // setup recursive paging page_entry_t* boot_map = get_boot_page_map(); boot_map[PAGE_MAP_ENTRIES-1] = (size_t) boot_map | PG_TABLE; /* * In longmode the kernel is already maped into the kernel space (see entry64.asm) * this includes .data, .bss, .text, VGA, the multiboot & multiprocessing (APIC) structures */ #if MAX_CORES > 1 // reserve page for smp boot code if (!map_region(SMP_SETUP_ADDR, SMP_SETUP_ADDR, 1, MAP_NO_CACHE | MAP_REMAP)) { kputs("could not reserve page for smp boot code\n"); return -ENOMEM; } #endif #ifdef CONFIG_MULTIBOOT #if 0 // map reserved memory regions into the kernel space if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MEM_MAP)) { multiboot_memory_map_t* mmap = (multiboot_memory_map_t*) mb_info->mmap_addr; multiboot_memory_map_t* mmap_end = (void*) ((size_t) mb_info->mmap_addr + mb_info->mmap_length); while (mmap < mmap_end) { if (mmap->type != MULTIBOOT_MEMORY_AVAILABLE) { npages = mmap->len / PAGE_SIZE; if ((mmap->addr+mmap->len) % PAGE_SIZE) npages++; map_region(mmap->addr, mmap->addr, npages, MAP_NO_CACHE | MAP_REMAP); } mmap++; } } #endif /* * Modules like the init ram disk are already loaded. * Therefore, we map these modules into the kernel space. */ if (mb_info && (mb_info->flags & MULTIBOOT_INFO_MODS)) { multiboot_module_t* mmodule = (multiboot_module_t*) ((size_t) mb_info->mods_addr); npages = PAGE_FLOOR(mb_info->mods_count*sizeof(multiboot_module_t)) >> PAGE_BITS; map_region((size_t) mmodule, (size_t) mmodule, npages, MAP_REMAP); for(i=0; imods_count; i++, mmodule++) { // map physical address to the same virtual address npages = PAGE_FLOOR(mmodule->mod_end - mmodule->mod_start) >> PAGE_BITS; kprintf("Map module %s at %#x (%u pages)\n", (char*)(size_t) mmodule->cmdline, mmodule->mod_start, npages); map_region((size_t) (mmodule->mod_start), (size_t) (mmodule->mod_start), npages, MAP_REMAP); } } #endif // we turned on paging => now, we are able to register our task register_task(); // APIC registers into the kernel address space map_apic(); return 0; }