another rewrite of the paging code without nested functions

Steffen Vogel 2014-02-18 13:08:22 +01:00
parent 6e8ecad91f
commit 3203d53a83
4 changed files with 337 additions and 342 deletions


@@ -34,59 +34,44 @@
 /// Page offset bits
 #define PAGE_BITS 12
-/// The size of a single page in bytes
-#define PAGE_SIZE ( 1L << PAGE_BITS)
 #ifdef CONFIG_X86_32
-/// Number of page map indirections
-#define PAGE_MAP_LEVELS 2
-/// Page map bits
-#define PAGE_MAP_BITS 10
 /// Total operand width in bits
 #define BITS 32
 /// Linear/virtual address width
 #define VIRT_BITS BITS
 /// Physical address width (we dont support PAE)
 #define PHYS_BITS BITS
-#elif defined(CONFIG_X86_64)
-/// Number of page map indirections
-#define PAGE_MAP_LEVELS 4
 /// Page map bits
-#define PAGE_MAP_BITS 9
+#define PAGE_MAP_BITS 10
+/// Number of page map indirections
+#define PAGE_MAP_LEVELS 2
+/// Mask the page address without page map flags
+#define PAGE_MASK 0xFFFFF000
+#elif defined(CONFIG_X86_64)
 /// Total operand width in bits
 #define BITS 64
 /// Linear/virtual address width
 #define VIRT_BITS 48
 /// Physical address width (maximum value)
 #define PHYS_BITS 52
+/// Page map bits
+#define PAGE_MAP_BITS 9
+/// Number of page map indirections
+#define PAGE_MAP_LEVELS 4
+/// Mask the page address without page map flags
+#define PAGE_MASK 0x000FFFFFFFFFF000
 #endif
+/// The size of a single page in bytes
+#define PAGE_SIZE ( 1L << PAGE_BITS)
 /// The number of entries in a page map table
-#define PAGE_MAP_ENTRIES ( 1L << PAGE_MAP_BITS)
+#define PAGE_MAP_ENTRIES (1L << PAGE_MAP_BITS)
-/// Mask the page address
-#define PAGE_MASK (-1L << PAGE_BITS)
-/// Mask the entry in a page table
-#define PAGE_ENTRY_MASK (-1L << (PAGE_BITS-PAGE_MAP_BITS))
-/// Mask for all flag bits in a page map entry (including ignored bits)
-#define PAGE_FLAGS_MASK (~(-1L << PAGE_BITS) | (-1L << VIRT_BITS))
 /// Align to next page
 #define PAGE_FLOOR(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK)
 /// Align to page
 #define PAGE_CEIL(addr) ( (addr) & PAGE_MASK)
-/// Sign extension to get a valid canonical address (hack: by using arithmetic shifts)
-#define VIRT_SEXT(addr) ((ssize_t) addr << (BITS-VIRT_BITS) >> (BITS-VIRT_BITS))
-// base addresses of page map tables
-#ifdef CONFIG_X86_32
-#define PAGE_MAP_PGD 0xFFFFF000
-#define PAGE_MAP_PGT 0xFFC00000
-#elif defined(CONFIG_X86_64)
-#define PAGE_MAP_PML4 0xFFFFFFFFFFFFF000
-#define PAGE_MAP_PDPT 0xFFFFFFFFFFE00000
-#define PAGE_MAP_PGD 0xFFFFFFFFC0000000
-#define PAGE_MAP_PGT 0xFFFFFF8000000000
-#endif
 /// Page is present
 #define PG_PRESENT (1 << 0)
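Editor's note: the constants above describe a radix tree over the virtual address: the PAGE_BITS low bits select a byte within a page, and each of the PAGE_MAP_LEVELS levels consumes PAGE_MAP_BITS index bits. A minimal standalone sketch of that index arithmetic, assuming the x86_64 values from this hunk; map_index() is hypothetical and not part of the patch:

#include <stdio.h>
#include <stddef.h>

#define PAGE_BITS        12
#define PAGE_MAP_BITS    9
#define PAGE_MAP_LEVELS  4
#define PAGE_MAP_ENTRIES (1L << PAGE_MAP_BITS)

/* index of viraddr in the page map table at the given level
 * (level 3 = PML4, level 0 = PGT) */
static size_t map_index(size_t viraddr, int level)
{
	return (viraddr >> (PAGE_BITS + level * PAGE_MAP_BITS)) & (PAGE_MAP_ENTRIES - 1);
}

int main(void)
{
	size_t addr = 0x00007F0012345000UL;
	int level;

	for (level = PAGE_MAP_LEVELS - 1; level >= 0; level--)
		printf("level %d index: %zu\n", level, map_index(addr, level));
	return 0;
}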
@@ -125,28 +110,12 @@
 /// This is a whole set of flags (PRESENT,RW,GLOBAL) for kernelspace pages
 #define PG_PAGE (PG_PRESENT|PG_RW|PG_GLOBAL|PG_XD)
-/** @brief A single entry in a page map */
+/** @brief A single entry in a page map
+ *
+ * Usually used as a pointer to a mapped page map entry.
+ */
 typedef size_t page_entry_t;
-/** @brief General page map structure
- *
- * This page map structure is a general type for all indirection levels,
- * as all page map levels contain the same number of entries.
- * All page maps must be page aligned!
- */
-typedef struct page_map {
-	page_entry_t entries[PAGE_MAP_ENTRIES];
-} __attribute__ ((aligned (PAGE_SIZE))) page_map_t;
-/** @brief A callback type for the page map iterator
- *
- * @param entry A pointer to the current page map entry
- * @return
- *  - 0 if we want to skip underlying page tables
- *  - >0 if we want to recurse into underlying page tables
- */
-typedef int (*page_cb_t)(page_entry_t* entry, int level);
 /** @brief Converts a virtual address to a physical
  *
  * @param viraddr Virtual address to convert
@@ -227,7 +196,7 @@ int arch_paging_init(void);
  *
  * @return Returns the address of the boot task's page dir array.
  */
-page_map_t* get_boot_page_map(void);
+page_entry_t* get_boot_page_map(void);
 /** @brief Setup a new page directory for a new user-level task
  *
@@ -240,10 +209,10 @@ page_map_t* get_boot_page_map(void);
  */
 int copy_page_map(struct task* task, int copy);
-/** @brief Delete all page map structures of the current task
+/** @brief Deletes all user page map structures of the current task
  *
- * Puts PML4, PDPT, PGD, PGT tables back to buffer and
- * sets the task's page map pointer to NULL
+ * All allocated physical page frames are released in the bitmap.
+ * task->page_map is replaced by get_boot_page_map().
  *
  * @return
  *  - 0 on success
@@ -264,15 +233,18 @@ int drop_page_map(void);
  *  - 0 on success
  *  - -EINVAL (-22) on failure.
  */
-int change_page_permissions(size_t start, size_t end, uint32_t flags);
+int set_page_flags(size_t viraddr, uint32_t npages, int flags);
-/** @brief Dump mapped memory */
-void page_dump(size_t start, size_t end);
+/** @brief Dump mapped memory
+ *
+ * @param mask Only watch for changes in these page flags (PG_PRESENT is set by default)
+ */
+void page_dump(size_t mask);
 /** @brief Print stats about page flags
  *
  * @param reset Reset accessed and dirty bits in page tables
  */
-void page_stats(size_t start, size_t end, int reset);
+void page_stats(int reset);
 #endif


@@ -50,7 +50,7 @@ size_t* get_current_stack(void)
 #endif
 	// use new page table
-	write_cr3(virt_to_phys((size_t)curr_task->page_map));
+	write_cr3(virt_to_phys((size_t) curr_task->page_map));
 	return curr_task->last_stack_pointer;
 }


@@ -37,138 +37,83 @@
  * Virtual Memory Layout of the standard configuration
  * (1 GB kernel space)
  *
- * 0x000000000000 - 0x0000000FFFFF: reserved for IO devices (16MB)
- * 0x000000100000 - 0x00000DEADFFF: Kernel (size depends on the configuration) (221MB)
- * 0x00000DEAE000 - 0x00003FFFFFFF: Kernel heap
- * 0xFF0000000000 - 0xFF7FFFFFFFFF: Paging structures for copying a page map (max 512GB)
- * 0xFF8000000000 - 0xFFFFFFFFFFFF: Paging structures are mapped in this region (max 512GB)
+ * 0x0000000000000000 - 0x00000000000FFFFF: reserved for IO devices (16MB)
+ * 0x0000000000100000 - 0x00000000008C2000: Kernel (~8MB)
+ * 0x00000000008c3000 - 0x0000000000973000: Init Ramdisk (~2MB)
+ *
+ * 0x0001000000000000 - 0xffff000000000000: Memory hole (48 bit VAS limitation)
+ *
+ * 0xFFFFFE8000000000 - 0xFFFFFEFFFFFFFFFF: Page map dest for copy_page_map() (512GB)
+ * 0xFFFFFF0000000000 - 0xFFFFFF7FFFFFFFFF: Page map source for copy_page_map() (512GB)
+ * 0xFFFFFF8000000000 - 0xFFFFFFFFFFFFFFFF: Self-referenced page maps of the current task (512GB)
  */
-/// Boot task's page map
-extern page_map_t boot_pml4;
+/// Boot task's page map (setup by entryXX.asm)
+extern page_entry_t boot_pml4[PAGE_MAP_ENTRIES];
 /// Kernel space page map lock
 static spinlock_t kslock = SPINLOCK_INIT;
+/// Mapping of self referenced page map (at the end of the VAS)
+static page_entry_t* const current_map = (page_entry_t*) (-1*PAGE_SIZE);
+static page_entry_t* const src_map = (page_entry_t*) (-2*PAGE_SIZE);
+static page_entry_t* const dest_map = (page_entry_t*) (-3*PAGE_SIZE);
-page_map_t* get_boot_page_map(void)
+page_entry_t* get_boot_page_map(void)
 {
-	return &boot_pml4;
+	return boot_pml4;
 }
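Editor's note: current_map relies on the self-reference installed in the last PML4 slot (see arch_paging_init() below): because the top slot points back at the PML4 itself, the MMU's own tables appear inside the virtual address space and the entry for any address can be computed arithmetically. A hypothetical, standalone version of that computation, assuming the 48-bit/9-bit constants from this commit; entry_of() is illustrative, while the patch's own virt_to_entry()/get_child_entry() helpers are defined outside the hunks shown here:

#include <assert.h>
#include <stdint.h>

#define PAGE_BITS     12
#define PAGE_MAP_BITS 9
#define VIRT_BITS     48

/* virtual address of the page map entry for viraddr at the given level
 * (0 = PGT entry, 3 = PML4 entry), given a self-reference in PML4[511] */
static uint64_t entry_of(uint64_t viraddr, int level)
{
	uint64_t idx  = (viraddr & ((1ULL << VIRT_BITS) - 1)) >> (PAGE_BITS + level * PAGE_MAP_BITS);
	uint64_t base = ~0ULL << (VIRT_BITS - (level + 1) * PAGE_MAP_BITS);
	return base | (idx << 3);
}

int main(void)
{
	/* the PML4 maps itself into the very last page: current_map == -1*PAGE_SIZE */
	assert(entry_of(0, 3) == 0xFFFFFFFFFFFFF000ULL);
	/* PGT entries start at the base the old code called PAGE_MAP_PGT */
	assert(entry_of(0, 0) == 0xFFFFFF8000000000ULL);
	return 0;
}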
-/** @brief Recursive traversal through the page map tree
- *
- * @param start The first address whose page map entry we will call on
- * @param end The exclusive end address whose page map entry we will call on
- * @param pre Callback which is called for every page map entry (pre-order traversal)
- * @param post Callback which is called for every page map entry (post-order traversal)
- */
-int page_iterate(size_t start, size_t end, page_cb_t pre, page_cb_t post)
-{
-	page_entry_t* entry[PAGE_MAP_LEVELS];
-	page_entry_t* last[PAGE_MAP_LEVELS];
-
-	if (BUILTIN_EXPECT(start >= end, 0))
-		return -EINVAL;
-
-	// setup subtree boundaries
-	int i;
-	for (i=0; i<PAGE_MAP_LEVELS; i++) {
-		entry[i] = virt_to_entry(start, i);
-		last[i] = virt_to_entry(end - 1, i);
-	}
-
-	// nested iterator function (sees the scope of parent)
-	int iterate(int level) {
-		int ret;
-		while (entry[level] <= last[level]) {
-			if (pre) { // call pre-order callback if available
-				ret = pre(entry[level], level);
-				if (BUILTIN_EXPECT(ret < 0, 0))
-					return ret;
-			}
-
-			// recurse if
-			// - we are not in the PGT
-			// - and the inferior page table is present
-			// - and the current entry represents no huge page
-			if (level && (*entry[level] & PG_PRESENT) && !(*entry[level] & PG_PSE)) {
-				ret = iterate(level-1);
-				if (BUILTIN_EXPECT(ret < 0, 0))
-					return ret;
-			}
-			// or skip the entries we've omitted...
-			else {
-				size_t next = (size_t) (entry[level]+1);
-				for (i=0; i<level; i++)
-					entry[i] = (page_entry_t*) (next << (PAGE_MAP_BITS*(level-i)));
-			}
-
-			if (post) { // call post-order callback if available
-				ret = post(entry[level], level);
-				if (BUILTIN_EXPECT(ret < 0, 0))
-					return ret;
-			}
-
-			// return if we've reached the end of table
-			entry[level]++;
-			if (((size_t) entry[level] & ~PAGE_MASK) == 0x000) // TODO
-				return 0;
-		}
-
-		return 0;
-	}
-
-	// we start at the highest order table (PML4 or PGD)
-	return iterate(PAGE_MAP_LEVELS-1);
-}
-
-void page_dump(size_t from, size_t to)
+void page_dump(size_t mask)
 {
 	task_t* task = per_core(current_task);
+
+	mask |= PG_PRESENT;
+
 	size_t flags = 0;
 	size_t start = 0;
+	size_t end;
 
 	void print(size_t start, size_t end, size_t flags) {
 		size_t size = end - start;
 		kprintf("%#018lx-%#018lx %#14x %c%c%c%c%c%c\n", start, end, size,
-			(flags & PG_XD) ? '-' : 'x',
-			(flags & PG_GLOBAL) ? 'g' : '-',
-			(flags & PG_DIRTY) ? 'd' : '-',
-			(flags & PG_ACCESSED) ? 'a' : '-',
-			(flags & PG_USER) ? 'u' : '-',
-			(flags & PG_RW) ? 'w' : '-'
+			(mask & flags & PG_XD) ? '-' : 'x',
+			(mask & flags & PG_GLOBAL) ? 'g' : '-',
+			(mask & flags & PG_DIRTY) ? 'd' : '-',
+			(mask & flags & PG_ACCESSED) ? 'a' : '-',
+			(mask & flags & PG_USER) ? 'u' : '-',
+			(mask & flags & PG_RW) ? 'w' : '-'
 		);
 	}
 
-	int cb(page_entry_t* entry, int level) {
-		size_t end;
-
-		if (*entry & PG_PRESENT) {
-			if (!level || (*entry & PG_PSE)) {
-				if (!flags) {
-					flags = *entry & PAGE_FLAGS_MASK;
-					start = entry_to_virt(entry, level);
-				}
-				else if (flags != (*entry & PAGE_FLAGS_MASK)) {
-					end = entry_to_virt(entry, level);
-					print(start, end, flags);
-					start = end;
-					flags = *entry & PAGE_FLAGS_MASK;
-				}
-			}
-		}
-		else if (flags) {
-			end = entry_to_virt(entry, level);
-			print(start, end, flags);
-			flags = 0;
-		}
-
-		return 0;
-	}
+	void traverse(int level, page_entry_t* entry) {
+		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
+		for (; entry != stop; entry++) {
+			if (*entry & PG_PRESENT) {
+				if (level && !(*entry & PG_PSE)) // do "pre-order" traversal
+					// TODO: handle "inheritance" of page table flags (see get_page_flags())
+					traverse(level-1, get_child_entry(entry));
+				else {
+					if (!flags) {
+						flags = *entry & ~PAGE_MASK & mask;
+						start = entry_to_virt(entry, level);
+					}
+					else if (flags != (*entry & ~PAGE_MASK & mask)) {
+						end = entry_to_virt(entry, level);
+						print(start, end, flags);
+						flags = *entry & ~PAGE_MASK & mask;
+						start = end;
+					}
+				}
+			}
+			else if (flags) {
+				end = entry_to_virt(entry, level);
+				print(start, end, flags);
+				flags = 0;
+			}
+		}
+	}
 
 	// lock tables
@@ -176,18 +121,18 @@ void page_dump(size_t from, size_t to)
 	spinlock_irqsave_lock(&task->page_lock);
 
 	kprintf("%-18s-%18s %14s %-6s\n", "start", "end", "size", "flags"); // header
 
-	page_iterate(from, to, cb, NULL);
+	traverse(PAGE_MAP_LEVELS-1, current_map);
+
+	if (flags) // workaround to print last mapping
+		print(start, 0L, flags);
 
 	// unlock tables
-	spinlock_unlock(&kslock);
 	spinlock_irqsave_unlock(&task->page_lock);
+	spinlock_unlock(&kslock);
-
-	// workaround to print last mapping
-	if (flags)
-		print(start, PAGE_FLOOR(to), flags);
 }
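Editor's note: with the new signature the caller chooses which flag transitions split the dump into regions, and PG_PRESENT is always implied. An assumed usage, not taken from this diff:

/* dump all present mappings, starting a new region whenever the
 * user-accessible or writable bit changes */
page_dump(PG_USER | PG_RW);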
-void page_stats(size_t from, size_t to, int reset)
+void page_stats(int reset)
 {
 	task_t* task = per_core(current_task);
@@ -197,40 +142,42 @@ void page_stats(size_t from, size_t to, int reset)
 		[12] = "exec disabled" // IA-32e / PAE bits
 	};
 
-	int cb(page_entry_t* entry, int level) {
-		if (*entry & PG_PRESENT) {
-			if (!level || (*entry & PG_PSE)) {
-				// increment stat counters
-				int i;
-				for (i=0; i<12; i++) { // IA-32 "legacy" bits
-					if (*entry & (1 << i))
-						stats[i]++;
-				}
-				for (i=0; i<1; i++) { // IA-32e / PAE bits
-					if (*entry & (1 << (63-i)))
-						stats[i+PAGE_BITS]++;
-				}
-			}
-
-			// reset accessed and dirty bits
-			if (reset) {
-				*entry &= ~(PG_ACCESSED|PG_DIRTY);
-				tlb_flush_one_page(entry_to_virt(entry, level)); // see IA32 Vol3 4.8
-			}
-		}
-
-		return 0;
-	}
+	void traverse(int level, page_entry_t* entry) {
+		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
+		for (; entry != stop; entry++) {
+			if (*entry & PG_PRESENT) {
+				if (level && !(*entry & PG_PSE))
+					traverse(level-1, get_child_entry(entry));
+				else {
+					// increment stat counters
+					int i;
+					for (i=0; i<12; i++) { // IA-32 "legacy" bits
+						if (*entry & (1 << i))
+							stats[i]++;
+					}
+					for (i=0; i<1; i++) { // IA-32e / PAE bits
+						if (*entry & (1 << (63-i)))
+							stats[i+PAGE_BITS]++;
+					}
+
+					if (reset) { // reset accessed and dirty bits
+						*entry &= ~(PG_ACCESSED|PG_DIRTY);
+						tlb_flush_one_page(entry_to_virt(entry, level)); // see IA32 Vol3 4.8
+					}
+				}
+			}
+		}
+	}
 
 	// lock tables
 	spinlock_lock(&kslock);
 	spinlock_irqsave_lock(&task->page_lock);
 
-	page_iterate(from, to, cb, NULL);
+	traverse(PAGE_MAP_LEVELS-1, current_map);
 
 	// unlock tables
-	spinlock_unlock(&kslock);
 	spinlock_irqsave_unlock(&task->page_lock);
+	spinlock_unlock(&kslock);
 
 	kprintf("total pages:\n");
 	for (i=0; i<13; i++)
@@ -241,64 +188,77 @@ int copy_page_map(task_t* new_task, int copy)
 {
 	task_t* cur_task = per_core(current_task);
 
-	size_t phyaddr;
-	size_t ret;
-
-	int cb(page_entry_t* src, int level) {
-		page_entry_t* dest = src - (1L<<36); // TODO
-
-		if (*src & PG_PRESENT) {
-			if (*src & PG_USER) {
-				if (copy) { // deep copy page frame
-					size_t phyaddr = get_page();
-					if (BUILTIN_EXPECT(!phyaddr, 0))
-						return -ENOMEM;
-
-					atomic_int32_inc(&cur_task->user_usage);
-
-					copy_page(phyaddr, *src & ~PAGE_FLAGS_MASK);
-					*dest = phyaddr | (*src & PAGE_FLAGS_MASK);
-				}
-				else // shallow copy kernel table
-					*dest = *src;
-			}
-			else // shallow copy kernel table
-				*dest = *src;
-		}
-
-		return 0;
-	}
+	int traverse(int level, page_entry_t* src, page_entry_t* dest) {
+		page_entry_t* stop = src + PAGE_MAP_ENTRIES;
+		for (; src != stop; src++, dest++) {
+			if (*src & PG_PRESENT) {
+				if (*src & PG_USER) { // deep copy page frame
+					kprintf("copy_page_map: deep src = %p, dest = %p, level = %u\n", src, dest, level); // TODO: remove
+
+					size_t phyaddr = get_page();
+					if (BUILTIN_EXPECT(!phyaddr, 0))
+						return -ENOMEM;
+
+					atomic_int32_inc(&cur_task->user_usage);
+
+					copy_page(phyaddr, *src & PAGE_MASK);
+					*dest = phyaddr | (*src & ~PAGE_MASK);
+
+					// do "pre-order" traversal
+					if (level && !(*src & PG_PSE)) {
+						int ret = traverse(level-1, get_child_entry(src),
+								get_child_entry(dest));
+						if (ret < 0)
+							return ret;
+					}
+				}
+				else // shallow copy kernel table
+					*dest = *src;
+			}
+			else // table does not exist
+				*dest = 0;
+		}
+
+		return 0;
+	}
 
-	// fixed mapping for paging structures
-	page_map_t *current = (page_map_t*) PAGE_MAP_PML4;
-	page_map_t *new = palloc(PAGE_SIZE, 0);
-	if (BUILTIN_EXPECT(!new, 0))
+	page_entry_t* src_virt = (copy) ? cur_task->page_map : get_boot_page_map();
+	page_entry_t* dest_virt = (page_entry_t*) palloc(PAGE_SIZE, MAP_KERNEL_SPACE);
+	if (BUILTIN_EXPECT(!dest_virt, 0))
 		return -ENOMEM;
 
-	phyaddr = virt_to_phys((size_t) new);
+	size_t src_phys = virt_to_phys((size_t) src_virt);
+	size_t dest_phys = virt_to_phys((size_t) dest_virt);
 
 	// lock tables
 	spinlock_lock(&kslock);
 	spinlock_irqsave_lock(&cur_task->page_lock);
 
-	// map new table
-	current->entries[PAGE_MAP_ENTRIES-2] = phyaddr | PG_TABLE;
+	kprintf("copy_page_map: copy = %u, src = %p (%p, %p), dest = %p (%p, %p)\n",
+		copy, src_virt, src_phys, src_map, dest_virt, dest_phys, dest_map); // TODO: remove
+
+	// temporary map src and dest tables
+	current_map[PAGE_MAP_ENTRIES-2] = (src_phys & PAGE_MASK) | (PG_TABLE & ~PG_RW); // source is read-only!
+	current_map[PAGE_MAP_ENTRIES-3] = (dest_phys & PAGE_MASK) | PG_TABLE;
 	tlb_flush(); // ouch :(
 
-	// setup self reference for new table
-	new->entries[PAGE_MAP_ENTRIES-1] = phyaddr | PG_TABLE;
-
-	ret = page_iterate(0, PAGE_MAP_PGT - (1L<<39), cb, NULL); // TODO: check boundaries
+	int ret = traverse(PAGE_MAP_LEVELS-1, src_map, dest_map);
+
+	// setup self reference for new table
+	dest_map[PAGE_MAP_ENTRIES-1] = dest_phys | PG_TABLE;
+
+	// unmap temporary tables
+	current_map[PAGE_MAP_ENTRIES-2] = 0;
+	current_map[PAGE_MAP_ENTRIES-3] = 0;
+	tlb_flush(); // ouch :(
 
 	// unlock tables
 	spinlock_irqsave_unlock(&cur_task->page_lock);
 	spinlock_unlock(&kslock);
 
-	// unmap new tables
-	current->entries[PAGE_MAP_ENTRIES-2] = 0;
-	tlb_flush(); // ouch :(
-
-	new_task->page_map = new;
-
-	kprintf("copy_page_map: allocated %i page tables\n", ret); // TODO: remove
+	new_task->page_map = dest_virt;
 
 	return ret;
 }
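Editor's note: a standalone check of the window addresses used above, assuming 4 KiB pages. It only restates the pointer arithmetic behind current_map/src_map/dest_map: with the PML4 self-referenced in its last slot, the table whose physical address sits in slot 511-n appears at -(n+1)*PAGE_SIZE, so the three windows occupy the three highest virtual pages:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096L

int main(void)
{
	uintptr_t current_map = (uintptr_t) (-1 * PAGE_SIZE); // the PML4 itself (slot 511)
	uintptr_t src_map     = (uintptr_t) (-2 * PAGE_SIZE); // table installed in slot 510
	uintptr_t dest_map    = (uintptr_t) (-3 * PAGE_SIZE); // table installed in slot 509

	assert(current_map == 0xFFFFFFFFFFFFF000UL);
	assert(src_map     == 0xFFFFFFFFFFFFE000UL);
	assert(dest_map    == 0xFFFFFFFFFFFFD000UL);
	return 0;
}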
@@ -307,15 +267,21 @@ int drop_page_map(void)
 {
 	task_t* task = per_core(current_task);
 
-	int cb(page_entry_t* entry, int level) {
-		if (*entry & PG_USER) {
-			kprintf("drop_page_map:cb: entry = %p, level = %u\n", entry, level); // TODO: remove
-
-			if (put_page(*entry & ~PAGE_FLAGS_MASK))
-				atomic_int32_dec(&task->user_usage);
-		}
-
-		return 0;
-	}
+	void traverse(int level, page_entry_t* entry) {
+		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
+		for (; entry != stop; entry++) {
+			if (*entry & PG_PRESENT) {
+				// do "post-order" traversal
+				if (level && !(*entry & PG_PSE))
+					traverse(level-1, get_child_entry(entry));
+
+				if (*entry & PG_USER) {
+					kprintf("drop_page_map: entry = %p, level = %u\n", entry, level);
+					if (put_page(*entry & PAGE_MASK))
+						atomic_int32_dec(&task->user_usage);
+				}
+			}
+		}
+	}
 
 	kprintf("drop_page_map: task = %u\n", task->id); // TODO: remove
@@ -329,9 +295,15 @@ int drop_page_map(void)
 	// lock tables
 	spinlock_irqsave_lock(&task->page_lock);
 
-	page_iterate(0, PAGE_MAP_PGT, NULL, cb);
+	kprintf("user_usage: %u (task = %u)\n", atomic_int32_read(&task->user_usage), task->id);
+
+	traverse(PAGE_MAP_LEVELS-1, current_map);
+	put_page((size_t) task->page_map);
 
-	pfree(task->page_map, PAGE_SIZE);
+	// we replace the page table
+	task->page_map = get_boot_page_map();
+	tlb_flush();
 
 	// unlock tables
 	spinlock_irqsave_unlock(&task->page_lock);
@@ -339,54 +311,77 @@ int drop_page_map(void)
 	return 0;
 }
 
-static int set_page_flags(size_t viraddr, uint32_t npages, int flags)
+int set_page_flags(size_t viraddr, uint32_t npages, int flags)
 {
 	task_t* task = per_core(current_task);
+	page_entry_t* first[PAGE_MAP_LEVELS];
+	page_entry_t* last[PAGE_MAP_LEVELS];
 
 	size_t bits = page_bits(flags);
 	size_t start = viraddr;
 	size_t end = start + npages * PAGE_SIZE;
 
-	int cb(page_entry_t* entry, int level) {
-		if (level) {
-			if (flags & MAP_USER_SPACE)
-				*entry |= PG_USER;
-		}
-		else
-			*entry = (*entry & ~PAGE_FLAGS_MASK) | bits;
-
-		tlb_flush_one_page(entry_to_virt(entry, level));
-
-		return 0;
-	}
+	void traverse(int level, page_entry_t* entry) {
+		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
+		for (; entry != stop; entry++) {
+			if (entry < last[level] && entry >= first[level]) {
+				if ((*entry & PG_PRESENT) && !(*entry & PG_PSE)) {
+					if (level) {
+						if (flags & MAP_USER_SPACE)
+							*entry |= PG_USER;
+#ifdef CONFIG_X86_64
+						if (flags & MAP_CODE)
+							*entry &= ~PG_XD;
+#endif
+
+						// do "pre-order" traversal
+						traverse(level-1, get_child_entry(entry));
+					}
+					else
+						*entry = (*entry & PAGE_MASK) | bits;
+
+					tlb_flush_one_page(entry_to_virt(entry, level));
+				}
+			}
+		}
+	}
 
 	// check assertions
+	if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
+		return 0;
 	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
 		return 0;
 
+	// calc page tree boundaries
+	int i;
+	for (i=0; i<PAGE_MAP_LEVELS; i++) {
+		first[i] = virt_to_entry(start, i);
+		last[i] = virt_to_entry(end - 1, i) + 1; // exclusive
+	}
+
 	// lock tables
-	if (viraddr < KERNEL_SPACE)
+	if (start < KERNEL_SPACE)
 		spinlock_lock(&kslock);
-	else
+	if (end >= KERNEL_SPACE)
 		spinlock_irqsave_lock(&task->page_lock);
 
-	int ret = page_iterate(start, end, cb, NULL);
+	traverse(PAGE_MAP_LEVELS-1, current_map);
 
 	// unlock tables
-	if (viraddr < KERNEL_SPACE)
-		spinlock_lock(&kslock);
-	else
-		spinlock_irqsave_lock(&task->page_lock);
+	if (start < KERNEL_SPACE)
+		spinlock_unlock(&kslock);
+	if (end >= KERNEL_SPACE)
+		spinlock_irqsave_unlock(&task->page_lock);
 
-	return ret;
+	return 0;
 }
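Editor's note: an assumed call site, not part of the diff. set_page_flags() now takes a page count instead of an end address and only touches entries inside the [first, last) boundaries computed above; MAP_CODE clears PG_XD on x86_64 as shown:

/* make npages at viraddr user-accessible and executable (hypothetical call) */
set_page_flags(viraddr, npages, MAP_USER_SPACE | MAP_CODE);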
 size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags)
 {
 	task_t* task = per_core(current_task);
+	page_entry_t* first[PAGE_MAP_LEVELS];
+	page_entry_t* last[PAGE_MAP_LEVELS];
 
+	// TODO: this behaviour should be deprecated
 	if (!viraddr) {
 		int vma_flags = VMA_HEAP;
 		if (flags & MAP_USER_SPACE)
@@ -399,57 +394,66 @@ size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags)
 	size_t start = viraddr;
 	size_t end = start + npages * PAGE_SIZE;
 
-	int cb(page_entry_t* entry, int level) {
-		if (level) { // PGD, PDPT, PML4..
-			if (*entry & PG_PRESENT) {
-				if (flags & MAP_USER_SPACE) {
-					/*
-					 * We are changing page map entries which cover
-					 * the kernel. So before altering them we need to
-					 * make a private copy for the task
-					 */
-					if (!(*entry & PG_USER)) {
-						size_t phyaddr = get_page();
-						if (BUILTIN_EXPECT(!phyaddr, 0))
-							return -ENOMEM;
-
-						atomic_int32_inc(&task->user_usage);
-
-						copy_page(phyaddr, *entry & ~PAGE_FLAGS_MASK);
-						*entry = phyaddr | (*entry & PAGE_FLAGS_MASK) | PG_USER;
-
-						/*
-						 * We just need to flush the table itself.
-						 * TLB entries for the kernel remain valid
-						 * because we've not changed them.
-						 */
-						tlb_flush_one_page(entry_to_virt(entry, 0));
-					}
-				}
-			}
-			else {
-				size_t phyaddr = get_page();
-				if (BUILTIN_EXPECT(!phyaddr, 0))
-					return -ENOMEM;
-
-				atomic_int32_inc(&task->user_usage);
-
-				*entry = phyaddr | bits;
-			}
-		}
-		else { // PGT
-			if ((*entry & PG_PRESENT) && !(flags & MAP_REMAP))
-				return -EINVAL;
-
-			*entry = phyaddr | bits;
-
-			if (flags & MAP_USER_SPACE)
-				atomic_int32_inc(&task->user_usage);
-
-			if (flags & MAP_REMAP)
-				tlb_flush_one_page(entry_to_virt(entry, level));
-
-			phyaddr += PAGE_SIZE;
-		}
-
-		return 0;
+	int traverse(int level, page_entry_t* entry) {
+		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
+		for (; entry != stop; entry++) {
+			if (entry < last[level] && entry >= first[level]) {
+				if (level) { // PGD, PDPT, PML4..
+					if (*entry & PG_PRESENT) {
+						if ((flags & MAP_USER_SPACE) && !(*entry & PG_USER)) {
+							/* We are changing page map entries which cover
+							 * the kernel. So before altering them we need to
+							 * make a private copy for the task */
+							size_t phyaddr = get_page();
+							if (BUILTIN_EXPECT(!phyaddr, 0))
+								return -ENOMEM;
+
+							atomic_int32_inc(&task->user_usage);
+
+							copy_page(phyaddr, *entry & PAGE_MASK);
+							*entry = phyaddr | (*entry & ~PAGE_MASK) | PG_USER;
+
+							/* We just need to flush the table itself.
+							 * TLB entries for the kernel remain valid
+							 * because we've not changed them. */
+							tlb_flush_one_page(entry_to_virt(entry, 0));
+						}
+					}
+					else {
+						size_t phyaddr = get_page();
+						if (BUILTIN_EXPECT(!phyaddr, 0))
+							return -ENOMEM;
+
+						if (flags & MAP_USER_SPACE)
+							atomic_int32_inc(&task->user_usage);
+
+						*entry = phyaddr | bits;
+
+						memset(get_child_entry(entry), 0x00, PAGE_SIZE); // fill with zeros
+					}
+
+					// do "pre-order" traversal if no hugepage
+					if (!(*entry & PG_PSE)) {
+						int ret = traverse(level-1, get_child_entry(entry));
+						if (ret < 0)
+							return ret;
+					}
+				}
+				else { // PGT
+					if ((*entry & PG_PRESENT) && !(flags & MAP_REMAP))
+						return -EINVAL;
+
+					*entry = phyaddr | bits;
+
+					if (flags & MAP_USER_SPACE)
+						atomic_int32_inc(&task->user_usage);
+
+					if (flags & MAP_REMAP)
+						tlb_flush_one_page(entry_to_virt(entry, level));
+
+					phyaddr += PAGE_SIZE;
+				}
+			}
+		}
+
+		return 0;
@@ -457,92 +461,109 @@ size_t map_region(size_t viraddr, size_t phyaddr, uint32_t npages, uint32_t flags)
 	kprintf("map_region: map %u pages from %#lx to %#lx with flags: %#x\n", npages, viraddr, phyaddr, flags); // TODO: remove
 
+	// check assertions
+	if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
+		return 0;
 	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
 		return 0;
-	if (BUILTIN_EXPECT(!viraddr, 0))
-		return 0;
 
+	// calc page tree boundaries
+	int i;
+	for (i=0; i<PAGE_MAP_LEVELS; i++) {
+		first[i] = virt_to_entry(start, i);
+		last[i] = virt_to_entry(end - 1, i) + 1; // exclusive
+	}
+
 	// lock tables
-	if (viraddr < KERNEL_SPACE)
+	if (start < KERNEL_SPACE)
 		spinlock_lock(&kslock);
-	else
+	if (end >= KERNEL_SPACE)
 		spinlock_irqsave_lock(&task->page_lock);
 
-	int ret = page_iterate(start, end, cb, NULL);
+	int ret = traverse(PAGE_MAP_LEVELS-1, current_map);
 
 	// unlock tables
-	if (viraddr < KERNEL_SPACE)
+	if (start < KERNEL_SPACE)
 		spinlock_unlock(&kslock);
-	else
+	if (end >= KERNEL_SPACE)
 		spinlock_irqsave_unlock(&task->page_lock);
 
-	return (ret == 0) ? viraddr : 0;
+	return (ret) ? 0 : viraddr;
 }
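Editor's note: an assumed usage of the rewritten mapper, not taken from the diff. Passing viraddr = 0 still lets map_region() pick a free address from the VMA list (flagged above as behaviour to be deprecated), and the return value is the mapped virtual address or 0 on error:

/* map one freshly allocated physical frame into kernel space (hypothetical call) */
size_t phyaddr = get_page();
size_t viraddr = map_region(0, phyaddr, 1, MAP_KERNEL_SPACE);
if (!viraddr)
	kprintf("map_region failed\n");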
 int unmap_region(size_t viraddr, uint32_t npages)
 {
 	task_t* task = per_core(current_task);
+	page_entry_t* first[PAGE_MAP_LEVELS];
+	page_entry_t* last[PAGE_MAP_LEVELS];
 
 	size_t start = viraddr;
 	size_t end = start + npages * PAGE_SIZE;
 
 	kprintf("unmap_region: unmap %u pages from %#lx\n", npages, viraddr); // TODO: remove
 
-	int cb(page_entry_t* entry, int level) {
-		if (level) { // PGD, PDPT, PML4
-			page_map_t* map = (page_map_t*) entry_to_virt(entry, 0);
-			int used = 0;
-
-			int i;
-			for (i=0; i<PAGE_MAP_ENTRIES; i++) {
-				if (map->entries[i] & PG_PRESENT)
-					used++;
-			}
-
-			if (!used) {
-				*entry &= ~PG_PRESENT;
-				tlb_flush_one_page(entry_to_virt(entry, 0));
-
-				if (put_page(*entry & ~PAGE_FLAGS_MASK))
-					atomic_int32_dec(&task->user_usage);
-			}
-		}
-		else { // PGT
-			*entry = 0;
-			tlb_flush_one_page(entry_to_virt(entry, level));
-
-			if (viraddr >= KERNEL_SPACE)
-				atomic_int32_dec(&task->user_usage);
-		}
-
-		return 0;
-	}
+	/** @return number of page table entries which are present */
+	int traverse(int level, page_entry_t* entry) {
+		int used = 0;
+		page_entry_t* stop = entry + PAGE_MAP_ENTRIES;
+		for (; entry != stop; entry++) {
+			if (entry < last[level] && entry >= first[level]) {
+				if (level) { // PGD, PDPT, PML4
+					if ((*entry & PG_PRESENT) && !(*entry & PG_PSE)) {
+						// do "post-order" traversal if table is present and no hugepage
+						if (traverse(level-1, get_child_entry(entry)))
+							used++;
+						else { // child table is empty => delete it
+							*entry &= ~PG_PRESENT;
+							tlb_flush_one_page(entry_to_virt(entry, 0));
+
+							if (*entry & PG_USER) {
+								if (put_page(*entry & PAGE_MASK))
+									atomic_int32_dec(&task->user_usage);
+							}
+						}
+					}
+				}
+				else { // PGT
+					*entry &= ~PG_PRESENT;
+					tlb_flush_one_page(entry_to_virt(entry, level));
+
+					if (*entry & PG_USER)
+						atomic_int32_dec(&task->user_usage);
+				}
+			}
+			else {
+				if (*entry & PG_PRESENT)
+					used++;
+			}
+		}
+
+		return used;
+	}
 
+	// check assertions
+	if (BUILTIN_EXPECT(start < KERNEL_SPACE && end >= KERNEL_SPACE, 0))
+		return 0;
 	if (BUILTIN_EXPECT(!task || !task->page_map, 0))
 		return 0;
 
+	// calc page tree boundaries
+	int i;
+	for (i=0; i<PAGE_MAP_LEVELS; i++) {
+		first[i] = virt_to_entry(start, i);
+		last[i] = virt_to_entry(end - 1, i) + 1; // exclusive
+	}
+
 	// lock tables
-	if (viraddr < KERNEL_SPACE)
+	if (start < KERNEL_SPACE)
 		spinlock_lock(&kslock);
-	else
+	if (end >= KERNEL_SPACE)
 		spinlock_irqsave_lock(&task->page_lock);
 
-	int ret = page_iterate(start, end, NULL, cb);
+	traverse(PAGE_MAP_LEVELS-1, current_map);
 
 	// unlock tables
-	if (viraddr < KERNEL_SPACE)
+	if (start < KERNEL_SPACE)
 		spinlock_unlock(&kslock);
-	else
+	if (end > KERNEL_SPACE)
 		spinlock_irqsave_unlock(&task->page_lock);
 
-	return ret;
+	return 0;
 }
 static void pagefault_handler(struct state *s)
 
@@ -597,7 +618,8 @@ int arch_paging_init(void)
 	irq_install_handler(14, pagefault_handler);
 
 	// setup recursive paging
-	boot_pml4.entries[PAGE_MAP_ENTRIES-1] = (size_t) &boot_pml4 | PG_TABLE;
+	page_entry_t* boot_map = get_boot_page_map();
+	boot_map[PAGE_MAP_ENTRIES-1] = (size_t) boot_map | PG_TABLE;
 
 	/*
 	 * In longmode the kernel is already mapped into the kernel space (see entry64.asm)
@@ -658,3 +680,4 @@ int arch_paging_init(void)
 	return 0;
 }


@@ -90,8 +90,8 @@ typedef struct task {
 	atomic_int32_t user_usage;
 	/// locks access to all page maps with PG_USER flag set
 	spinlock_irqsave_t page_lock;
-	/// pointer to page directory (32bit) or page map level 4 (64bit) table respectively
-	page_map_t* page_map;
+	/// virtual address of page map for CR3
+	page_entry_t* page_map;
 	/// lock for the VMA_list
 	spinlock_t vma_lock;
 	/// list of VMAs