metalsvm/kernel/tasks.c

/*
 * Copyright 2010 Stefan Lankes, Chair for Operating Systems,
 *                               RWTH Aachen University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is part of MetalSVM.
 */

#include <metalsvm/stdio.h>
#include <metalsvm/stdlib.h>
#include <metalsvm/string.h>
#include <metalsvm/errno.h>
#include <metalsvm/mmu.h>
#include <metalsvm/page.h>
#include <metalsvm/tasks.h>
#include <metalsvm/processor.h>
#include <metalsvm/spinlock.h>
#include <metalsvm/mailbox.h>
#include <metalsvm/syscall.h>
#include <metalsvm/fs.h>
#include <asm/elf.h>

/*
 * Per-core pointer to the task that is currently executing
 * (presumably the third macro argument is the initial value - confirm
 * against the definition of DEFINE_PER_CORE).
 */
DEFINE_PER_CORE(task_t*, current_task, NULL);
/*
 * Static table of all task control blocks. Every slot starts with the
 * status TASK_INVALID, which the functions in this file treat as "free".
 * NOTE(review): the positional initializer relies on the member order of
 * task_t - confirm against the struct definition whenever task_t changes.
 */
static task_t task_table[MAX_TASKS] = {[0 ... MAX_TASKS-1] = {0, TASK_INVALID, ATOMIC_INIT(0), \
			 SPINLOCK_INIT, NULL, SPINLOCK_INIT, NULL}};
/* Lock taken by the functions in this file while they inspect or change task_table */
static spinlock_t table_lock = SPINLOCK_INIT;

/*
 * Accessor for the assembly code: hands out the per-core
 * current_task pointer.
 */
task_t* get_current_task(void)
{
	task_t* running = per_core(current_task);

	return running;
}

int multitasking_init(void) {
	if (task_table[0].status == TASK_INVALID) {
		task_table[0].id = 0;
		task_table[0].status = TASK_RUNNING;
		atomic_int32_set(&task_table[0].user_usage, 0);
		mailbox_wait_msg_init(&task_table[0].inbox);
		memset(task_table[0].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS);
		per_core(current_task) = task_table+0;
		per_core(current_task)->pgd = get_boot_pgd();
		return 0;
	}

	return -ENOMEM;
}

static void wakeup_blocked_tasks(int result)
{
	wait_msg_t tmp = { per_core(current_task)->id, result };
	unsigned int i;

	spinlock_lock_irqsave(&table_lock);

	/* wake up blocked tasks */
	for(i=0; i<MAX_TASKS; i++) {
		if (per_core(current_task)->outbox[i]) {
			mailbox_wait_msg_post(per_core(current_task)->outbox[i], tmp);
			per_core(current_task)->outbox[i] = NULL;
		}
	}

	spinlock_unlock_irqsave(&table_lock);
}

/*
 * Terminate the current task: notify the registered waiters, release all
 * memory regions and the page directory, mark the task as finished and
 * hand the core over to the scheduler. This function never returns.
 *
 * arg is the return value that is reported to the waiting tasks.
 */
static void NORETURN do_exit(int arg)
{
	task_t* self = per_core(current_task);
	vma_t* region;

	kprintf("Terminate task: %u, return value %d\n", self->id, arg);

	wakeup_blocked_tasks(arg);

	spinlock_lock(&self->vma_lock);

	/* always detach the head of the list until nothing is left */
	for (region = self->vma_list; region != NULL; region = self->vma_list) {
		/* first the memory described by the region ... */
		kfree((void*) region->start, region->end - region->start + 1);
		/* ... then unlink and release the descriptor itself */
		self->vma_list = region->next;
		kfree((void*) region, sizeof(vma_t));
	}

	spinlock_unlock(&self->vma_lock);

	/* delete page directory and its page tables */
	drop_pgd();

	if (atomic_int32_read(&self->user_usage))
		kprintf("Memory leak! Task %d did not release %d pages\n", self->id, atomic_int32_read(&self->user_usage));

	self->status = TASK_FINISHED;
	reschedule();

	/* only reached if the scheduler did not switch to another task */
	kputs("Kernel panic: scheduler found no valid task\n");
	for (;;) {
		NOP8;
	}
}

/*
 * Terminates the current task, using the value delivered by
 * get_return_value() as its exit code. Never returns.
 */
void NORETURN leave_kernel_task(void)
{
	do_exit(get_return_value());
}

/*
 * Terminates the current task with the exit code arg by forwarding
 * to do_exit(). Never returns.
 */
void NORETURN sys_exit(int arg) {
	do_exit(arg);
}

/*
 * Terminates the current task with the fixed exit code -1 by forwarding
 * to do_exit(). Never returns.
 */
void NORETURN abort(void) {
	do_exit(-1);
}

/*
 * Create a new task in the first free slot (status TASK_INVALID) of the
 * task table and mark it as ready to run.
 *
 * id   optional; receives the id of the new task on success
 * ep   entry point of the new task, must not be NULL
 * arg  argument handed over to the entry point
 *
 * Returns the result of create_default_frame() on success, -EINVAL if no
 * entry point is given, -ENOMEM if no slot is free or the page directory
 * could not be created, and the (negative) error of create_default_frame()
 * if the initial frame could not be built.
 */
static int create_task(tid_t* id, entry_point_t ep, void* arg)
{
	int ret = -ENOMEM;
	unsigned int i;

	if (BUILTIN_EXPECT(!ep, 0))
		return -EINVAL;

	spinlock_lock_irqsave(&table_lock);

	for(i=0; i<MAX_TASKS; i++) {
		if (task_table[i].status != TASK_INVALID)
			continue;

		atomic_int32_set(&task_table[i].user_usage, 0);

		ret = create_pgd(task_table+i, 0);
		if (ret < 0) {
			ret = -ENOMEM;
			goto create_task_out;
		}

		task_table[i].id = i;
		spinlock_init(&task_table[i].vma_lock);
		task_table[i].vma_list = NULL;
		mailbox_wait_msg_init(&task_table[i].inbox);
		memset(task_table[i].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS);
		/* the creator is informed via its inbox when the new task terminates */
		task_table[i].outbox[per_core(current_task)->id] = &per_core(current_task)->inbox;

		ret = create_default_frame(task_table+i, ep, arg);
		if (BUILTIN_EXPECT(ret < 0, 0)) {
			/*
			 * Without a valid initial frame the task must never be
			 * scheduled => leave the slot in the state TASK_INVALID
			 * and do not publish its id.
			 * NOTE(review): the page directory from create_pgd() is
			 * not released here, because no matching release function
			 * for a foreign task is visible in this file.
			 */
			goto create_task_out;
		}

		/* publish the id only for a task that really exists */
		if (id)
			*id = i;

		task_table[i].status = TASK_READY;
		break;
	}

create_task_out:
	spinlock_unlock_irqsave(&table_lock);

	return ret;
}

/*
 * Duplicate the current task.
 *
 * A free slot of the task table receives a new page directory
 * (create_pgd() with the second argument 1), a copy of the VMA list of
 * the current task and the architecture specific state via arch_fork().
 *
 * Returns the id of the new task in the parent and 0 in the child.
 * -ENOMEM is returned if no slot is free, the page directory could not
 * be created or the VMA list could not be copied. A non-zero result of
 * arch_fork() is passed through to the caller.
 */
int sys_fork(void)
{
	int ret = -ENOMEM;
	unsigned int i;
	task_t* parent_task = per_core(current_task);
	vma_t** child;
	vma_t* parent;
	vma_t* tmp;

	spinlock_lock_irqsave(&table_lock);

	for(i=0; i<MAX_TASKS; i++) {
		if (task_table[i].status == TASK_INVALID) {
			atomic_int32_set(&task_table[i].user_usage, 0);

			ret = create_pgd(task_table+i, 1);
			if (ret < 0) {
				ret = -ENOMEM;
				goto create_task_out;
			}

			task_table[i].id = i;
			spinlock_init(&task_table[i].vma_lock);

			// copy VMA list
			// start with an empty list, also if the parent owns no VMA
			task_table[i].vma_list = NULL;
			child = &task_table[i].vma_list;
			parent = per_core(current_task)->vma_list;
			tmp = NULL;

			while(parent) {
				*child = (vma_t*) kmalloc(sizeof(vma_t));
				// check the allocated node (*child) and not the address
				// of the link (child), which is never NULL
				if (BUILTIN_EXPECT(!*child, 0)) {
					// out of memory => release the partial copy;
					// the list is NULL-terminated at this point
					while((tmp = task_table[i].vma_list) != NULL) {
						task_table[i].vma_list = tmp->next;
						kfree((void*) tmp, sizeof(vma_t));
					}
					// NOTE(review): the page directory created above is
					// not released, no matching function is visible here
					ret = -ENOMEM;
					goto create_task_out;
				}

				(*child)->start = parent->start;
				(*child)->end = parent->end;
				(*child)->type = parent->type;
				(*child)->prev = tmp;
				(*child)->next = NULL;

				parent = parent->next;
				tmp = *child;
				child = &((*child)->next);
			}

			mailbox_wait_msg_init(&task_table[i].inbox);
			memset(task_table[i].outbox, 0x00, sizeof(mailbox_wait_msg_t*)*MAX_TASKS);
			// the parent is informed via its inbox when the child terminates
			task_table[i].outbox[per_core(current_task)->id] = &per_core(current_task)->inbox;

			ret = arch_fork(task_table+i);

			// NOTE(review): the child leaves without releasing table_lock;
			// presumably the parent has already released it before the
			// child is scheduled for the first time - confirm
			if (parent_task != per_core(current_task))
				return 0;  // Oh, the new child! => leave function

			if (!ret) {
				task_table[i].status = TASK_READY;
				ret = i;
			}
			break;
		}
	}

create_task_out:
	spinlock_unlock_irqsave(&table_lock);

	return ret;
}

/*
 * Public wrapper around create_task(): creates a task that starts at the
 * entry point ep with the argument arg. If id is not NULL, it receives the
 * id of the new task. The result of create_task() is returned unchanged.
 */
int create_kernel_task(tid_t* id, entry_point_t ep, void* arg)
{
	return create_task(id, ep, arg);
}

/*
 * Entry point of a task created by create_user_task(): loads the ELF
 * executable behind the file system node arg into the address space of
 * the current task, creates a user-level stack and jumps to the entry
 * point of the executable.
 *
 * arg  pointer to the vfs_node_t of the executable
 *
 * Returns -EINVAL if no node is given or the file is not a valid
 * 32 bit little-endian i386 executable, and -ENOMEM if a segment or the
 * stack could not be mapped or the executable defines no stack.
 * The final "return 0" is only reached if jump_to_user_code() returns.
 */
static int STDCALL user_entry(void* arg)
{
	uint32_t i, addr, npages, flags, stack = 0;
	vfs_node_t* node = (vfs_node_t*) arg;
	elf_header_t header;
	elf_program_header_t prog_header;
	//elf_section_header_t sec_header;

	if (!node)
		return -EINVAL;

	// a failed read would leave the header uninitialized => do not interpret it
	if (BUILTIN_EXPECT(read_fs(node, (uint8_t*)&header, sizeof(elf_header_t), 0) == 0, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.ident.magic != ELF_MAGIC, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.type != ELF_ET_EXEC, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.machine != ELF_EM_386, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.ident._class != ELF_CLASS_32, 0))
		goto invalid;

	if (BUILTIN_EXPECT(header.ident.data != ELF_DATA_2LSB, 0))
		goto invalid;

	// the entry point has to be located above the kernel space
	if (header.entry <= KERNEL_SPACE)
		goto invalid;

	// interpret program header table
	for (i=0; i<header.ph_entry_count; i++) {
		if (read_fs(node, (uint8_t*)&prog_header, sizeof(elf_program_header_t), header.ph_offset+i*header.ph_entry_size) == 0) {
			kprintf("Could not read programm header!\n");
			continue;
		}

		switch(prog_header.type)
		{
		case  ELF_PT_LOAD:  // load program segment
			if (!prog_header.virt_addr)
				continue;

			// the file image has to fit into the memory image, otherwise
			// read_fs would write behind the pages mapped below
			if (BUILTIN_EXPECT(prog_header.file_size > prog_header.mem_size, 0))
				goto invalid;

			// round the size of the segment up to whole pages
			npages = (prog_header.mem_size / PAGE_SIZE);
			if (prog_header.mem_size % PAGE_SIZE)
				npages++;

			addr = get_pages(npages);

			flags = MAP_USER_SPACE;
			if (prog_header.flags & PF_X)
				flags |= MAP_CODE;

			// map page frames in the address space of the current task
			if (!map_region(prog_header.virt_addr, addr, npages, flags)) {
				kprintf("Could not map 0x%x at 0x%x\n", addr, prog_header.virt_addr);
				// the region is not accessible => do not touch it
				return -ENOMEM;
			}

			// clear pages
			memset((void*) prog_header.virt_addr, 0, npages*PAGE_SIZE);

			// load program
			read_fs(node, (uint8_t*)prog_header.virt_addr, prog_header.file_size, prog_header.offset);

			flags = VMA_CACHEABLE;
			if (prog_header.flags & PF_R)
				flags |= VMA_READ;
			if (prog_header.flags & PF_W)
				flags |= VMA_WRITE;
			if (prog_header.flags & PF_X)
				flags |= VMA_EXECUTE;
			vma_add(per_core(current_task), prog_header.virt_addr, prog_header.virt_addr+npages*PAGE_SIZE-1, flags);

			// the segment was loaded with write access, restrict it afterwards
			if (!(prog_header.flags & PF_W))
				change_page_permissions(prog_header.virt_addr, prog_header.virt_addr+npages*PAGE_SIZE-1, flags);
			break;

		case ELF_PT_GNU_STACK: // Indicates stack executability
			// create user-level stack
			npages = DEFAULT_STACK_SIZE / PAGE_SIZE;
			if (DEFAULT_STACK_SIZE % PAGE_SIZE)
				npages++;

			addr = get_pages(npages);
			stack = header.entry*2; // virtual address of the stack

			if (!map_region(stack, addr, npages, MAP_USER_SPACE)) {
				kprintf("Could not map stack at 0x%x\n", stack);
				return -ENOMEM;
			}
			memset((void*) stack, 0, npages*PAGE_SIZE);

			// create vma regions for the user-level stack
			flags = VMA_CACHEABLE;
			if (prog_header.flags & PF_R)
				flags |= VMA_READ;
			if (prog_header.flags & PF_W)
				flags |= VMA_WRITE;
			if (prog_header.flags & PF_X)
				flags |= VMA_EXECUTE;
			vma_add(per_core(current_task), stack, stack+npages*PAGE_SIZE-1, flags);
			break;

		default: // all other program header types are ignored
			break;
		}
	}

#if 0
	// interpret section header table
	for (i=0; i<header.sh_entry_count; i++) {
		if (read_fs(node, (uint8_t*)&sec_header, sizeof(elf_section_header_t), header.sh_offset+i*header.sh_entry_size) == 0) {
			kprintf("Could not read section header!\n");
			continue;
		}

		// TODO: interpret section header
	}
#endif

	if (BUILTIN_EXPECT(!stack, 0)) {
		kprintf("Stack is missing!\n");
		return -ENOMEM;
	}

	// start 64 bytes below the end of the stack region
	jump_to_user_code(header.entry, stack+DEFAULT_STACK_SIZE-64);

	return 0;

invalid:
	kprintf("Invalid executable!\n");

	return -EINVAL;
}

/*
 * Create a task that executes the file fname as user-level program.
 *
 * id     optional; handed over to create_task()
 * fname  name of the executable, resolved with findnode_fs()
 *
 * The parameters sz, argc and argv are currently not evaluated.
 *
 * Returns -EINVAL if fname does not refer to a node of the type FS_FILE,
 * otherwise the result of create_task().
 */
int create_user_task(tid_t* id, size_t sz, const char* fname, int argc, char** argv)
{
	vfs_node_t* file = findnode_fs((char*) fname);

	/* unknown name */
	if (!file)
		return -EINVAL;

	/* only regular files can be executed */
	if (file->type != FS_FILE)
		return -EINVAL;

	return create_task(id, user_entry, file);
}

/*
 * Fetch the next wait message from the inbox of the current task.
 *
 * result  optional; receives the result value stored in the message
 *
 * Returns the id stored in the message, or -EINVAL if the current
 * task has the status TASK_IDLE.
 */
tid_t wait(int32_t* result)
{
	task_t* self = per_core(current_task);
	wait_msg_t msg = { -1, -1};

	/*
	 * an idle task has to stay runnable all the time and
	 * therefore must not wait for another task
	 */
	if (BUILTIN_EXPECT(self->status == TASK_IDLE, 0))
		return -EINVAL;

	mailbox_wait_msg_fetch(&self->inbox, &msg);

	if (result != NULL)
		*result = msg.result;

	return msg.id;
}

/*
 * Move the blocked task id back into the state TASK_READY.
 *
 * Returns 0 on success and -EINVAL if id is not a valid index of the
 * task table or the task is not in the state TASK_BLOCKED.
 */
int wakeup_task(tid_t id)
{
	int ret = -EINVAL;

	/* an id outside of the task table must never be used as index */
	if (BUILTIN_EXPECT((unsigned int) id >= MAX_TASKS, 0))
		return ret;

	/* avoid nested locking */
	spinlock_lock_irqsave(&table_lock);

	if (task_table[id].status != TASK_BLOCKED) {
		kprintf("Task %d is not blocked!\n", id);
	} else {
		task_table[id].status = TASK_READY;
		ret = 0;
	}

	spinlock_unlock_irqsave(&table_lock);

	return ret;
}

/*
 * Move the task id into the state TASK_BLOCKED.
 *
 * Returns 0 on success and -EINVAL if id is not a valid index of the
 * task table or the task is neither in the state TASK_RUNNING nor in
 * the state TASK_READY.
 */
int block_task(tid_t id)
{
	int ret = -EINVAL;

	/* an id outside of the task table must never be used as index */
	if (BUILTIN_EXPECT((unsigned int) id >= MAX_TASKS, 0))
		return ret;

	spinlock_lock_irqsave(&table_lock);

	if ((task_table[id].status == TASK_RUNNING) || (task_table[id].status == TASK_READY)) {
		task_table[id].status = TASK_BLOCKED;
		ret = 0;
	} else kprintf("Unable to block task %d!\n", id);

	spinlock_unlock_irqsave(&table_lock);

	return ret;
}

void scheduler(void)
{
	unsigned int i;
	unsigned int new_id;

	spinlock_lock(&table_lock);

	/* signalize that this task could be reused */
	if (per_core(current_task)->status == TASK_FINISHED)
		per_core(current_task)->status = TASK_INVALID;

	for(i=1, new_id=(per_core(current_task)->id + 1) % MAX_TASKS;
		i<MAX_TASKS; i++, new_id=(new_id+1) % MAX_TASKS)
	{
		if (task_table[new_id].status == TASK_READY) {
			if (per_core(current_task)->status == TASK_RUNNING)
				per_core(current_task)->status = TASK_READY;
			task_table[new_id].status = TASK_RUNNING;

			per_core(current_task) = task_table+new_id;
			goto get_task_out;
		}
	}

	if ((per_core(current_task)->status == TASK_RUNNING) || (per_core(current_task)->status == TASK_IDLE))
		goto get_task_out;

	/*
	 * we switch to the idle task, if the current task terminates
	 * and no other is ready
	 */
	for(i=0; i<MAX_TASKS; i++) {
		if (task_table[i].status == TASK_IDLE) {
			per_core(current_task) = task_table+i;
			goto get_task_out;
		}
	}

get_task_out:
	spinlock_unlock(&table_lock);
}