Shared Data Between QEMU and KVM

Posted by lwk on May 26, 2022

Before getting into the data shared between QEMU and KVM, let's first recall and organize the VM creation flow, mainly the user-space creation flow inside QEMU. I have summarized it in the flow chart below.

(Figure: flow chart of the QEMU user-space VM creation flow)

The logic discussed today lives mainly in the kvm_init_vcpu function. It issues the ioctl system call with the KVM_GET_VCPU_MMAP_SIZE argument, entering KVM kernel mode to learn how large the per-VCPU shared region is, and then maps that region with mmap, establishing the shared memory through which KVM and QEMU exchange data.


int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;
.......

    /* Ask KVM how large the per-VCPU shared region is. */
    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        error_setg_errno(errp, -mmap_size,
                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
        goto err;
    }

    /* Map the shared region on the VCPU fd; page 0 holds struct kvm_run. */
    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
......
}
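
To make the flow above concrete, here is a minimal standalone sketch, not QEMU code, only the documented KVM API, assuming a Linux host where /dev/kvm exists and is accessible: open the device, create a VM and a VCPU, query KVM_GET_VCPU_MMAP_SIZE, and mmap the VCPU fd. Error handling is abbreviated.

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
    if (kvm < 0) {
        perror("open /dev/kvm");
        return EXIT_FAILURE;
    }

    int vm = ioctl(kvm, KVM_CREATE_VM, 0);       /* VM fd */
    int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);    /* VCPU fd */

    /* Ask KVM how large the per-VCPU shared region is (1 to 3 pages). */
    long mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (vm < 0 || vcpu < 0 || mmap_size < 0) {
        perror("kvm setup");
        return EXIT_FAILURE;
    }

    /* Map the region on the VCPU fd; page 0 is struct kvm_run. */
    struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                               MAP_SHARED, vcpu, 0);
    if (run == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }

    /* The first touch of this page faults into kvm_vcpu_fault in the kernel. */
    printf("shared region: %ld bytes, exit_reason=%u\n",
           mmap_size, run->exit_reason);
    return 0;
}

On the kernel side, the KVM_GET_VCPU_MMAP_SIZE ioctl is handled by kvm_dev_ioctl: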

static long kvm_dev_ioctl(struct file *filp,
              unsigned int ioctl, unsigned long arg) 
{
    long r = -EINVAL;

    switch (ioctl) {
    case KVM_GET_API_VERSION:
        if (arg)
            goto out; 
        r = KVM_API_VERSION;
        break;
    case KVM_CREATE_VM:
        r = kvm_dev_ioctl_create_vm(arg);
        break;
    case KVM_CHECK_EXTENSION:
        r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
        break;
    case KVM_GET_VCPU_MMAP_SIZE:
        if (arg)
            goto out; 
        r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
        r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
        r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
        break;
    case KVM_TRACE_ENABLE:
    case KVM_TRACE_PAUSE:
    case KVM_TRACE_DISABLE:
        r = -EOPNOTSUPP;
        break;
    default:
        return kvm_arch_dev_ioctl(filp, ioctl, arg);
    }
out:
    return r;
}

As the kvm_dev_ioctl code shows, ioctl(KVM_GET_VCPU_MMAP_SIZE) can report a size of one, two, or three pages. The first page holds struct kvm_run, the structure through which QEMU and KVM exchange basic data; the second page stores the data involved when the guest accesses I/O ports; and the last page is used for coalesced MMIO.
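
As a sketch of how user space consumes those pages (again not QEMU's code, only the documented KVM pattern), the helper below runs the VCPU once and handles the two exit types mentioned above: exit_reason sits in the kvm_run page, and for KVM_EXIT_IO the port data lives in the pio data page, reached via run->io.data_offset from the start of the mapping.

#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Sketch only: vcpu_fd and run are assumed to come from a mapping like the
 * one kvm_init_vcpu sets up above. */
static void run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
    if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
        perror("KVM_RUN");
        return;
    }
    switch (run->exit_reason) {
    case KVM_EXIT_IO:
        /* io.data_offset points past struct kvm_run, into the pio data page. */
        printf("IO port 0x%x, first byte 0x%x\n", run->io.port,
               *((uint8_t *)run + run->io.data_offset));
        break;
    case KVM_EXIT_MMIO:
        printf("MMIO at guest physical address 0x%llx\n",
               (unsigned long long)run->mmio.phys_addr);
        break;
    default:
        printf("exit_reason %u\n", run->exit_reason);
        break;
    }
}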

Once kvm_init_vcpu has obtained the size of the shared memory, it calls mmap on the VCPU fd. The corresponding kernel handler is kvm_vcpu_mmap, shown below:

static struct file_operations kvm_vcpu_fops = {
    .release        = kvm_vcpu_release,
    .unlocked_ioctl = kvm_vcpu_ioctl,
    .mmap           = kvm_vcpu_mmap,
    .llseek     = noop_llseek,
    KVM_COMPAT(kvm_vcpu_compat_ioctl),
};
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
    vma->vm_ops = &kvm_vcpu_vm_ops;
    return 0;
}
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
    .fault = kvm_vcpu_fault,
};

static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
{   
    struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
    struct page *page;

    if (vmf->pgoff == 0)
        page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
    else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
        page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef CONFIG_KVM_MMIO
    else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
        page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
    else
        return kvm_arch_vcpu_fault(vcpu, vmf);
    get_page(page);
    vmf->page = page;
    return 0;
}

vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
    return VM_FAULT_SIGBUS;
}

When QEMU mmaps the VCPU fd (an anonymous file), the kernel only allocates the virtual address range and sets its operations to kvm_vcpu_vm_ops, whose single callback is the fault handler kvm_vcpu_fault. That handler runs when QEMU touches the shared memory and takes a page fault, and, as the code shows, this is where the kernel attaches the backing data to the virtual address range.

A fault on the first page resolves to the run member (of type struct kvm_run) in struct kvm_vcpu; a fault on the second page resolves to the pio_data page reached through the arch member (of type struct kvm_vcpu_arch); and a fault on the third page resolves to the coalesced_mmio_ring member of the VM-wide struct kvm, a page that is initialized on the ioctl(KVM_CREATE_VM) path. Any other offset falls through to kvm_arch_vcpu_fault, which returns VM_FAULT_SIGBUS.
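
The same technique, registering only a .fault callback so that kernel-allocated pages show up in a user mapping on first touch, is not specific to KVM. Below is a minimal sketch of that pattern as a hypothetical misc-device module (all demo_* names are invented for illustration): it shares one zeroed kernel page with any process that mmaps the device, taking a page reference in the fault handler exactly as kvm_vcpu_fault does.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/gfp.h>

static struct page *demo_page;               /* the single page to share */

static vm_fault_t demo_fault(struct vm_fault *vmf)
{
    if (vmf->pgoff != 0)
        return VM_FAULT_SIGBUS;              /* only one page is backed */
    get_page(demo_page);                     /* take a reference, like kvm_vcpu_fault */
    vmf->page = demo_page;
    return 0;
}

static const struct vm_operations_struct demo_vm_ops = {
    .fault = demo_fault,
};

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
    vma->vm_ops = &demo_vm_ops;              /* defer page insertion to the fault path */
    return 0;
}

static const struct file_operations demo_fops = {
    .owner = THIS_MODULE,
    .mmap  = demo_mmap,
};

static struct miscdevice demo_dev = {
    .minor = MISC_DYNAMIC_MINOR,
    .name  = "demo_shared_page",
    .fops  = &demo_fops,
};

static int __init demo_init(void)
{
    demo_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
    if (!demo_page)
        return -ENOMEM;
    return misc_register(&demo_dev);
}

static void __exit demo_exit(void)
{
    misc_deregister(&demo_dev);
    __free_page(demo_page);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");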