Before explaining how QEMU and KVM share data, let's first recall and walk through the virtual machine creation flow, mainly the creation flow on the QEMU user-space side. I have summarized it in the flow chart below.
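As a textual companion to that diagram, here is a minimal sketch of the canonical user-space sequence. This is not QEMU code: error handling is omitted and the real call chain is spread across kvm_init, kvm_init_vcpu and the vCPU thread; it only illustrates the ioctl/mmap steps that matter for this article.

/* Minimal sketch (not QEMU code) of the ioctl sequence that QEMU's
 * kvm_init()/kvm_init_vcpu() ultimately perform; error handling omitted. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
    int kvm_fd  = open("/dev/kvm", O_RDWR);           /* ioctls on this fd land in kvm_dev_ioctl */
    int vm_fd   = ioctl(kvm_fd, KVM_CREATE_VM, 0);    /* per-VM fd */
    int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);   /* per-vCPU fd */
    long sz     = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
    struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                               MAP_SHARED, vcpu_fd, 0);  /* region shared with KVM */
    /* ... set up registers and memory regions, then loop on ioctl(vcpu_fd, KVM_RUN, 0) ... */
    (void)run;
    return 0;
}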
The main logic covered today lives in the kvm_init_vcpu function. It again goes through the ioctl system call, this time with the KVM_GET_VCPU_MMAP_SIZE argument, which enters the KVM kernel side to learn how large the per-vCPU shared region is; a subsequent mmap then establishes the shared memory, which is how KVM and QEMU share data.
int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;
    .......
    /* Ask KVM how large the per-vCPU shared region is. */
    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        error_setg_errno(errp, -mmap_size,
                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
        goto err;
    }
    /* Map it from the vCPU fd; cpu->kvm_run now aliases KVM's kvm_run page. */
    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    ......
}
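The page mapped into cpu->kvm_run here is what the vCPU thread reads back after every KVM_RUN. Below is a simplified sketch of that consumer side, modeled loosely on what QEMU's kvm_cpu_exec does; handle_exit and its surrounding structure are illustrative, not QEMU code.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: consuming the shared kvm_run page after a guest exit.
 * 'handle_exit' is an illustrative helper, not a QEMU function. */
static void handle_exit(int vcpu_fd, struct kvm_run *run)
{
    ioctl(vcpu_fd, KVM_RUN, 0);        /* returns when the guest exits to user space */

    switch (run->exit_reason) {        /* written by KVM into the shared page */
    case KVM_EXIT_IO:
        /* The PIO payload lives in the second shared page; KVM reports it
         * as a byte offset from the start of struct kvm_run. */
        /* char *data = (char *)run + run->io.data_offset; */
        break;
    case KVM_EXIT_MMIO:
        /* run->mmio.phys_addr / run->mmio.data / run->mmio.len describe the access */
        break;
    default:
        break;
    }
}

On the kernel side, the size query itself is handled by kvm_dev_ioctl: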
static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
    long r = -EINVAL;

    switch (ioctl) {
    case KVM_GET_API_VERSION:
        if (arg)
            goto out;
        r = KVM_API_VERSION;
        break;
    case KVM_CREATE_VM:
        r = kvm_dev_ioctl_create_vm(arg);
        break;
    case KVM_CHECK_EXTENSION:
        r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
        break;
    case KVM_GET_VCPU_MMAP_SIZE:
        if (arg)
            goto out;
        r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
        r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
        r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
        break;
    case KVM_TRACE_ENABLE:
    case KVM_TRACE_PAUSE:
    case KVM_TRACE_DISABLE:
        r = -EOPNOTSUPP;
        break;
    default:
        return kvm_arch_dev_ioctl(filp, ioctl, arg);
    }
out:
    return r;
}
From the kvm_dev_ioctl code we can see that ioctl(KVM_GET_VCPU_MMAP_SIZE) may return one, two, or three pages. The first page holds struct kvm_run, through which QEMU and KVM exchange their basic data; the second page is used to store the data when the guest accesses an I/O port; the last page is used for coalesced MMIO.
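On x86 these extra pages sit at fixed page offsets inside the mapping, KVM_PIO_PAGE_OFFSET (1) and KVM_COALESCED_MMIO_PAGE_OFFSET (2), exported through the uapi headers. A sketch of how user space could locate them follows (assuming an x86 kernel built with CONFIG_KVM_MMIO); note that QEMU itself does not hard-code the coalesced-MMIO offset but obtains it from ioctl(KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO), which returns the same value.

#include <unistd.h>
#include <linux/kvm.h>    /* pulls in <asm/kvm.h> with the page-offset constants */

/* Sketch: locating the extra shared pages inside the mmap()'d region
 * (x86 layout; page 0 is struct kvm_run itself). */
static void *pio_page(struct kvm_run *run)
{
    return (char *)run + KVM_PIO_PAGE_OFFSET * getpagesize();
}

static struct kvm_coalesced_mmio_ring *mmio_ring(struct kvm_run *run)
{
    return (void *)((char *)run +
                    KVM_COALESCED_MMIO_PAGE_OFFSET * getpagesize());
}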
After kvm_init_vcpu has obtained the size of the shared memory, it calls mmap on the vCPU fd; the corresponding kernel handler is kvm_vcpu_mmap, shown below:
static struct file_operations kvm_vcpu_fops = {
    .release        = kvm_vcpu_release,
    .unlocked_ioctl = kvm_vcpu_ioctl,
    .mmap           = kvm_vcpu_mmap,
    .llseek         = noop_llseek,
    KVM_COMPAT(kvm_vcpu_compat_ioctl),
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
    /* No pages are populated here; faults are resolved lazily. */
    vma->vm_ops = &kvm_vcpu_vm_ops;
    return 0;
}

static const struct vm_operations_struct kvm_vcpu_vm_ops = {
    .fault = kvm_vcpu_fault,
};

static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
{
    struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
    struct page *page;

    if (vmf->pgoff == 0)
        /* page 0: the kvm_run structure */
        page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
    else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
        /* page 1 on x86: PIO data */
        page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef CONFIG_KVM_MMIO
    else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
        /* coalesced MMIO ring, shared by the whole VM */
        page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
    else
        return kvm_arch_vcpu_fault(vcpu, vmf);
    get_page(page);
    vmf->page = page;
    return 0;
}

vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
    return VM_FAULT_SIGBUS;
}
When QEMU calls mmap on the vCPU fd (an anonymous-inode file), only the virtual address range is reserved, and its operations are set to kvm_vcpu_vm_ops, which provides a single fault callback, kvm_vcpu_fault. kvm_vcpu_fault runs when QEMU touches the shared memory and triggers a page fault, and as the code shows, the kernel then wires the corresponding data page into that virtual address range. Accessing the first page reaches the kvm_run structure pointed to by the run member of struct kvm_vcpu; accessing the second page reaches the pio_data buffer inside the vCPU's arch member (of type kvm_vcpu_arch); accessing the third page reaches the coalesced_mmio_ring member of the VM-wide struct kvm, a page that is set up along the ioctl(KVM_CREATE_VM) code path. For any other offset, kvm_arch_vcpu_fault is called and returns VM_FAULT_SIGBUS.
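To close the loop on that third page, here is a hedged sketch of how the ring is drained on the user side: KVM appends entries at last while the guest runs, and user space consumes them from first, as QEMU's kvm_flush_coalesced_mmio_buffer does. replay_mmio_write, ring_max and the exact barrier handling are illustrative, not QEMU code.

#include <linux/kvm.h>

/* Illustrative helper: replay one deferred MMIO write (not a QEMU function). */
void replay_mmio_write(unsigned long long addr, const unsigned char *data,
                       unsigned int len);

/* Sketch: draining the coalesced-MMIO ring in the third shared page.
 * 'ring_max' stands in for KVM_COALESCED_MMIO_MAX. */
static void drain_coalesced_mmio(struct kvm_coalesced_mmio_ring *ring,
                                 unsigned int ring_max)
{
    while (ring->first != ring->last) {
        struct kvm_coalesced_mmio *ent = &ring->coalesced_mmio[ring->first];

        /* Replay the write that KVM deferred while the guest was running. */
        replay_mmio_write(ent->phys_addr, ent->data, ent->len);

        /* QEMU issues a write barrier before publishing the new index. */
        ring->first = (ring->first + 1) % ring_max;
    }
}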