In the virtualization world, KVM has become the dominant player, and its use of hardware virtualization in particular has greatly improved virtual machine performance. This article covers the initialization of the KVM modules. KVM is delivered as kernel modules: on a virtualization host you will find two of them, kvm and kvm_intel (kvm_amd on AMD hardware). This article focuses on kvm_intel.
The KVM source lives in two places in the Linux kernel tree: virt/ and arch/x86/kvm. The virt/ directory holds the common code shared across CPU architectures, which is what kvm.ko is built from; arch/x86/kvm holds the architecture-specific code for x86. Within x86 there are several implementations, Intel and AMD being the two typical ones: arch/x86/kvm/vmx.c implements Intel VT-x and arch/x86/kvm/svm.c implements AMD-V, and these are where kvm_intel.ko and kvm_amd.ko come from.
Let's start with the source in arch/x86/kvm/vmx.c:
static int __init vmx_init(void)
{
    int r;

#if IS_ENABLED(CONFIG_HYPERV)
    /*
     * Enlightened VMCS usage should be recommended and the host needs
     * to support eVMCS v1 or above. We can also disable eVMCS support
     * with module parameter.
     */
    if (enlightened_vmcs &&
        ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
        (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
        KVM_EVMCS_VERSION) {
        int cpu;

        /* Check that we have assist pages on all online CPUs */
        for_each_online_cpu(cpu) {
            if (!hv_get_vp_assist_page(cpu)) {
                enlightened_vmcs = false;
                break;
            }
        }

        if (enlightened_vmcs) {
            pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
            static_branch_enable(&enable_evmcs);
        }

        if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
            vmx_x86_ops.enable_direct_tlbflush
                = hv_enable_direct_tlbflush;
    } else {
        enlightened_vmcs = false;
    }
#endif

    r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
                 __alignof__(struct vcpu_vmx), THIS_MODULE);
    if (r)
        return r;

    /*
     * Must be called after kvm_init() so enable_ept is properly set
     * up. Hand the parameter mitigation value in which was stored in
     * the pre module init parser. If no parameter was given, it will
     * contain 'auto' which will be turned into the default 'cond'
     * mitigation mode.
     */
    r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
    if (r) {
        vmx_exit();
        return r;
    }

#ifdef CONFIG_KEXEC_CORE
    rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                       crash_vmclear_local_loaded_vmcss);
#endif
    vmx_check_vmcs12_offsets();

    return 0;
}
module_init(vmx_init);
vmx_init calls kvm_init, which is defined in virt/kvm/kvm_main.c:
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
             struct module *module)
{
    int r;
    int cpu;

    r = kvm_arch_init(opaque);
    if (r)
        goto out_fail;

    /*
     * kvm_arch_init makes sure there's at most one caller
     * for architectures that support multiple implementations,
     * like intel and amd on x86.
     * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
     * conflicts in case kvm is already setup for another implementation.
     */
    r = kvm_irqfd_init();
    if (r)
        goto out_irqfd;

    if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
        r = -ENOMEM;
        goto out_free_0;
    }

    r = kvm_arch_hardware_setup();
    if (r < 0)
        goto out_free_1;

    for_each_online_cpu(cpu) {
        smp_call_function_single(cpu, check_processor_compat, &r, 1);
        if (r < 0)
            goto out_free_2;
    }

    r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
                                  kvm_starting_cpu, kvm_dying_cpu);
    if (r)
        goto out_free_2;
    register_reboot_notifier(&kvm_reboot_notifier);

    /* A kmem cache lets us meet the alignment requirements of fx_save. */
    if (!vcpu_align)
        vcpu_align = __alignof__(struct kvm_vcpu);
    kvm_vcpu_cache =
        kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
                                   SLAB_ACCOUNT,
                                   offsetof(struct kvm_vcpu, arch),
                                   sizeof_field(struct kvm_vcpu, arch),
                                   NULL);
    if (!kvm_vcpu_cache) {
        r = -ENOMEM;
        goto out_free_3;
    }

    r = kvm_async_pf_init();
    if (r)
        goto out_free;

    kvm_chardev_ops.owner = module;
    kvm_vm_fops.owner = module;
    kvm_vcpu_fops.owner = module;

    r = misc_register(&kvm_dev);
    if (r) {
        pr_err("kvm: misc device register failed\n");
        goto out_unreg;
    }

    register_syscore_ops(&kvm_syscore_ops);

    kvm_preempt_ops.sched_in = kvm_sched_in;
    kvm_preempt_ops.sched_out = kvm_sched_out;

    kvm_init_debug();

    r = kvm_vfio_ops_init();
    WARN_ON(r);

    return 0;

out_unreg:
    kvm_async_pf_deinit();
out_free:
    kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
    unregister_reboot_notifier(&kvm_reboot_notifier);
    cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
    kvm_arch_hardware_unsetup();
out_free_1:
    free_cpumask_var(cpus_hardware_enabled);
out_free_0:
    kvm_irqfd_exit();
out_irqfd:
    kvm_arch_exit();
out_fail:
    return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
Here vmx_x86_ops holds the callbacks that implement Intel VT-x: hardware detection, VM and vCPU creation, register accesses, and so on. The second argument to kvm_init is the size of the VMX implementation's vCPU structure (struct vcpu_vmx).
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
    .cpu_has_kvm_support = cpu_has_kvm_support,
    .disabled_by_bios = vmx_disabled_by_bios,
    .hardware_setup = hardware_setup,
    .hardware_unsetup = hardware_unsetup,
    .check_processor_compatibility = vmx_check_processor_compat,
    .hardware_enable = hardware_enable,
    .hardware_disable = hardware_disable,
    .cpu_has_accelerated_tpr = report_flexpriority,
    .has_emulated_msr = vmx_has_emulated_msr,

    .vm_init = vmx_vm_init,
    .vm_alloc = vmx_vm_alloc,
    .vm_free = vmx_vm_free,

    /* only part of the structure is shown here */
};
kvm_init performs several key steps. kvm_arch_init handles the architecture-specific initialization. kvm_irqfd_init sets up the irqfd machinery, which mainly means creating a cleanup workqueue. kvm_arch_hardware_setup creates the data structures closely tied to running KVM and initializes a number of hardware features. check_processor_compat is then invoked on every online CPU to verify that all CPUs expose consistent features, and kvm_reboot_notifier is registered so KVM is notified when the system reboots. kmem_cache_create_usercopy creates a kmem cache for vCPU structures and stores it in kvm_vcpu_cache, so that vCPU allocations are fast later on. The owner of three file_operations structures is set to the current module: kvm_chardev_ops is the file_operations behind fds for the /dev/kvm device, kvm_vm_fops the file_operations behind fds representing created virtual machines, and kvm_vcpu_fops the file_operations behind fds representing created vCPUs. misc_register then registers the misc device kvm_dev, whose file_operations is kvm_chardev_ops. Finally, the sched_in and sched_out hooks of kvm_preempt_ops are set; they are invoked whenever the thread running a vCPU is preempted or scheduled back in.
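To make that last point concrete, here is a minimal sketch of how those two hooks take effect, simplified from virt/kvm/kvm_main.c (exact code varies across kernel versions): each vCPU carries a preempt_notifier that points at kvm_preempt_ops and is registered while the vCPU is loaded.

/* Simplified sketch of the preempt-notifier wiring in kvm_main.c
 * (details differ between kernel versions). */
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
    struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

    /* The vCPU thread is scheduled back in: reload per-CPU
     * virtualization state (e.g. the VMCS) onto this CPU. */
    kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
    struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

    /* The vCPU thread is being preempted: save its state away. */
    kvm_arch_vcpu_put(vcpu);
}

/* The notifier is registered whenever a vCPU is loaded: */
void vcpu_load(struct kvm_vcpu *vcpu)
{
    int cpu = get_cpu();

    preempt_notifier_register(&vcpu->preempt_notifier);
    kvm_arch_vcpu_load(vcpu, cpu);
    put_cpu();
}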
Let's now analyze these core functions of kvm_init one by one, starting with kvm_arch_init, which performs the architecture-specific initialization:
int kvm_arch_init(void *opaque)
{
    int r;
    struct kvm_x86_ops *ops = opaque;

    if (kvm_x86_ops) {
        printk(KERN_ERR "kvm: already loaded the other module\n");
        r = -EEXIST;
        goto out;
    }

    if (!ops->cpu_has_kvm_support()) {
        printk(KERN_ERR "kvm: no hardware support\n");
        r = -EOPNOTSUPP;
        goto out;
    }
    if (ops->disabled_by_bios()) {
        printk(KERN_ERR "kvm: disabled by bios\n");
        r = -EOPNOTSUPP;
        goto out;
    }

    /*
     * KVM explicitly assumes that the guest has an FPU and
     * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
     * vCPU's FPU state as a fxregs_state struct.
     */
    if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
        printk(KERN_ERR "kvm: inadequate fpu\n");
        r = -EOPNOTSUPP;
        goto out;
    }

    r = -ENOMEM;
    x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
                                      __alignof__(struct fpu), SLAB_ACCOUNT,
                                      NULL);
    if (!x86_fpu_cache) {
        printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
        goto out;
    }

    shared_msrs = alloc_percpu(struct kvm_shared_msrs);
    if (!shared_msrs) {
        printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
        goto out_free_x86_fpu_cache;
    }

    r = kvm_mmu_module_init();
    if (r)
        goto out_free_percpu;

    kvm_x86_ops = ops;

    kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
                          PT_DIRTY_MASK, PT64_NX_MASK, 0,
                          PT_PRESENT_MASK, 0, sme_me_mask);
    kvm_timer_init();

    perf_register_guest_info_callbacks(&kvm_guest_cbs);

    if (boot_cpu_has(X86_FEATURE_XSAVE))
        host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);

    kvm_lapic_init();
    if (pi_inject_timer == -1)
        pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
#ifdef CONFIG_X86_64
    pvclock_gtod_register_notifier(&pvclock_gtod_notifier);

    if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
        set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif

    return 0;

out_free_percpu:
    free_percpu(shared_msrs);
out_free_x86_fpu_cache:
    kmem_cache_destroy(x86_fpu_cache);
out:
    return r;
}
During initialization, kvm_arch_init ensures that only one KVM implementation module can be loaded into the kernel; the implementation's ops structure is stored in the global variable kvm_x86_ops. The opaque argument passed to kvm_arch_init here is vmx_x86_ops, carrying the callbacks of the Intel implementation. kvm_arch_init invokes the implementation-specific callbacks cpu_has_kvm_support and disabled_by_bios to check whether the CPU supports VMX and whether it has been disabled by the BIOS: the former is determined from the result of the CPUID instruction, the latter by reading an MSR.
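For reference, the two checks boil down to roughly the following, a simplified sketch: the real vmx.c also handles the SMX/TXT cases, and newer kernels rename the FEATURE_CONTROL_* constants to FEAT_CTL_*.

/* Simplified sketch of the VT-x availability checks in vmx.c. */
static __init int cpu_has_kvm_support(void)
{
    /* CPUID.1:ECX.VMX[bit 5] advertises VMX support. */
    return boot_cpu_has(X86_FEATURE_VMX);
}

static __init int vmx_disabled_by_bios(void)
{
    u64 msr;

    rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
    /* If the BIOS locked the MSR without allowing VMXON outside SMX,
     * VT-x is disabled in firmware and KVM cannot use it. (If the
     * MSR is not locked yet, KVM can set and lock it itself later.) */
    return (msr & FEATURE_CONTROL_LOCKED) &&
           !(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
}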
The second important function in kvm_init is kvm_arch_hardware_setup, whose main job is to invoke the hardware_setup callback of vmx_x86_ops:
int kvm_arch_hardware_setup(void)
{
    int r;

    r = kvm_x86_ops->hardware_setup();
    if (r != 0)
        return r;

    if (kvm_has_tsc_control) {
        /*
         * Make sure the user can only configure tsc_khz values that
         * fit into a signed integer.
         * A min value is not calculated because it will always
         * be 1 on all machines.
         */
        u64 max = min(0x7fffffffULL,
                      __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
        kvm_max_guest_tsc_khz = max;

        kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
    }

    if (boot_cpu_has(X86_FEATURE_XSAVES))
        rdmsrl(MSR_IA32_XSS, host_xss);

    kvm_init_msr_list();
    return 0;
}
static __init int hardware_setup(void)
{
    unsigned long host_bndcfgs;
    struct desc_ptr dt;
    int r, i;

    rdmsrl_safe(MSR_EFER, &host_efer);

    store_idt(&dt);
    host_idt_base = dt.address;

    if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
        return -EIO;

    if (!cpu_has_vmx_ept() ||
        !cpu_has_vmx_ept_4levels() ||
        !cpu_has_vmx_ept_mt_wb() ||
        !cpu_has_vmx_invept_global())
        enable_ept = 0;

    r = alloc_kvm_area();
    if (r)
        nested_vmx_hardware_unsetup();
    return r;
}
static __init int alloc_kvm_area(void)
{
    int cpu;

    for_each_possible_cpu(cpu) {
        struct vmcs *vmcs;

        vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
        if (!vmcs) {
            free_kvm_area();
            return -ENOMEM;
        }

        if (static_branch_unlikely(&enable_evmcs))
            vmcs->hdr.revision_id = vmcs_config.revision_id;

        per_cpu(vmxarea, cpu) = vmcs;
    }
    return 0;
}
kvm_arch_hardware_setup mainly calls Intel's hardware_setup, found in arch/x86/kvm/vmx/vmx.c. hardware_setup fills in the VMCS configuration, sets a number of global variables according to CPU features (for example whether EPT is supported), and calls alloc_kvm_area to allocate one VMCS region per physical CPU, storing it in the percpu variable vmxarea.
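These per-CPU regions are not used as ordinary guest VMCSs: they serve as the VMXON regions consumed later when KVM enables virtualization on each CPU. Below is a trimmed sketch of that consumer, based on hardware_enable() in vmx.c (error handling and the feature-control MSR checks are omitted here):

/* Trimmed sketch of hardware_enable() in vmx.c, showing how the
 * vmxarea allocated above is consumed (error paths omitted). */
static int hardware_enable(void)
{
    int cpu = raw_smp_processor_id();
    u64 phys_addr = __pa(per_cpu(vmxarea, cpu));

    if (cr4_read_shadow() & X86_CR4_VMXE)
        return -EBUSY;              /* VMX is already on */

    cr4_set_bits(X86_CR4_VMXE);     /* CR4.VMXE must be set before VMXON */

    /* VMXON takes the physical address of the per-CPU region
     * that alloc_kvm_area() stored in vmxarea. */
    asm volatile ("vmxon %0" : : "m"(phys_addr) : "memory", "cc");
    return 0;
}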
Another important function used by kvm_init is check_processor_compat:
static void check_processor_compat(void *rtn)
{
    *(int *)rtn = kvm_arch_check_processor_compat();
}

int kvm_arch_check_processor_compat(void)
{
    return kvm_x86_ops->check_processor_compatibility();
}

/* On Intel this resolves to vmx_check_processor_compat in vmx.c: */
static int __init vmx_check_processor_compat(void)
{
    struct vmcs_config vmcs_conf;
    struct vmx_capability vmx_cap;

    if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
        return -EIO;
    if (nested)
        nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
                                   enable_apicv);
    if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
        printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
               smp_processor_id());
        return -EIO;
    }
    return 0;
}
Here a vmcs_conf is constructed on every physical CPU and compared against the global vmcs_config, ensuring that all physical CPUs have an identical VMCS configuration. Only then can vCPUs be scheduled across physical CPUs without errors.
The last task of kvm_init is to create the misc device /dev/kvm; the device and its file operations are as follows:
static struct file_operations kvm_chardev_ops = {
    .unlocked_ioctl = kvm_dev_ioctl,
    .llseek = noop_llseek,
    KVM_COMPAT(kvm_dev_ioctl),
};

static struct miscdevice kvm_dev = {
    KVM_MINOR,
    "kvm",
    &kvm_chardev_ops,
};
This device supports only the ioctl system call:
static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
    long r = -EINVAL;

    switch (ioctl) {
    case KVM_GET_API_VERSION:
        if (arg)
            goto out;
        r = KVM_API_VERSION;
        break;
    case KVM_CREATE_VM:
        r = kvm_dev_ioctl_create_vm(arg);
        break;
    case KVM_CHECK_EXTENSION:
        r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
        break;
    case KVM_GET_VCPU_MMAP_SIZE:
        if (arg)
            goto out;
        r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
        r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
        r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
        break;
    case KVM_TRACE_ENABLE:
    case KVM_TRACE_PAUSE:
    case KVM_TRACE_DISABLE:
        r = -EOPNOTSUPP;
        break;
    default:
        return kvm_arch_dev_ioctl(filp, ioctl, arg);
    }
out:
    return r;
}
The ioctl interface of /dev/kvm falls into two classes: generic calls such as KVM_GET_API_VERSION and KVM_CREATE_VM, and architecture-specific calls, which are dispatched to kvm_arch_dev_ioctl. The ioctls on this fd handle requests at the level of KVM as a whole: for example, KVM_GET_API_VERSION returns the KVM API version, and KVM_CREATE_VM creates a virtual machine.
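To see this interface from the other side, here is a minimal userspace sketch (error handling omitted) that exercises exactly these generic ioctls:

/* Minimal userspace sketch of the generic /dev/kvm ioctls
 * (error handling omitted). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);            /* served by kvm_chardev_ops */
    int ver = ioctl(kvm, KVM_GET_API_VERSION, 0);  /* stable value: 12 */
    int vm  = ioctl(kvm, KVM_CREATE_VM, 0);        /* returns a new VM fd */
    int sz  = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);

    printf("KVM API version %d, VM fd %d, kvm_run mmap size %d\n",
           ver, vm, sz);
    return 0;
}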
That concludes the work of kvm_init: it checks the hardware (CPU support and BIOS settings), allocates caches and per-CPU structures such as the VMCS regions, and creates the /dev/kvm device.