qemu加载seaBIOS源码分析

qemu加载seaBIOS源码分析

Posted by lwk on May 15, 2022

在虚拟化中,对虚拟机的一项重要模拟就是BIOS的模拟,目前QEMU对BIOS的模拟主要使用开源的SeaBIOS。本文主要介绍QEMU是如何加载SeaBIOS的。

pc_i440fx_machine_options会设置qemu的firmware为bios-256k.bin,代码如下:


/*
 * Fill in the class-level defaults shared by the i440FX-based PC machine
 * types: descriptive strings, the default firmware image, the default
 * display and NIC models, and the sysbus devices that may be added
 * dynamically.
 */
static void pc_i440fx_machine_options(MachineClass *m)
{
    /* Identification and generic machine defaults. */
    m->desc = "Standard PC (i440FX + PIIX, 1996)";
    m->family = "pc_piix";
    m->default_display = "std";
    /* Default firmware option: the 256 KiB BIOS image. */
    m->default_machine_opts = "firmware=bios-256k.bin";

    /* PC-specific default: network card model. */
    PC_MACHINE_CLASS(m)->default_nic_model = "e1000";

    /* Sysbus device types that are allowed to be created dynamically. */
    machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE);
    machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE);
}

pc_i440fx_machine_options会在QEMU注册虚拟机机器类型的时候被调用,这是由宏DEFINE_PC_MACHINE指定的。这一点在之前的文章《qemu模拟machine类型是从哪来的》中提到过。

main函数首先会将这个default_machine_opts挂到machine的option lists上,之后从这个option lists上把firmware的值赋值到bios_name,由上可知bios_name为bios-256k.bin

 /*
     * Excerpt from main().
     *
     * Get the default machine options from the machine if it is not already
     * specified either by the configuration file or by the command line.
     * The machine class's default_machine_opts string is applied as defaults
     * on the "machine" option group.
     */
    if (machine_class->default_machine_opts) {
        qemu_opts_set_defaults(qemu_find_opts("machine"),
                               machine_class->default_machine_opts, 0);
    }

BIOS固件的加载是在函数x86_bios_rom_init(旧版本QEMU中名为old_pc_system_rom_init)中完成的,调用链为pc_init1->pc_memory_init->pc_system_firmware_init->x86_bios_rom_init。该函数主要完成三项工作:

1)打开文件,得到文件信息,创建一个BIOS MemoryRegion。首先通过qemu_find_file和get_image_size得到文件的路径和大小,大小需要是64KB的整数倍,然后调用memory_region_init_ram初始化一个名为pc.bios的MemoryRegion。memory_region_init_ram会在qemu的地址空间分配bios_size,即256KB大小的地址空间。

/*
 * Load the PC BIOS image and map it into guest memory.
 *
 * @rom_memory:   container region the BIOS regions are added to.
 * @isapc_ram_fw: when false, both the BIOS region and its ISA alias are
 *                marked read-only; when true they are left writable.
 *
 * The image named by the global bios_name (BIOS_FILENAME if unset) must
 * exist and its size must be a positive multiple of 64 KiB.  On any
 * failure an error is printed and the process exits with status 1.
 */
void x86_bios_rom_init(MemoryRegion *rom_memory, bool isapc_ram_fw)
{
    MemoryRegion *bios, *isa_bios;
    char *filename;
    int bios_size, isa_bios_size;

    /* BIOS load */
    if (!bios_name) {
        bios_name = BIOS_FILENAME;
    }

    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
    bios_size = filename ? get_image_size(filename) : -1;
    if (bios_size <= 0 || bios_size % 65536 != 0) {
        goto bios_error;
    }

    bios = g_malloc(sizeof(*bios));
    memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal);
    if (!isapc_ram_fw) {
        memory_region_set_readonly(bios, true);
    }

    /* Register the image so that it ends exactly at the 4 GiB boundary. */
    if (rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1) != 0) {
        goto bios_error;
    }
    g_free(filename);

    /* map the last 128KB of the BIOS in ISA space */
    isa_bios_size = MIN(bios_size, 128 * KiB);
    isa_bios = g_malloc(sizeof(*isa_bios));
    memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
                             bios_size - isa_bios_size, isa_bios_size);
    memory_region_add_subregion_overlap(rom_memory, 0x100000 - isa_bios_size,
                                        isa_bios, 1);
    if (!isapc_ram_fw) {
        memory_region_set_readonly(isa_bios, true);
    }

    /* map all the bios at the top of memory */
    memory_region_add_subregion(rom_memory, (uint32_t)(-bios_size), bios);
    return;

bios_error:
    fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
    exit(1);
}

2)通过宏rom_add_file_fixed调用rom_add_file打开BIOS固件文件


/*
 * Convenience wrapper around rom_add_file() for a ROM loaded at a fixed
 * address: _f is the file name, _a the load address and _i the boot index.
 * No fw_cfg directory, memory region or address space is supplied, and the
 * image is not flagged as an option ROM.
 */
#define rom_add_file_fixed(_f, _a, _i)          \
    rom_add_file(_f, NULL, _a, _i, false, NULL, NULL)
/*
 * Read a ROM image from disk into a newly allocated Rom and register it.
 *
 * @file:       image name; looked up with qemu_find_file() first and used
 *              verbatim as a path if that lookup fails.
 * @fw_dir:     if non-NULL, recorded together with @file so the image can
 *              be exposed through fw_cfg when fw_cfg is available.
 * @addr:       load address recorded in the Rom.
 * @bootindex:  passed on to add_boot_device_path().
 * @option_rom: whether the image is an option ROM (affects whether a
 *              memory region is created for the fw_cfg case).
 * @mr:         optional memory region to associate with the Rom.
 * @as:         optional address space; must not be combined with @mr.
 *
 * Returns 0 on success, -1 on failure (after printing a message).
 */
int rom_add_file(const char *file, const char *fw_dir,
                 hwaddr addr, int32_t bootindex,
                 bool option_rom, MemoryRegion *mr,
                 AddressSpace *as)
{
    MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
    char devpath[100];
    Rom *rom;
    int romfd = -1;
    int nread;

    if (as && mr) {
        fprintf(stderr, "Specifying an Address Space and Memory Region is "
                "not valid when loading a rom\n");
        /* Nothing has been allocated yet, so there is nothing to undo. */
        return -1;
    }

    rom = g_malloc0(sizeof(*rom));
    rom->name = g_strdup(file);
    rom->path = qemu_find_file(QEMU_FILE_TYPE_BIOS, rom->name);
    rom->as = as;
    if (!rom->path) {
        /* Not found by the lookup: treat the name itself as the path. */
        rom->path = g_strdup(file);
    }

    romfd = open(rom->path, O_RDONLY | O_BINARY);
    if (romfd == -1) {
        fprintf(stderr, "Could not open option rom '%s': %s\n",
                rom->path, strerror(errno));
        goto err;
    }

    if (fw_dir) {
        rom->fw_dir  = g_strdup(fw_dir);
        rom->fw_file = g_strdup(file);
    }
    rom->addr = addr;

    /* The image size is the file length, found by seeking to the end. */
    rom->romsize = lseek(romfd, 0, SEEK_END);
    if (rom->romsize == -1) {
        fprintf(stderr, "rom: file %-20s: get size error: %s\n",
                rom->name, strerror(errno));
        goto err;
    }

    /* Pull the whole file into a zero-initialised buffer. */
    rom->datasize = rom->romsize;
    rom->data = g_malloc0(rom->datasize);
    lseek(romfd, 0, SEEK_SET);
    nread = read(romfd, rom->data, rom->datasize);
    if (nread != rom->datasize) {
        fprintf(stderr, "rom: file %-20s: read error: rc=%d (expected %zd)\n",
                rom->name, nread, rom->datasize);
        goto err;
    }
    close(romfd);

    rom_insert(rom);

    if (rom->fw_file && fw_cfg) {
        char fw_path[FW_CFG_MAX_FILE_PATH];
        const char *slash = strrchr(rom->fw_file, '/');
        const char *leaf = slash ? slash + 1 : rom->fw_file;
        void *blob;

        /* fw_cfg entry name is "<fw_dir>/<last path component>". */
        snprintf(fw_path, sizeof(fw_path), "%s/%s", rom->fw_dir, leaf);
        snprintf(devpath, sizeof(devpath), "/rom@%s", fw_path);

        if (mc->rom_file_has_mr && (!option_rom || mc->option_rom_has_mr)) {
            blob = rom_set_mr(rom, OBJECT(fw_cfg), devpath, true);
        } else {
            blob = rom->data;
        }

        fw_cfg_add_file(fw_cfg, fw_path, blob, rom->romsize);
    } else if (mr) {
        rom->mr = mr;
        snprintf(devpath, sizeof(devpath), "/rom@%s", file);
    } else {
        snprintf(devpath, sizeof(devpath), "/rom@" TARGET_FMT_plx, addr);
    }

    add_boot_device_path(bootindex, NULL, devpath);
    return 0;

err:
    if (romfd != -1) {
        close(romfd);
    }
    rom_free(rom);
    return -1;
}

rom_add_file的主要作用之一就是分配一个Rom结构体,记录BIOS的一些基本信息,并调用rom_insert函数将新分配的Rom挂到一个链表上。Rom的定义如下:

 /*
  * Bookkeeping record for one ROM image; instances are linked together on
  * the "roms" list through the "next" field.
  */
 struct Rom {
    char *name;             /* copy of the file name the ROM was requested by */
    char *path;             /* resolved path of the image file */

    /* datasize is the amount of memory allocated in "data". If datasize is less
     * than romsize, it means that the area from datasize to romsize is filled
     * with zeros.
     */
    size_t romsize;         /* size of the ROM image (file length in rom_add_file) */
    size_t datasize;

    uint8_t *data;          /* buffer holding the image contents */
    MemoryRegion *mr;       /* memory region associated with the ROM, if any */
    AddressSpace *as;       /* address space supplied by the caller, if any */
    int isrom;              /* set by rom_check_and_register_reset: target is ROM */
    char *fw_dir;           /* fw_cfg directory; set only when one was given */
    char *fw_file;          /* fw_cfg file name; set together with fw_dir */
    GMappedFile *mapped_file; /* NOTE(review): not used in the code shown here */

    bool committed;         /* NOTE(review): not used in the code shown here */

    hwaddr addr;            /* load address of the ROM */
    QTAILQ_ENTRY(Rom) next; /* list linkage */
};

3)最后一步就是将创建的bios MemoryRegion设置为rom memory的子Region,其offset设置为BIOS的加载地址。QEMU使用的SeaBIOS是256KB,bios_size为0x40000,(uint32_t)-bios_size为0xfffc0000,所以这里是把BIOS加载到了0xfffc0000处,也就是最靠近4GB地址空间的256KB中。这里还会把创建的isa_bios作为BIOS MemoryRegion的别名,并且放在最靠近1MB的128KB位置。

现在分配了BIOS MemoryRegion,并且设置了其基地址和大小,也将BIOS的数据加载到了内存中,但是只是在rom->data中。需要将BIOS的数据复制到MemoryRegion中,并将虚拟机对应的物理内存映射到QEMU的虚拟内存中。

QEMU会在main函数中调用rom_check_and_register_reset,后者的主要工作是将rom_reset挂到reset_handlers链表上,当虚拟机重置时会调用链表上的每一个函数。

rom_reset最重要的任务就是把存放在rom->data中的BIOS数据复制到BIOS MemoryRegion对应的QEMU进程中分配的虚拟内存中。


    /*
     * Excerpt from main(): validate the registered ROMs and install the ROM
     * reset handler; a non-zero return is treated as fatal.
     */
    if (rom_check_and_register_reset() != 0) { 
        error_report("rom check and register reset failed");
        exit(1);
    }    


/*
 * Walk the list of registered ROMs, reject overlapping fixed-address ROMs,
 * record for each ROM whether its load address resolves to a ROM memory
 * region, and finally register rom_reset() as a reset handler.
 *
 * ROMs that are exposed through fw_cfg (fw_file set) are skipped entirely.
 * The overlap check only applies to ROMs without their own memory region
 * and only compares a ROM with the previous such ROM in the same address
 * space.
 *
 * Returns 0 on success, -1 if two ROMs overlap.
 */
int rom_check_and_register_reset(void)
{
    hwaddr next_free = 0;          /* first address past the previous ROM */
    AddressSpace *prev_as = NULL;  /* address space of the previous ROM */
    Rom *rom;

    QTAILQ_FOREACH(rom, &roms, next) {
        MemoryRegionSection section;
        MemoryRegion *root;

        if (rom->fw_file) {
            continue;
        }

        if (!rom->mr) {
            if (prev_as == rom->as && next_free > rom->addr) {
                fprintf(stderr, "rom: requested regions overlap "
                        "(rom %s. free=0x" TARGET_FMT_plx
                        ", addr=0x" TARGET_FMT_plx ")\n",
                        rom->name, next_free, rom->addr);
                return -1;
            }
            next_free = rom->addr;
            next_free += rom->romsize;
            prev_as = rom->as;
        }

        /* Look up what backs the ROM's first byte. */
        root = rom->mr ? rom->mr : get_system_memory();
        section = memory_region_find(root, rom->addr, 1);
        rom->isrom = int128_nz(section.size) &&
                     memory_region_is_rom(section.mr);
        memory_region_unref(section.mr);
    }

    qemu_register_reset(rom_reset, NULL);
    roms_loaded = 1;
    return 0;
}

现在BIOS MemoryRegion对应的宿主机QEMU进程虚拟内存中已经有了BIOS数据,并且这个MR对应的虚拟机物理地址是0xfffc0000。

CPU在启动后会初始化各个寄存器的值,其中CS被初始化为0xf000,EIP被初始化为0xfff0,CS的基地址被初始化为0xffff0000。


/*
 * Reset handler for an x86 CPU device.
 *
 * Runs the parent class's reset, zeroes every CPUX86State field located
 * before end_reset_fields, and then programs the register state the CPU
 * starts from: control registers, descriptor-table limits, segment caches,
 * EIP/EFLAGS, FPU/SSE defaults, PAT, debug registers, XCR0/CR4, MTRRs and
 * the pending-event bookkeeping.  In system emulation it also designates
 * the bootstrap processor and halts all other CPUs.
 */
static void x86_cpu_reset(DeviceState *dev)
{
    CPUState *s = CPU(dev);
    X86CPU *cpu = X86_CPU(s);
    X86CPUClass *xcc = X86_CPU_GET_CLASS(cpu);
    CPUX86State *env = &cpu->env;
    target_ulong cr4; 
    uint64_t xcr0;
    int i;

    /* Let the parent class reset its part of the state first. */
    xcc->parent_reset(dev);

    /* Clear everything in env that lies before end_reset_fields. */
    memset(env, 0, offsetof(CPUX86State, end_reset_fields));

    env->old_exception = -1;

    /* init to reset state */

    env->hflags2 |= HF2_GIF_MASK;
    env->hflags &= ~HF_GUEST_MASK;

    cpu_x86_update_cr0(env, 0x60000010);
    /* All mask bits set. NOTE(review): presumably means A20 is not masked. */
    env->a20_mask = ~0x0;
    env->smbase = 0x30000;
    env->msr_smi_count = 0; 

    /* Descriptor tables and task register start with 64 KiB - 1 limits. */
    env->idt.limit = 0xffff;
    env->gdt.limit = 0xffff;
    env->ldt.limit = 0xffff;
    env->ldt.flags = DESC_P_MASK | (2 << DESC_TYPE_SHIFT);
    env->tr.limit = 0xffff;
    env->tr.flags = DESC_P_MASK | (11 << DESC_TYPE_SHIFT);

    /*
     * CS gets selector 0xf000 with base 0xffff0000; combined with the EIP
     * of 0xfff0 set below, the first fetch is at 0xfffffff0.  The other
     * segment registers get selector 0, base 0 and a 0xffff limit.
     */
    cpu_x86_load_seg_cache(env, R_CS, 0xf000, 0xffff0000, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK |
                           DESC_R_MASK | DESC_A_MASK);
    cpu_x86_load_seg_cache(env, R_DS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |
                           DESC_A_MASK);
    cpu_x86_load_seg_cache(env, R_ES, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |
                           DESC_A_MASK);
    cpu_x86_load_seg_cache(env, R_SS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |
                           DESC_A_MASK);
    cpu_x86_load_seg_cache(env, R_FS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |
                           DESC_A_MASK);
    cpu_x86_load_seg_cache(env, R_GS, 0, 0, 0xffff,
                           DESC_P_MASK | DESC_S_MASK | DESC_W_MASK |
                           DESC_A_MASK);

    env->eip = 0xfff0;
    /* EDX holds the CPUID version value after reset. */
    env->regs[R_EDX] = env->cpuid_version;
    env->eflags = 0x2;

    /* FPU init */
    for (i = 0; i < 8; i++) {
        /* NOTE(review): a tag of 1 presumably marks the register as empty. */
        env->fptags[i] = 1;
    }
    cpu_set_fpuc(env, 0x37f);

    env->mxcsr = 0x1f80;
    /* All units are in INIT state.  */
    env->xstate_bv = 0;

    env->pat = 0x0007040600070406ULL;
    env->msr_ia32_misc_enable = MSR_IA32_MISC_ENABLE_DEFAULT;
    /* Set the MWAIT bit only when the CPU model has the MONITOR feature. */
    if (env->features[FEAT_1_ECX] & CPUID_EXT_MONITOR) {
        env->msr_ia32_misc_enable |= MSR_IA32_MISC_ENABLE_MWAIT;
    }

    /* Debug registers: all zero except the fixed-one bits of DR6/DR7. */
    memset(env->dr, 0, sizeof(env->dr));
    env->dr[6] = DR6_FIXED_1;
    env->dr[7] = DR7_FIXED_1;
    cpu_breakpoint_remove_all(s, BP_CPU);
    cpu_watchpoint_remove_all(s, BP_CPU);

    /* Defaults for system emulation: CR4 clear, only x87 state in XCR0. */
    cr4 = 0;
    xcr0 = XSTATE_FP_MASK;
#ifdef CONFIG_USER_ONLY
    /* Enable all the features for user-mode.  */
    if (env->features[FEAT_1_EDX] & CPUID_SSE) {
        xcr0 |= XSTATE_SSE_MASK;
    }
    /* Components 0 and 1 are handled above; scan the remaining ones. */
    for (i = 2; i < ARRAY_SIZE(x86_ext_save_areas); i++) {
        const ExtSaveArea *esa = &x86_ext_save_areas[i];
        if (env->features[esa->feature] & esa->bits) {
            xcr0 |= 1ull << i;
        }
    }

    if (env->features[FEAT_1_ECX] & CPUID_EXT_XSAVE) {
        cr4 |= CR4_OSFXSR_MASK | CR4_OSXSAVE_MASK;
    }
    if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_FSGSBASE) {
        cr4 |= CR4_FSGSBASE_MASK;
    }
#endif
    env->xcr0 = xcr0;
    cpu_x86_update_cr4(env, cr4);

    /*
     * SDM 11.11.5 requires:
     *  - IA32_MTRR_DEF_TYPE MSR.E = 0
     *  - IA32_MTRR_PHYSMASKn.V = 0
     * All other bits are undefined.  For simplification, zero it all.
     */
    env->mtrr_deftype = 0;
    memset(env->mtrr_var, 0, sizeof(env->mtrr_var));
    memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));

    /* No interrupt, exception or NMI is pending or injected after reset. */
    env->interrupt_injected = -1;
    env->exception_nr = -1;
    env->exception_pending = 0;
    env->exception_injected = 0;
    env->exception_has_payload = false;
    env->exception_payload = 0;
    env->nmi_injected = false;
#if !defined(CONFIG_USER_ONLY)
    /* We hard-wire the BSP to the first CPU. */
    apic_designate_bsp(cpu->apic_state, s->cpu_index == 0);

    /* Every CPU other than the BSP starts out halted. */
    s->halted = !cpu_is_bsp(cpu);
    if (kvm_enabled()) {
        kvm_arch_reset_vcpu(cpu);
    }
#endif
}

CPU执行的第一条指令的地址为CS基地址加上EIP,即0xffff0000 + 0xfff0 = 0xfffffff0。从前面的分析可知,0xfffffff0处就是BIOS的最后16个字节的开始处,这里通常是一条跳转指令,跳转到BIOS的前面执行。