GSoC 2012 week3:疑似BIOSの雛形を実行してみる
これをゲスト環境で実行する為に、疑似BIOSをロードして、CPUをリアルモードで初期化し、unrestricted guest modeを有効にしてゲストを開始する一連の実装を行ってみる。
bhyvebiosloadコマンドを作る
bhyveloadコマンドをforkしてbhyvebiosloadコマンドを作る。
これは、userbootを実行する代わりに、pseudobios.binを0x0へロードしてRIPを0x0へ設定する。
--- bhyveload/bhyveload.c 2012-05-30 05:59:02.000000000 +0900 +++ bhyvebiosload/bhyvebiosload.c 2012-06-14 16:38:11.000000000 +0900 @@ -99,6 +99,7 @@ * Console i/o callbacks */ +#if 0 static void cb_putc(void *arg, int ch) { @@ -126,6 +127,7 @@ return (n > 0); return (0); } +#endif /* * Host filesystem i/o callbacks @@ -198,6 +200,7 @@ return (0); } +#if 0 static int cb_isdir(void *arg, void *h) { @@ -205,6 +208,7 @@ return (cf->cf_isdir); } +#endif static int cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid) @@ -221,6 +225,7 @@ return (0); } +#if 0 static int cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return, size_t *namelen_return, char *name) @@ -290,6 +295,7 @@ *resid = size - n; return (0); } +#endif /* * Guest virtual machine i/o callbacks @@ -309,6 +315,7 @@ return (0); } +#if 0 static int cb_copyout(void *arg, uint64_t from, void *to, size_t size) { @@ -415,6 +422,7 @@ cb_exit(NULL, USERBOOT_EXIT_QUIT); } } +#endif static void cb_setgdt(void *arg, uint64_t base, size_t size) @@ -435,7 +443,7 @@ { int error; - error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase, rsp); + error = vm_setup_bios_registers(ctx, BSP, rip, cr3, gdtbase, rsp); if (error) { perror("vm_setup_freebsd_registers"); cb_exit(NULL, USERBOOT_EXIT_QUIT); @@ -444,6 +452,7 @@ cb_exit(NULL, 0); } +#if 0 /* * Misc */ @@ -454,6 +463,7 @@ usleep(usec); } +#endif static void cb_exit(void *arg, int v) @@ -463,6 +473,7 @@ exit(v); } +#if 0 static void cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem) { @@ -498,6 +509,7 @@ .exit = cb_exit, .getmem = cb_getmem, }; +#endif static void usage(void) @@ -513,9 +525,12 @@ main(int argc, char** argv) { void *h; - void (*func)(struct loader_callbacks_v1 *, void *, int, int); int opt, error; char *disk_image; + struct cb_file *cf; + char *buf; + size_t res; + uint64_t gdtr[3]; progname = argv[0]; @@ -586,19 +601,26 @@ term.c_lflag &= ~(ICANON|ECHO); term.c_iflag &= ~ICRNL; tcsetattr(0, 
TCSAFLUSH, &term); - h = dlopen("/boot/userboot.so", RTLD_LOCAL); - if (!h) { - printf("%s\n", dlerror()); - return (1); - } - func = dlsym(h, "loader_main"); - if (!func) { - printf("%s\n", dlerror()); - return (1); - } if (disk_image) { disk_fd = open(disk_image, O_RDONLY); } - func(&cb, NULL, USERBOOT_VERSION_1, disk_fd >= 0); + + if (cb_open(NULL, "/pseudobios.bin", &h)) { + perror("cb_open "); + return (1); + } + cf = h; + buf = malloc(6); + if (cb_read(NULL, cf, buf, 3, &res) != 0 || res != 0) { + fprintf(stderr, "cb_read\n"); + return (1); + } + cb_copyin(NULL, buf, 0x0, 3); + cb_close(NULL, cf); + vm_setup_bios_gdt(gdtr); + cb_copyin(NULL, gdtr, 0x5000, sizeof(gdtr)); + cb_setgdt(NULL, 0x5000, sizeof(gdtr)); + cb_exec(NULL, 0x0); + return (0); }
実際のdiffはこのあたり:
svn diff -r r237671:r237672 https://socsvn.freebsd.org/socsvn/soc2012/syuu/bhyve-bios
vm_setup_bios_registers(), vm_setup_bios_gdt()を作る
リアルモードで動かすために、余計なフラグやセグメントレジスタの値をクリアした。
そのあたりが決め打ちになってるAPIなので、取りあえず別の関数名・ファイル名で用意。
--- vmmapi_freebsd.c 2012-06-14 15:46:41.000000000 +0900 +++ vmmapi_bios.c 2012-06-14 16:38:29.000000000 +0900 @@ -49,7 +49,7 @@ #define GUEST_GDTR_LIMIT (3 * 8 - 1) void -vm_setup_freebsd_gdt(uint64_t *gdtr) +vm_setup_bios_gdt(uint64_t *gdtr) { gdtr[GUEST_NULL_SEL] = 0; gdtr[GUEST_CODE_SEL] = 0x0020980000000000; @@ -61,24 +61,23 @@ * 'rip' in long mode. */ int -vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu, - uint64_t rip, uint64_t cr3, uint64_t gdtbase, - uint64_t rsp) +vm_setup_bios_registers(struct vmctx *vmctx, int vcpu, + uint64_t rip, uint64_t cr3, uint64_t gdtbase, + uint64_t rsp) { int error; uint64_t cr0, cr4, efer, rflags, desc_base; uint32_t desc_access, desc_limit; - uint16_t gsel; - cr0 = CR0_PE | CR0_PG | CR0_NE; + cr0 = CR0_NE; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0) goto done; - cr4 = CR4_PAE | CR4_VMXE; + cr4 = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0) goto done; - efer = EFER_LME | EFER_LMA; + efer = 0; if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer))) goto done; @@ -135,24 +134,22 @@ if (error) goto done; - gsel = GSEL(GUEST_CODE_SEL, SEL_KPL); - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0) + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, 0)) != 0) goto done; - gsel = GSEL(GUEST_DATA_SEL, SEL_KPL); - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0) + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, 0)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0) + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, 0)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0) + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, 0)) != 0) goto done; - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0) + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, 0)) != 0) goto done; - 
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0) + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, 0)) != 0) goto done; /* XXX TR is pointing to the null selector */ @@ -168,7 +165,7 @@ goto done; /* page table base */ - if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0) + if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, 0)) != 0) goto done; desc_base = gdtbase;
実際のdiffはこのあたり:
svn diff -r r237670:r237671 https://socsvn.freebsd.org/socsvn/soc2012/syuu/bhyve-bios
bhyveコマンドにBIOSモードを追加(-bオプション)
Intel VT-xは元々、ページングを有効にしたプロテクトモードのハードウェア仮想化しかサポートしていない。
EPTを利用してリアルモードのハードウェア仮想化が出来るようになったのは、Nehalem-C(製品名としてはWestmere)世代から。
この機能をunrestricted guest modeと呼ぶ。
BHyVeのカーネルモジュールはこれをオン・オフする機能を既に持っているので、オンにしてやる。
Index: usr.sbin/bhyve/fbsdrun.c =================================================================== --- usr.sbin/bhyve/fbsdrun.c (revision 237668) +++ usr.sbin/bhyve/fbsdrun.c (revision 237669) @@ -99,6 +99,8 @@ static void *oem_tbl_start; static int oem_tbl_size; +static int bios_mode; + static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); struct vm_exit vmexit[VM_MAXCPU]; @@ -126,7 +128,7 @@ { fprintf(stderr, - "Usage: %s [-ehBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]" + "Usage: %s [-ehBHPb][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]" "[-n <pci>][-m lowmem][-M highmem] <vm>\n" " -g: gdb port (default is %d and 0 means don't open)\n" " -c: # cpus (default 1)\n" @@ -143,7 +145,8 @@ " -m: lowmem in MB\n" " -M: highmem in MB\n" " -x: mux vcpus to 1 hcpu\n" - " -t: mux vcpu timeslice hz (default %d)\n", + " -t: mux vcpu timeslice hz (default %d)\n" + " -b: BIOS compatible mode\n", progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ, DEFAULT_GUEST_TSLICE); exit(code); @@ -531,7 +544,7 @@ gdb_port = DEFAULT_GDB_PORT; guest_ncpus = 1; - while ((c = getopt(argc, argv, "ehBHPxp:g:c:z:s:S:n:m:M:")) != -1) { + while ((c = getopt(argc, argv, "ehBHPxbp:g:c:z:s:S:n:m:M:")) != -1) { switch (c) { case 'B': inject_bkpt = 1; @@ -578,6 +591,9 @@ case 'e': strictio = 1; break; + case 'b': + bios_mode = 1; + break; case 'h': usage(0); default: @@ -644,6 +660,10 @@ } } + if (bios_mode != 0) { + vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1); + } + init_inout(); init_pci(ctx);
実際のdiffはこのあたり:
svn diff -r r237668:r237669 https://socsvn.freebsd.org/socsvn/soc2012/syuu/bhyve-bios
実行してみる
$ sudo ./bhyvebiosload -m 128 -M 256 -h /usr/bhyve-guest/ vm0
$ cd ../bhyve
$ sudo ./bhyve -m 128 -M 256 -b vm0
vm exit[0]
        reason          VMX
        rip             0x0000000000000000
        inst_length     3
        error           0
        exit_reason     2
        qualification   0x0000000000000000
てな訳で、実行すべき命令がVMCALLだという事は分かっているらしいが(3byteのinst_length)、実行する前にtriple fault(exit_reason 2)を起こして死んでいる。
即時faultを起こすようなコントロールレジスタの値とか、セグメントレジスタの値になっているのだと思うのだけれど、どれがそうなのか判別がついてない。
レジスタダンプ(bhyvebiosload直後)
$ sudo vmmctl --get-all --vm=vm0 lowmem 0x0000000100000000/134217728 highmem 0x0000000108000000/268435456 efer[0] 0x0000000000000000 cr0[0] 0x0000000000000020 cr3[0] 0x0000000000000000 cr4[0] 0x0000000000002000 dr7[0] 0x0000000000000000 rsp[0] 0x0000000000000000 rip[0] 0x0000000000000000 rax[0] 0x0000000000000000 rbx[0] 0x0000000000000000 rcx[0] 0x0000000000000000 rdx[0] 0x0000000000000000 rsi[0] 0x0000000000000000 rdi[0] 0x0000000000000000 rbp[0] 0x0000000000000000 r8[0] 0x0000000000000000 r9[0] 0x0000000000000000 r10[0] 0x0000000000000000 r11[0] 0x0000000000000000 r12[0] 0x0000000000000000 r13[0] 0x0000000000000000 r14[0] 0x0000000000000000 r15[0] 0x0000000000000000 rflags[0] 0x0000000000000002 vcpu0 vcpu migration across host cpus 0 vcpu total runtime 0 vm exits due to external interrupt 0 ds desc[0] 0x0000000000000000/0x00000000/0x00000093 es desc[0] 0x0000000000000000/0x00000000/0x00000093 fs desc[0] 0x0000000000000000/0x00000000/0x00000093 gs desc[0] 0x0000000000000000/0x00000000/0x00000093 ss desc[0] 0x0000000000000000/0x00000000/0x00000093 cs desc[0] 0x0000000000000000/0x00000000/0x0000209b tr desc[0] 0x0000000000000000/0x00000000/0x0000008b ldtr desc[0] 0x0000000000000000/0x00000000/0x00010000 gdtr[0] 0x0000000000005000/0x00000017 idtr[0] 0x0000000000000000/0x00000000 cs[0] 0x0000 ds[0] 0x0000 es[0] 0x0000 fs[0] 0x0000 gs[0] 0x0000 ss[0] 0x0000 tr[0] 0x0000 ldtr[0] 0x0000 pincpu[0] unpinned pinbased_ctls[0] 0x0000003f procbased_ctls[0] 0x95006172 procbased_ctls2[0] 0x00000022 gla[0] 0x0000000000000000 gpa[0] 0x0000000000000000 entry_interruption_info[0] 0x00000000 eptp[0] 0x000000007b08801e exception_bitmap[0] 0x00040000 io_bitmap_a[0] 0x00000000 io_bitmap_b[0] 0x00000000 tsc_offset[0] 0x0000000000000000 cr0_mask[0] 0xffffffff60000020 cr0_shadow[0] 0x0000000000000020 cr4_mask[0] 0x0000000000000000 cr4_shadow[0] 0x0000000000000000 cr3_target_count[0] 0x00000000 cr3_target0[0] 0x0000000000000000 cr3_target1[0] 0x0000000000000000 cr3_target2[0] 
0x0000000000000000 cr3_target3[0] 0x0000000000000000 apic_access_addr[0] 0x0000000000000000 virtual_apic_addr[0] 0x0000000000000000 tpr_threshold[0] 0x00000000 msr_bitmap[0] 0x000000006ef31000 msr 0xc0000080[0] RW msr 0xc0000100[0] RW msr 0xc0000101[0] RW msr 0xc0000102[0] RW msr 0x00000277[0] RW vpid[0] 0x00c9 ple_window[0] 0x00000000 ple_gap[0] 0x00000000 instruction_error[0] 0x00000000 exit_ctls[0] 0x003f6ffb entry_ctls[0] 0x0000d1fb host_pat[0] 0x0001050600070406 guest_pat[0] 0x0007040600070406 host_cr0[0] 0x0000000080050033 host_cr3[0] 0x0000000000000000 host_cr4[0] 0x00000000000026f0 host_rip[0] 0xffffffff81812090 host_rip[0] 0xffffff8002aaa000 guest_sysenter_cs[0] 0x00000000 guest_sysenter_sp[0] 0x0000000000000000 guest_sysenter_ip[0] 0x0000000000000000 vmcs_pointer[0] 0xffffffffffffffff vmcs_exit_reason[0] 0x0000000000000000 vmcs_exit_qualification[0] 0x0000000000000000 vmcs_exit_interruption_info[0] 0x00000000 vmcs_exit_interruption_error[0] 0x00000000 vmcs_guest_interruptibility[0] 0x00000000 errno = 22
レジスタダンプ(bhyve実行後)
$ sudo vmmctl --get-all --vm=vm0 lowmem 0x0000000100000000/134217728 highmem 0x0000000108000000/268435456 efer[0] 0x0000000000000000 cr0[0] 0x0000000000000030 cr3[0] 0x0000000000000000 cr4[0] 0x0000000000002000 dr7[0] 0x0000000000000000 rsp[0] 0x0000000000000000 rip[0] 0x0000000000000000 rax[0] 0x0000000000000000 rbx[0] 0x0000000000000000 rcx[0] 0x0000000000000000 rdx[0] 0x0000000000000000 rsi[0] 0x0000000000000000 rdi[0] 0x0000000000000000 rbp[0] 0x0000000000000000 r8[0] 0x0000000000000000 r9[0] 0x0000000000000000 r10[0] 0x0000000000000000 r11[0] 0x0000000000000000 r12[0] 0x0000000000000000 r13[0] 0x0000000000000000 r14[0] 0x0000000000000000 r15[0] 0x0000000000000000 rflags[0] 0x0000000000010002 vcpu0 vcpu migration across host cpus 1 vcpu total runtime 23841 vm exits due to external interrupt 0 ds desc[0] 0x0000000000000000/0x00000000/0x00000093 es desc[0] 0x0000000000000000/0x00000000/0x00000093 fs desc[0] 0x0000000000000000/0x00000000/0x00000093 gs desc[0] 0x0000000000000000/0x00000000/0x00000093 ss desc[0] 0x0000000000000000/0x00000000/0x00000093 cs desc[0] 0x0000000000000000/0x00000000/0x0000209b tr desc[0] 0x0000000000000000/0x00000000/0x0000008b ldtr desc[0] 0x0000000000000000/0x00000000/0x00010000 gdtr[0] 0x0000000000005000/0x00000017 idtr[0] 0x0000000000000000/0x00000000 cs[0] 0x0000 ds[0] 0x0000 es[0] 0x0000 fs[0] 0x0000 gs[0] 0x0000 ss[0] 0x0000 tr[0] 0x0000 ldtr[0] 0x0000 pincpu[0] unpinned pinbased_ctls[0] 0x0000003f procbased_ctls[0] 0x95006172 procbased_ctls2[0] 0x000000a2 gla[0] 0x0000000000000000 gpa[0] 0x0000000000000000 entry_interruption_info[0] 0x00000000 eptp[0] 0x000000007b08801e exception_bitmap[0] 0x00040000 io_bitmap_a[0] 0x00000000 io_bitmap_b[0] 0x00000000 tsc_offset[0] 0x0000000000000000 cr0_mask[0] 0xffffffff60000020 cr0_shadow[0] 0x0000000000000020 cr4_mask[0] 0x0000000000000000 cr4_shadow[0] 0x0000000000000000 cr3_target_count[0] 0x00000000 cr3_target0[0] 0x0000000000000000 cr3_target1[0] 0x0000000000000000 cr3_target2[0] 
0x0000000000000000 cr3_target3[0] 0x0000000000000000 apic_access_addr[0] 0x0000000000000000 virtual_apic_addr[0] 0x0000000000000000 tpr_threshold[0] 0x00000000 msr_bitmap[0] 0x000000006ef31000 msr 0xc0000080[0] RW msr 0xc0000100[0] RW msr 0xc0000101[0] RW msr 0xc0000102[0] RW msr 0x00000277[0] RW vpid[0] 0x00c9 ple_window[0] 0x00000000 ple_gap[0] 0x00000000 instruction_error[0] 0x00000000 exit_ctls[0] 0x003f6ffb entry_ctls[0] 0x0000d1fb host_pat[0] 0x0001050600070406 guest_pat[0] 0x0007040600070406 host_cr0[0] 0x0000000080050033 host_cr3[0] 0x000000002c71c000 host_cr4[0] 0x00000000000026f0 host_rip[0] 0xffffffff81812090 host_rip[0] 0xffffff8002aaa000 guest_sysenter_cs[0] 0x00000000 guest_sysenter_sp[0] 0x0000000000000000 guest_sysenter_ip[0] 0x0000000000000000 vmcs_pointer[0] 0xffffffffffffffff vmcs_exit_reason[0] 0x0000000000000002 vmcs_exit_qualification[0] 0x0000000000000000 vmcs_exit_interruption_info[0] 0x00000000 vmcs_exit_interruption_error[0] 0x00000000 vmcs_guest_interruptibility[0] 0x00000000 errno = 22