GSoC 2012 week3:疑似BIOSの雛形を実行してみる

疑似BIOSの雛形をゲスト環境で実行する為に、疑似BIOSをロードし、CPUをリアルモードで初期化し、unrestricted guest modeを有効にしてゲストを開始する、という一連の実装を行ってみる。

bhyvebiosloadコマンドを作る

bhyveloadコマンドのソースをforkして、bhyvebiosloadコマンドを作る。
このコマンドは、userboot.soを実行する代わりに、pseudobios.binをゲストメモリの0x0へロードし、RIPを0x0へ設定する(下のdiffでは先頭の3バイトだけをコピーしている)。

--- bhyveload/bhyveload.c       2012-05-30 05:59:02.000000000 +0900
+++ bhyvebiosload/bhyvebiosload.c       2012-06-14 16:38:11.000000000 +0900
@@ -99,6 +99,7 @@
  * Console i/o callbacks
  */

+#if 0
 static void
 cb_putc(void *arg, int ch)
 {
@@ -126,6 +127,7 @@
                return (n > 0);
        return (0);
 }
+#endif

 /*
  * Host filesystem i/o callbacks
@@ -198,6 +200,7 @@
        return (0);
 }

+#if 0
 static int
 cb_isdir(void *arg, void *h)
 {
@@ -205,6 +208,7 @@

        return (cf->cf_isdir);
 }
+#endif

 static int
 cb_read(void *arg, void *h, void *buf, size_t size, size_t *resid)
@@ -221,6 +225,7 @@
        return (0);
 }

+#if 0
 static int
 cb_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
           size_t *namelen_return, char *name)
@@ -290,6 +295,7 @@
        *resid = size - n;
        return (0);
 }
+#endif

 /*
  * Guest virtual machine i/o callbacks
@@ -309,6 +315,7 @@
        return (0);
 }

+#if 0
 static int
 cb_copyout(void *arg, uint64_t from, void *to, size_t size)
 {
@@ -415,6 +422,7 @@
                cb_exit(NULL, USERBOOT_EXIT_QUIT);
        }
 }
+#endif

 static void
 cb_setgdt(void *arg, uint64_t base, size_t size)
@@ -435,7 +443,7 @@
 {
        int error;

-       error = vm_setup_freebsd_registers(ctx, BSP, rip, cr3, gdtbase, rsp);
+       error = vm_setup_bios_registers(ctx, BSP, rip, cr3, gdtbase, rsp);
        if (error) {
                perror("vm_setup_freebsd_registers");
                cb_exit(NULL, USERBOOT_EXIT_QUIT);
@@ -444,6 +452,7 @@
        cb_exit(NULL, 0);
 }

+#if 0
 /*
  * Misc
  */
@@ -454,6 +463,7 @@

        usleep(usec);
 }
+#endif

 static void
 cb_exit(void *arg, int v)
@@ -463,6 +473,7 @@
        exit(v);
 }

+#if 0
 static void
 cb_getmem(void *arg, uint64_t *ret_lowmem, uint64_t *ret_highmem)
 {
@@ -498,6 +509,7 @@
        .exit = cb_exit,
        .getmem = cb_getmem,
 };
+#endif

 static void
 usage(void)
@@ -513,9 +525,12 @@
 main(int argc, char** argv)
 {
        void *h;
-       void (*func)(struct loader_callbacks_v1 *, void *, int, int);
        int opt, error;
        char *disk_image;
+       struct cb_file *cf;
+       char *buf;
+       size_t res;
+       uint64_t                gdtr[3];

        progname = argv[0];

@@ -586,19 +601,26 @@
        term.c_lflag &= ~(ICANON|ECHO);
        term.c_iflag &= ~ICRNL;
        tcsetattr(0, TCSAFLUSH, &term);
-       h = dlopen("/boot/userboot.so", RTLD_LOCAL);
-       if (!h) {
-               printf("%s\n", dlerror());
-               return (1);
-       }
-       func = dlsym(h, "loader_main");
-       if (!func) {
-               printf("%s\n", dlerror());
-               return (1);
-       }

        if (disk_image) {
                disk_fd = open(disk_image, O_RDONLY);
        }
-       func(&cb, NULL, USERBOOT_VERSION_1, disk_fd >= 0);
+
+       if (cb_open(NULL, "/pseudobios.bin", &h)) {
+               perror("cb_open ");
+               return (1);
+       }
+       cf = h;
+       buf = malloc(6);
+       if (cb_read(NULL, cf, buf, 3, &res) != 0 || res != 0) {
+               fprintf(stderr, "cb_read\n");
+               return (1);
+       }
+       cb_copyin(NULL, buf, 0x0, 3);
+       cb_close(NULL, cf);
+       vm_setup_bios_gdt(gdtr);
+       cb_copyin(NULL, gdtr, 0x5000, sizeof(gdtr));
+       cb_setgdt(NULL, 0x5000, sizeof(gdtr));
+       cb_exec(NULL, 0x0);
+       return (0);
 }

実際のdiffはこのあたり:
svn diff -r r237671:r237672 https://socsvn.freebsd.org/socsvn/soc2012/syuu/bhyve-bios

vm_setup_bios_registers(), vm_setup_bios_gdt()を作る

リアルモードで動かすために、余計なフラグ(CR0のPE/PG、CR4のPAE/VMXE、EFERのLME/LMA)やセグメントレジスタの値をクリアした。
既存のvm_setup_freebsd_registers()はそのあたりが決め打ちになっているAPIなので、取りあえず別の関数名・ファイル名で用意した。

--- vmmapi_freebsd.c    2012-06-14 15:46:41.000000000 +0900
+++ vmmapi_bios.c       2012-06-14 16:38:29.000000000 +0900
@@ -49,7 +49,7 @@
 #define        GUEST_GDTR_LIMIT        (3 * 8 - 1)

 void
-vm_setup_freebsd_gdt(uint64_t *gdtr)
+vm_setup_bios_gdt(uint64_t *gdtr)
 {
        gdtr[GUEST_NULL_SEL] = 0;
        gdtr[GUEST_CODE_SEL] = 0x0020980000000000;
@@ -61,24 +61,23 @@
  * 'rip' in long mode.
  */
 int
-vm_setup_freebsd_registers(struct vmctx *vmctx, int vcpu,
-                          uint64_t rip, uint64_t cr3, uint64_t gdtbase,
-                          uint64_t rsp)
+vm_setup_bios_registers(struct vmctx *vmctx, int vcpu,
+                       uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+                       uint64_t rsp)
 {
        int error;
        uint64_t cr0, cr4, efer, rflags, desc_base;
        uint32_t desc_access, desc_limit;
-       uint16_t gsel;

-       cr0 = CR0_PE | CR0_PG | CR0_NE;
+       cr0 = CR0_NE;
        if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
                goto done;

-       cr4 = CR4_PAE | CR4_VMXE;
+       cr4 = 0;
        if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
                goto done;

-       efer = EFER_LME | EFER_LMA;
+       efer = 0;
        if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_EFER, efer)))
                goto done;

@@ -135,24 +134,22 @@
        if (error)
                goto done;

-       gsel = GSEL(GUEST_CODE_SEL, SEL_KPL);
-       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, gsel)) != 0)
+       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, 0)) != 0)
                goto done;

-       gsel = GSEL(GUEST_DATA_SEL, SEL_KPL);
-       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, gsel)) != 0)
+       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, 0)) != 0)
                goto done;

-       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, gsel)) != 0)
+       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, 0)) != 0)
                goto done;

-       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, gsel)) != 0)
+       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, 0)) != 0)
                goto done;

-       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, gsel)) != 0)
+       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, 0)) != 0)
                goto done;

-       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, gsel)) != 0)
+       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, 0)) != 0)
                goto done;

        /* XXX TR is pointing to the null selector */
@@ -168,7 +165,7 @@
                goto done;

        /* page table base */
-       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, cr3)) != 0)
+       if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, 0)) != 0)
                goto done;

        desc_base = gdtbase;

実際のdiffはこのあたり:
svn diff -r r237670:r237671 https://socsvn.freebsd.org/socsvn/soc2012/syuu/bhyve-bios

bhyveコマンドにBIOSモードを追加(-bオプション)

Intel VT-xは元々、ページングを有効にしたプロテクトモードのハードウェア仮想化しかサポートしていない。
EPTを利用してリアルモードのハードウェア仮想化が出来るようになったのは、Nehalem-C(Westmere)から。
この機能をunrestricted guest modeと呼ぶ。
BHyVeのカーネルモジュールはこれをオン・オフする機能を既に持っているので、bhyveコマンド側からオンにしてやる。

Index: usr.sbin/bhyve/fbsdrun.c
===================================================================
--- usr.sbin/bhyve/fbsdrun.c    (revision 237668)
+++ usr.sbin/bhyve/fbsdrun.c    (revision 237669)
@@ -99,6 +99,8 @@
 static void *oem_tbl_start;
 static int oem_tbl_size;

+static int bios_mode;
+
 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);

 struct vm_exit vmexit[VM_MAXCPU];
@@ -126,7 +128,7 @@
 {

         fprintf(stderr,
-                "Usage: %s [-ehBHP][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]"
+                "Usage: %s [-ehBHPb][-g <gdb port>][-z <hz>][-s <pci>][-p pincpu]"
                "[-n <pci>][-m lowmem][-M highmem] <vm>\n"
                "       -g: gdb port (default is %d and 0 means don't open)\n"
                "       -c: # cpus (default 1)\n"
@@ -143,7 +145,8 @@
                "       -m: lowmem in MB\n"
                "       -M: highmem in MB\n"
                "       -x: mux vcpus to 1 hcpu\n"
-               "       -t: mux vcpu timeslice hz (default %d)\n",
+               "       -t: mux vcpu timeslice hz (default %d)\n"
+               "       -b: BIOS compatible mode\n",
                progname, DEFAULT_GDB_PORT, DEFAULT_GUEST_HZ,
                DEFAULT_GUEST_TSLICE);
        exit(code);
@@ -531,7 +544,7 @@
        gdb_port = DEFAULT_GDB_PORT;
        guest_ncpus = 1;

-       while ((c = getopt(argc, argv, "ehBHPxp:g:c:z:s:S:n:m:M:")) != -1) {
+       while ((c = getopt(argc, argv, "ehBHPxbp:g:c:z:s:S:n:m:M:")) != -1) {
                switch (c) {
                case 'B':
                        inject_bkpt = 1;
@@ -578,6 +591,9 @@
                case 'e':
                        strictio = 1;
                        break;
+               case 'b':
+                       bios_mode = 1;
+                       break;
                case 'h':
                        usage(0);
                default:
@@ -644,6 +660,10 @@
                }
        }

+       if (bios_mode != 0) {
+               vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1);
+       }
+
        init_inout();
        init_pci(ctx);

実際のdiffはこのあたり:
svn diff -r r237668:r237669 https://socsvn.freebsd.org/socsvn/soc2012/syuu/bhyve-bios

実行してみる

$ sudo ./bhyvebiosload -m 128 -M 256 -h /usr/bhyve-guest/ vm0
$ cd ../bhyve
$ sudo ./bhyve -m 128 -M 256 -b vm0
vm exit[0]
       reason          VMX
       rip             0x0000000000000000
       inst_length     3
       error           0
       exit_reason     2
       qualification   0x0000000000000000

という訳で、実行すべき命令がVMCALLだという事は分かっているらしいが(inst_lengthが3byte)、実行する前にtriple fault(exit_reason 2)を起こして死んでいる。
即時faultを起こすようなコントロールレジスタやセグメントレジスタの値になっているのだと思うのだけれど、どれが原因なのか判別がついていない。

レジスタダンプ(bhyvebiosload直後)

$ sudo vmmctl --get-all --vm=vm0
lowmem          0x0000000100000000/134217728
highmem         0x0000000108000000/268435456
efer[0]         0x0000000000000000
cr0[0]          0x0000000000000020
cr3[0]          0x0000000000000000
cr4[0]          0x0000000000002000
dr7[0]          0x0000000000000000
rsp[0]          0x0000000000000000
rip[0]          0x0000000000000000
rax[0]          0x0000000000000000
rbx[0]          0x0000000000000000
rcx[0]          0x0000000000000000
rdx[0]          0x0000000000000000
rsi[0]          0x0000000000000000
rdi[0]          0x0000000000000000
rbp[0]          0x0000000000000000
r8[0]           0x0000000000000000
r9[0]           0x0000000000000000
r10[0]          0x0000000000000000
r11[0]          0x0000000000000000
r12[0]          0x0000000000000000
r13[0]          0x0000000000000000
r14[0]          0x0000000000000000
r15[0]          0x0000000000000000
rflags[0]       0x0000000000000002
vcpu0
vcpu migration across host cpus         0
vcpu total runtime                      0
vm exits due to external interrupt      0
ds desc[0]      0x0000000000000000/0x00000000/0x00000093
es desc[0]      0x0000000000000000/0x00000000/0x00000093
fs desc[0]      0x0000000000000000/0x00000000/0x00000093
gs desc[0]      0x0000000000000000/0x00000000/0x00000093
ss desc[0]      0x0000000000000000/0x00000000/0x00000093
cs desc[0]      0x0000000000000000/0x00000000/0x0000209b
tr desc[0]      0x0000000000000000/0x00000000/0x0000008b
ldtr desc[0]    0x0000000000000000/0x00000000/0x00010000
gdtr[0]         0x0000000000005000/0x00000017
idtr[0]         0x0000000000000000/0x00000000
cs[0]           0x0000
ds[0]           0x0000
es[0]           0x0000
fs[0]           0x0000
gs[0]           0x0000
ss[0]           0x0000
tr[0]           0x0000
ldtr[0]         0x0000
pincpu[0]       unpinned
pinbased_ctls[0]        0x0000003f
procbased_ctls[0]       0x95006172
procbased_ctls2[0]      0x00000022
gla[0]          0x0000000000000000
gpa[0]          0x0000000000000000
entry_interruption_info[0]      0x00000000
eptp[0]         0x000000007b08801e
exception_bitmap[0]     0x00040000
io_bitmap_a[0]  0x00000000
io_bitmap_b[0]  0x00000000
tsc_offset[0]   0x0000000000000000
cr0_mask[0]             0xffffffff60000020
cr0_shadow[0]           0x0000000000000020
cr4_mask[0]             0x0000000000000000
cr4_shadow[0]           0x0000000000000000
cr3_target_count[0]     0x00000000
cr3_target0[0]          0x0000000000000000
cr3_target1[0]          0x0000000000000000
cr3_target2[0]          0x0000000000000000
cr3_target3[0]          0x0000000000000000
apic_access_addr[0]     0x0000000000000000
virtual_apic_addr[0]    0x0000000000000000
tpr_threshold[0]        0x00000000
msr_bitmap[0]           0x000000006ef31000
msr 0xc0000080[0]               RW
msr 0xc0000100[0]               RW
msr 0xc0000101[0]               RW
msr 0xc0000102[0]               RW
msr 0x00000277[0]               RW
vpid[0]         0x00c9
ple_window[0]           0x00000000
ple_gap[0]              0x00000000
instruction_error[0]    0x00000000
exit_ctls[0]            0x003f6ffb
entry_ctls[0]           0x0000d1fb
host_pat[0]             0x0001050600070406
guest_pat[0]            0x0007040600070406
host_cr0[0]             0x0000000080050033
host_cr3[0]             0x0000000000000000
host_cr4[0]             0x00000000000026f0
host_rip[0]             0xffffffff81812090
host_rip[0]             0xffffff8002aaa000
guest_sysenter_cs[0]    0x00000000
guest_sysenter_sp[0]    0x0000000000000000
guest_sysenter_ip[0]    0x0000000000000000
vmcs_pointer[0] 0xffffffffffffffff
vmcs_exit_reason[0]     0x0000000000000000
vmcs_exit_qualification[0]      0x0000000000000000
vmcs_exit_interruption_info[0]  0x00000000
vmcs_exit_interruption_error[0] 0x00000000
vmcs_guest_interruptibility[0]  0x00000000
errno = 22

レジスタダンプ(bhyve実行後)

$ sudo vmmctl --get-all --vm=vm0
lowmem          0x0000000100000000/134217728
highmem         0x0000000108000000/268435456
efer[0]         0x0000000000000000
cr0[0]          0x0000000000000030
cr3[0]          0x0000000000000000
cr4[0]          0x0000000000002000
dr7[0]          0x0000000000000000
rsp[0]          0x0000000000000000
rip[0]          0x0000000000000000
rax[0]          0x0000000000000000
rbx[0]          0x0000000000000000
rcx[0]          0x0000000000000000
rdx[0]          0x0000000000000000
rsi[0]          0x0000000000000000
rdi[0]          0x0000000000000000
rbp[0]          0x0000000000000000
r8[0]           0x0000000000000000
r9[0]           0x0000000000000000
r10[0]          0x0000000000000000
r11[0]          0x0000000000000000
r12[0]          0x0000000000000000
r13[0]          0x0000000000000000
r14[0]          0x0000000000000000
r15[0]          0x0000000000000000
rflags[0]       0x0000000000010002
vcpu0
vcpu migration across host cpus         1
vcpu total runtime                      23841
vm exits due to external interrupt      0
ds desc[0]      0x0000000000000000/0x00000000/0x00000093
es desc[0]      0x0000000000000000/0x00000000/0x00000093
fs desc[0]      0x0000000000000000/0x00000000/0x00000093
gs desc[0]      0x0000000000000000/0x00000000/0x00000093
ss desc[0]      0x0000000000000000/0x00000000/0x00000093
cs desc[0]      0x0000000000000000/0x00000000/0x0000209b
tr desc[0]      0x0000000000000000/0x00000000/0x0000008b
ldtr desc[0]    0x0000000000000000/0x00000000/0x00010000
gdtr[0]         0x0000000000005000/0x00000017
idtr[0]         0x0000000000000000/0x00000000
cs[0]           0x0000
ds[0]           0x0000
es[0]           0x0000
fs[0]           0x0000
gs[0]           0x0000
ss[0]           0x0000
tr[0]           0x0000
ldtr[0]         0x0000
pincpu[0]       unpinned
pinbased_ctls[0]        0x0000003f
procbased_ctls[0]       0x95006172
procbased_ctls2[0]      0x000000a2
gla[0]          0x0000000000000000
gpa[0]          0x0000000000000000
entry_interruption_info[0]      0x00000000
eptp[0]         0x000000007b08801e
exception_bitmap[0]     0x00040000
io_bitmap_a[0]  0x00000000
io_bitmap_b[0]  0x00000000
tsc_offset[0]   0x0000000000000000
cr0_mask[0]             0xffffffff60000020
cr0_shadow[0]           0x0000000000000020
cr4_mask[0]             0x0000000000000000
cr4_shadow[0]           0x0000000000000000
cr3_target_count[0]     0x00000000
cr3_target0[0]          0x0000000000000000
cr3_target1[0]          0x0000000000000000
cr3_target2[0]          0x0000000000000000
cr3_target3[0]          0x0000000000000000
apic_access_addr[0]     0x0000000000000000
virtual_apic_addr[0]    0x0000000000000000
tpr_threshold[0]        0x00000000
msr_bitmap[0]           0x000000006ef31000
msr 0xc0000080[0]               RW
msr 0xc0000100[0]               RW
msr 0xc0000101[0]               RW
msr 0xc0000102[0]               RW
msr 0x00000277[0]               RW
vpid[0]         0x00c9
ple_window[0]           0x00000000
ple_gap[0]              0x00000000
instruction_error[0]    0x00000000
exit_ctls[0]            0x003f6ffb
entry_ctls[0]           0x0000d1fb
host_pat[0]             0x0001050600070406
guest_pat[0]            0x0007040600070406
host_cr0[0]             0x0000000080050033
host_cr3[0]             0x000000002c71c000
host_cr4[0]             0x00000000000026f0
host_rip[0]             0xffffffff81812090
host_rip[0]             0xffffff8002aaa000
guest_sysenter_cs[0]    0x00000000
guest_sysenter_sp[0]    0x0000000000000000
guest_sysenter_ip[0]    0x0000000000000000
vmcs_pointer[0] 0xffffffffffffffff
vmcs_exit_reason[0]     0x0000000000000002
vmcs_exit_qualification[0]      0x0000000000000000
vmcs_exit_interruption_info[0]  0x00000000
vmcs_exit_interruption_error[0] 0x00000000
vmcs_guest_interruptibility[0]  0x00000000
errno = 22