GSoC 2012 week4 & 5: week3のバグを直して擬似BIOSが動くようにした

week3で書いたコードが動いていなかったので、直した。

IntelのマニュアルのVol.3、section 9.1.1 "Processor State After Reset" と section 26.3 "CHECKING AND LOADING GUEST STATE" を読みながら、VMCSの初期値のつじつま合わせをした。

/*
 * Program the initial guest register state used to start the pseudo-BIOS.
 *
 * Each guest register and descriptor is written through vm_set_register()
 * or vm_set_desc(), one after another in the order listed in the table
 * below.  Processing stops at the first call that fails.
 *
 * vmctx: VM context handed unchanged to vm_set_register()/vm_set_desc().
 * vcpu:  virtual CPU number handed unchanged to the same calls.
 *
 * Returns 0 when every step succeeded, otherwise the first non-zero value
 * returned by vm_set_register() or vm_set_desc().
 */
int
vm_setup_bios_registers(struct vmctx *vmctx, int vcpu)
{
	/* Descriptor limit and access-rights values shared by the steps. */
	enum {
		BIOS_SEG_LIMIT		= 0xffff,
		/* PRESENT | DESC_TYPE_CODEDATA | SEG_TYPE_DATA_RW_ACCESSED */
		BIOS_AR_DATA_RW_ACC	= 0x00000093,
		/* PRESENT | DESC_TYPE_CODEDATA | SEG_TYPE_DATA_RW */
		BIOS_AR_DATA_RW		= 0x00000092,
		/* PRESENT | SEG_TYPE_16BIT_BUSY_TSS */
		BIOS_AR_TSS16_BUSY	= 0x00000083,
		/* PRESENT | SEG_TYPE_LDT */
		BIOS_AR_LDT		= 0x00000082
	};

	/* One step: write a plain register, or write a descriptor. */
	struct bios_step {
		int		is_desc;	/* 0: vm_set_register, 1: vm_set_desc */
		int		reg;		/* VM_REG_GUEST_* identifier */
		uint64_t	value;		/* register value, or descriptor base */
		uint32_t	limit;		/* descriptor limit (descriptors only) */
		uint32_t	access;		/* descriptor access (descriptors only) */
	};

#define	BIOS_REG(r, v)		{ 0, (r), (v), 0, 0 }
#define	BIOS_DESC(r, b, l, a)	{ 1, (r), (b), (l), (a) }

	/*
	 * The guest starts at CS:RIP = 0000:0000 (CS base 0).  The
	 * alternative start values RIP = 0xfff0, CS base = 0xffff0000 and
	 * CS selector = 0xf000 were previously kept here under "#if 0" and
	 * are not in use.
	 */
	static const struct bios_step steps[] = {
		BIOS_REG(VM_REG_GUEST_RIP, 0x0),
		BIOS_REG(VM_REG_GUEST_RFLAGS, 0x2),
		/* NOTE(review): presumably CD | NW | ET -- confirm. */
		BIOS_REG(VM_REG_GUEST_CR0, 0x60000010),
		BIOS_REG(VM_REG_GUEST_CR3, 0),
		BIOS_REG(VM_REG_GUEST_CR4, 0),

		/* CS and all data segments: base 0, 64KB limit, selector 0. */
		BIOS_DESC(VM_REG_GUEST_CS, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW_ACC),
		BIOS_REG(VM_REG_GUEST_CS, 0x0),
		BIOS_DESC(VM_REG_GUEST_SS, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW_ACC),
		BIOS_REG(VM_REG_GUEST_SS, 0x0),
		BIOS_DESC(VM_REG_GUEST_DS, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW_ACC),
		BIOS_REG(VM_REG_GUEST_DS, 0x0),
		BIOS_DESC(VM_REG_GUEST_ES, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW_ACC),
		BIOS_REG(VM_REG_GUEST_ES, 0x0),
		BIOS_DESC(VM_REG_GUEST_FS, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW_ACC),
		BIOS_REG(VM_REG_GUEST_FS, 0x0),
		BIOS_DESC(VM_REG_GUEST_GS, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW_ACC),
		BIOS_REG(VM_REG_GUEST_GS, 0x0),

		/* General purpose registers: all zero except RDX. */
		BIOS_REG(VM_REG_GUEST_RDX, 0xf00),
		BIOS_REG(VM_REG_GUEST_RAX, 0x0),
		BIOS_REG(VM_REG_GUEST_RBX, 0x0),
		BIOS_REG(VM_REG_GUEST_RCX, 0x0),
		BIOS_REG(VM_REG_GUEST_RSI, 0),
		BIOS_REG(VM_REG_GUEST_RDI, 0),
		BIOS_REG(VM_REG_GUEST_RBP, 0),
		BIOS_REG(VM_REG_GUEST_RSP, 0),

		/* Descriptor tables: only a descriptor, no selector write. */
		BIOS_DESC(VM_REG_GUEST_GDTR, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW),
		BIOS_DESC(VM_REG_GUEST_IDTR, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_DATA_RW),

		/* Task register and LDTR, both with selector 0. */
		BIOS_DESC(VM_REG_GUEST_TR, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_TSS16_BUSY),
		BIOS_REG(VM_REG_GUEST_TR, 0x0),
		BIOS_DESC(VM_REG_GUEST_LDTR, 0x0, BIOS_SEG_LIMIT,
		    BIOS_AR_LDT),
		BIOS_REG(VM_REG_GUEST_LDTR, 0x0),

		/*
		 * NOTE(review): 0x9 sets bit 0 and bit 3; bit 3 looks like a
		 * reserved EFER bit -- confirm this value is intended.
		 */
		BIOS_REG(VM_REG_GUEST_EFER, 0x9)
	};

#undef BIOS_DESC
#undef BIOS_REG

	unsigned int idx;

	for (idx = 0; idx < sizeof(steps) / sizeof(steps[0]); idx++) {
		const struct bios_step *step = &steps[idx];
		int rc;

		if (step->is_desc) {
			rc = vm_set_desc(vmctx, vcpu, step->reg,
			    step->value, step->limit, step->access);
		} else {
			rc = vm_set_register(vmctx, vcpu, step->reg,
			    step->value);
		}

		/* Give up at the first failing call and report its code. */
		if (rc != 0)
			return (rc);
	}

	return (0);
}

ここまではユーザランドから出来たのだが、どうもカーネル側でも手を加えないとならないと気づいたので、こんな感じに。
(real modeでVMLAUNCHするには、VM_ENTRY_LOAD_EFERとVM_ENTRY_GUEST_LMAをクリアしないとならんという話。)

Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c	(revision 238294)
+++ sys/amd64/vmm/intel/vmx.c	(revision 238295)
@@ -1652,10 +1655,22 @@
 		break;
 	case VM_CAP_UNRESTRICTED_GUEST:
 		if (cap_unrestricted_guest) {
+			uint64_t ctls;
+
 			retval = 0;
 			baseval = procbased_ctls2;
 			flag = PROCBASED2_UNRESTRICTED_GUEST;
 			reg = VMCS_SEC_PROC_BASED_CTLS;
+			error = vmcs_getreg(vmcs,
+				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+			if (error == 0) {
+				ctls &= ~(VM_ENTRY_LOAD_EFER | VM_ENTRY_GUEST_LMA);
+				vmcs_setreg(vmcs,
+				    	VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+			}else{
+				printf("%s vmcs_getreg returns %d\n",
+					__func__, error);
+			}
 		}
 		break;
 	default:

これでBHyVeを実行すると、アドレス0000:0000からリアルモードでプログラムが実行され、0000:0000に置いたVMCALL命令が無事実行されて/usr/sbin/bhyveで命令がハンドル出来た。

$ sudo bhyvebiosload -m 128 -M 256 -h /usr/bhyve-guest/ vm0
$ sudo bhyve -m 128 -M 256 -b vm0
VMCALL handled

とてもおもしろいのだが、正直セグメント周りが今でもよく分からなくて、それが原因で本来考えていたFFFF:0000からのエントリは今でもできていない(´・ω・`)

ちなみに、マトモに全diffを読みたい人は、

svn diff -r238292:238305 https://socsvn.freebsd.org/socsvn/soc2012/syuu/bhyve-bios

してください。