kexecの実装
ユーザランドのkexecコマンドへ再起動後のカーネルを渡します。
ELFパーサやbzImageカーネルイメージパーサなど、いくつかの種類のパーサがあるようですが、それらがカーネルイメージのレイアウトを読み取って解釈し、最終的にイメージとメモリレイアウトをシステムコール経由でカーネルへ渡します。
kexec.c\kexec - kernel/kexec/kexec-tools.git - kexec-tools development tree
/*
 * Pass the entry point, segment count, segment array and flags held in
 * `info` to kexec_load(); the call's return value is kept in `result`.
 */
result = kexec_load(info.entry, info.nr_segments, info.segment,
		    info.kexec_flags);
kexec-syscall.h\kexec - kernel/kexec/kexec-tools.git - kexec-tools development tree
/*
 * kexec_load - issue the raw kexec_load system call.
 * @entry:       entry address forwarded unchanged to the kernel
 * @nr_segments: number of elements in @segments
 * @segments:    array of segment descriptors
 * @flags:       flag word forwarded unchanged to the kernel
 *
 * All four arguments are forwarded as-is via syscall(__NR_kexec_load, ...).
 * Returns whatever syscall() returns, converted to long.
 */
static inline long kexec_load(void *entry, unsigned long nr_segments,
			      struct kexec_segment *segments,
			      unsigned long flags)
{
	long rc;

	rc = (long) syscall(__NR_kexec_load, entry, nr_segments,
			    segments, flags);
	return rc;
}
カーネル側では渡されたデータを解釈し、最終的に変数「kexec_image」に保存しておきます。この時点では、まだ新しいカーネルで再起動されるわけではありません。
LXR linux/kernel/kexec.c
/*
 * Load each of the nr_segments segments of the image in turn.
 * A non-zero result from kimage_load_segment() aborts the loop
 * by jumping to the `out` label.
 */
for (i = 0; i < nr_segments; i++) {
	result = kimage_load_segment(image, &image->segment[i]);
	if (result)
		goto out;
}
新しいカーネルで再起動するのはrebootを呼んだ時です。
特別なモードのrebootを呼ぶ必要があるようです。
kexec.c\kexec - kernel/kexec/kexec-tools.git - kexec-tools development tree
/*
 * Exec the new kernel (reboot).
 *
 * Calls reboot() with LINUX_REBOOT_CMD_KEXEC. If control comes back from
 * that call the request did not take effect, so the errno description is
 * written to stderr and -1 is returned.
 */
static int my_exec(void)
{
	int saved_errno;

	reboot(LINUX_REBOOT_CMD_KEXEC);

	/* Still running here, so the kexec reboot did not happen. */
	saved_errno = errno;
	fprintf(stderr, "kexec failed: %s\n", strerror(saved_errno));

	return -1;
}
カーネル側でLINUX_REBOOT_CMD_KEXECがハンドルされkexecのルーチンが呼ばれます。
LXR linux/kernel/sys.c
#ifdef CONFIG_KEXEC
	/*
	 * Only compiled when CONFIG_KEXEC is defined: the
	 * LINUX_REBOOT_CMD_KEXEC command is dispatched to kernel_kexec()
	 * and its return value is stored in ret.
	 */
	case LINUX_REBOOT_CMD_KEXEC:
		ret = kernel_kexec();
		break;
#endif
その後、kernel_kexec()からアーキテクチャごとのルーチン(machine_kexec())が呼ばれます。
LXR linux/kernel/kexec.c
/* Hand the kexec_image variable to machine_kexec(). */
machine_kexec(kexec_image);
そこでセグメントレジスタやGDTR・IDTRをリセットし、最終的にアセンブリのルーチンをコールします。
LXR linux/arch/x86/kernel/machine_kexec_64.c
/*
 * machine_kexec() - prepare the CPU state and call relocate_kernel().
 * @image: image to start; this excerpt reads its preserve_context,
 *         control_code_page, type, swap_page, head and start fields.
 *
 * Disables interrupts and hardware breakpoints, copies the relocate_kernel
 * code into the control page, fills page_list with the addresses handed to
 * relocate_kernel(), reloads the segment registers, invalidates the GDT and
 * IDT, and finally calls relocate_kernel(). The value returned by that call
 * is stored back into image->start.
 *
 * NOTE: this is an excerpt; the function body is cut off after the
 * relocate_kernel() call and its closing brace is not shown.
 */
void machine_kexec(struct kimage *image)
{
	unsigned long page_list[PAGES_NR];
	void *control_page;
	int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
	/* Only needed when the caller wants to be able to jump back. */
	if (image->preserve_context)
		save_processor_state();
#endif

	/* Saved here; presumably restored in the part not shown. */
	save_ftrace_enabled = __ftrace_enabled_save();

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();
	hw_breakpoint_disable();

	if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
		/*
		 * We need to put APICs in legacy mode so that we can
		 * get timer interrupts in second kernel. kexec/kdump
		 * paths already have calls to disable_IO_APIC() in
		 * one form or other. kexec jump path also need
		 * one.
		 */
		disable_IO_APIC();
#endif
	}

	/*
	 * The relocation code is copied one page past the start of the
	 * control-code allocation; the physical address of the start of
	 * that allocation is recorded below as PA_TABLE_PAGE.
	 */
	control_page = page_address(image->control_code_page) + PAGE_SIZE;
	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

	/* Addresses passed to relocate_kernel() through page_list. */
	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
	page_list[PA_TABLE_PAGE] =
		(unsigned long)__pa(page_address(image->control_code_page));

	/* The swap page entry is only filled in for the default image type. */
	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
						<< PAGE_SHIFT);

	/*
	 * The segment registers are funny things, they have both a
	 * visible and an invisible part.  Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * from a table in memory.  At no other time is the
	 * descriptor table in memory accessed.
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();

	/*
	 * The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	set_gdt(phys_to_virt(0), 0);
	set_idt(phys_to_virt(0), 0);

	/* now call it */
	image->start = relocate_kernel((unsigned long)image->head,
				       (unsigned long)page_list,
				       image->start,
				       image->preserve_context);
	/* (excerpt ends here; the remainder of the function is not shown) */
アセンブリのコードでは、レジスタの初期化を行い、カーネルのリロケーション(ロード先へカーネルをコピー)を行い、新しいカーネルへジャンプします。
これで新しいカーネルが起動してくるのでした。めでたしめでたし。
LXR linux/arch/x86/kernel/relocate_kernel_64.S
/*
 * relocate_kernel - save the current CPU context, switch to the identity
 * mapped page tables, copy the new kernel's pages into place and jump to it.
 *
 * Register roles established below:
 *   %r8  - physical address of the control page
 *   %r9  - physical address of the page table page
 *   %r10 - physical address of the swap page
 *   %r11 - virtual address of the control page while saving context,
 *          later the preserve_context flag (moved there from %rcx before
 *          swap_pages is called, because swap_pages overwrites %rcx)
 */
relocate_kernel:
	/*
	 * %rdi indirection_page
	 * %rsi page_list
	 * %rdx start address
	 * %rcx preserve_context
	 */

	/* Save the CPU context, used for jumping back */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushf

	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
	movq	%rsp, RSP(%r11)
	movq	%cr0, %rax
	movq	%rax, CR0(%r11)
	movq	%cr3, %rax
	movq	%rax, CR3(%r11)
	movq	%cr4, %rax
	movq	%rax, CR4(%r11)

	/* zero out flags, and disable interrupts */
	pushq	$0
	popfq

	/*
	 * get physical address of control page now
	 * this is impossible after page table switch
	 */
	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8

	/* get physical address of page table now too */
	movq	PTR(PA_TABLE_PAGE)(%rsi), %r9

	/* get physical address of swap page now */
	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10

	/* save some information for jumping back */
	movq	%r9, CP_PA_TABLE_PAGE(%r11)
	movq	%r10, CP_PA_SWAP_PAGE(%r11)
	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)

	/* Switch to the identity mapped page tables */
	movq	%r9, %cr3

	/* setup a new stack at the end of the physical control page */
	lea	PAGE_SIZE(%r8), %rsp

	/* jump to identity mapped page */
	addq	$(identity_mapped - relocate_kernel), %r8
	pushq	%r8
	ret

identity_mapped:
	/* set return address to 0 if not preserving context */
	pushq	$0
	/* store the start address on the stack */
	pushq	%rdx

	/*
	 * Set cr0 to a known state:
	 *  - Paging enabled
	 *  - Alignment check disabled
	 *  - Write protect disabled
	 *  - No task switch
	 *  - Don't do FP software emulation.
	 *  - Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
	movq	%rax, %cr0

	/*
	 * Set cr4 to a known state:
	 *  - physical address extension enabled
	 */
	movq	$X86_CR4_PAE, %rax
	movq	%rax, %cr4

	jmp	1f
1:

	/* Flush the TLB (needed?) */
	movq	%r9, %cr3

	/* keep preserve_context in %r11: swap_pages overwrites %rcx */
	movq	%rcx, %r11
	call	swap_pages

	/*
	 * To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.
	 * So I flush the TLB by reloading %cr3 here, it's handy,
	 * and not processor dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	/*
	 * set all of the registers to known values
	 * leave %rsp alone
	 */

	/* skip the clearing when preserve_context is non-zero */
	testq	%r11, %r11
	jnz	1f
	xorq	%rax, %rax
	xorq	%rbx, %rbx
	xorq	%rcx, %rcx
	xorq	%rdx, %rdx
	xorq	%rsi, %rsi
	xorq	%rdi, %rdi
	xorq	%rbp, %rbp
	xorq	%r8, %r8
	xorq	%r9, %r9
	/*
	 * Clear %r10 with itself.  XORing %r10 into %r9 instead would leave
	 * %r10 holding the swap page address and copy that value into the
	 * just-zeroed %r9.
	 */
	xorq	%r10, %r10
	xorq	%r11, %r11
	xorq	%r12, %r12
	xorq	%r13, %r13
	xorq	%r14, %r14
	xorq	%r15, %r15

	/* return to the start address pushed in identity_mapped */
	ret

1:
	/* preserve_context path: call the start address on a new stack */
	popq	%rdx
	leaq	PAGE_SIZE(%r10), %rsp
	call	*%rdx

	/* get the re-entry point of the peer system */
	movq	0(%rsp), %rbp
	/* call/pop pair: recover the current address to locate the control page */
	call	1f
1:
	popq	%r8
	subq	$(1b - relocate_kernel), %r8
	movq	CP_PA_SWAP_PAGE(%r8), %r10
	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
	movq	CP_PA_TABLE_PAGE(%r8), %rax
	movq	%rax, %cr3
	lea	PAGE_SIZE(%r8), %rsp
	call	swap_pages
	movq	$virtual_mapped, %rax
	pushq	%rax
	ret

virtual_mapped:
	/* restore the context saved at the top of relocate_kernel */
	movq	RSP(%r8), %rsp
	movq	CR4(%r8), %rax
	movq	%rax, %cr4
	movq	CR3(%r8), %rax
	movq	CR0(%r8), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0
	/* hand the peer's re-entry point back in %rax */
	movq	%rbp, %rax

	popf
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ret

	/*
	 * Do the copies.
	 *
	 * In:  %rdi - first entry word, %r10 - address of the swap page.
	 * The low bits of each entry word select its meaning:
	 *   0x1 destination page, 0x2 indirection page,
	 *   0x4 done indicator,   0x8 source page.
	 * For every source page, three copies of 512 quadwords each exchange
	 * the source and destination pages through the swap page.
	 */
swap_pages:
	movq	%rdi, %rcx	/* first entry word (passed in %rdi) goes in %rcx */
	xorq	%rdi, %rdi
	xorq	%rsi, %rsi
	jmp	1f

0:	/* top, read another word for the indirection page */

	movq	(%rbx), %rcx
	addq	$8, %rbx
1:
	testq	$0x1, %rcx	/* is it a destination page? */
	jz	2f
	movq	%rcx, %rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	0b
2:
	testq	$0x2, %rcx	/* is it an indirection page? */
	jz	2f
	movq	%rcx, %rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	0b
2:
	testq	$0x4, %rcx	/* is it the done indicator? */
	jz	2f
	jmp	3f
2:
	testq	$0x8, %rcx	/* is it the source indicator? */
	jz	0b		/* Ignore it otherwise */
	movq	%rcx, %rsi	/* For every source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	/* remember destination (%rdx) and source (%rax) */
	movq	%rdi, %rdx
	movq	%rsi, %rax

	/* source -> swap page */
	movq	%r10, %rdi
	movq	$512, %rcx
	rep ; movsq

	/* destination -> source */
	movq	%rax, %rdi
	movq	%rdx, %rsi
	movq	$512, %rcx
	rep ; movsq

	/* swap page -> destination */
	movq	%rdx, %rdi
	movq	%r10, %rsi
	movq	$512, %rcx
	rep ; movsq

	/* advance the source pointer to the next page */
	lea	PAGE_SIZE(%rax), %rsi
	jmp	0b
3:
	ret