kexecの実装

ユーザランドのkexecコマンドへ再起動後のカーネルを渡します。

ELFパーサやbzImageカーネルイメージパーサなど、いくつか種類があるようですが、それらがカーネルのレイアウトを読み取って解釈し、最終的にイメージとメモリレイアウトをシステムコール経由でカーネルへ渡します。
kexec.c\kexec - kernel/kexec/kexec-tools.git - kexec-tools development tree

/* Pass the entry point, segment count, segment array and flags held in
 * `info` to kexec_load() and keep its return value in `result`. */
result = kexec_load(
		info.entry, info.nr_segments, info.segment, info.kexec_flags);

kexec-syscall.h\kexec - kernel/kexec/kexec-tools.git - kexec-tools development tree

/*
 * Thin wrapper around the raw kexec_load system call.
 *
 * All four arguments are forwarded unchanged, in the same order, to
 * syscall(__NR_kexec_load, ...).  The value returned by syscall() is
 * converted to long and handed back to the caller.
 */
static inline long kexec_load(void *entry, unsigned long nr_segments,
			struct kexec_segment *segments, unsigned long flags)
{
	long rc;

	rc = (long) syscall(__NR_kexec_load, entry, nr_segments, segments, flags);
	return rc;
}

カーネル側では渡されたデータを解釈し、最終的に変数「kexec_image」に保存しておきます。この時点では、まだこのカーネルでリブートするわけではありません。
LXR linux/kernel/kexec.c

                /* Load each of the nr_segments segments into the image;
                 * on the first non-zero result jump to the `out` label. */
                for (i = 0; i < nr_segments; i++) {
                        result = kimage_load_segment(image, &image->segment[i]);
                        if (result)
                                goto out;
                }

新しいカーネルで再起動するのはrebootを呼んだ時です。
特別なモードのrebootを呼ぶ必要があるようです。
kexec.c\kexec - kernel/kexec/kexec-tools.git - kexec-tools development tree

/*
 *	Exec the new kernel (reboot)
 *
 *	Issues reboot(LINUX_REBOOT_CMD_KEXEC).  If execution continues past
 *	that call the request failed, so the errno text is written to
 *	stderr and -1 is returned.
 */
static int my_exec(void)
{
	const char *reason;

	reboot(LINUX_REBOOT_CMD_KEXEC);

	/* Still running here means the kexec reboot did not happen. */
	reason = strerror(errno);
	fprintf(stderr, "kexec failed: %s\n", reason);
	return -1;
}

カーネル側でLINUX_REBOOT_CMD_KEXECがハンドルされkexecのルーチンが呼ばれます。
LXR linux/kernel/sys.c

/* Only compiled in when CONFIG_KEXEC is defined: the reboot command
 * LINUX_REBOOT_CMD_KEXEC is dispatched to kernel_kexec() and its return
 * value is stored in `ret`. */
#ifdef CONFIG_KEXEC
        case LINUX_REBOOT_CMD_KEXEC:
                ret = kernel_kexec();
                break;
#endif

いろいろあってアーキテクチャごとのルーチンが呼ばれます。
LXR linux/kernel/kexec.c

        /* Hand kexec_image to the machine_kexec() routine. */
        machine_kexec(kexec_image);

そこでセグメントレジスタやGDTR・IDTRをリセットし、最終的にアセンブリのルーチンをコールします。
LXR linux/arch/x86/kernel/machine_kexec_64.c

/*
 * x86-64 machine_kexec: copy the relocation stub next to the image's
 * control page, fill page_list with the addresses the stub reads, load the
 * segment registers, invalidate the GDT/IDT and call relocate_kernel.
 *
 * NOTE(review): this excerpt ends at the relocate_kernel() call; the rest
 * of the function body, including its closing brace, is not shown here.
 */
void machine_kexec(struct kimage *image)
{
        unsigned long page_list[PAGES_NR];
        void *control_page;
        int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
        /* Save the processor state when the image requests context
         * preservation. */
        if (image->preserve_context)
                save_processor_state();
#endif

        save_ftrace_enabled = __ftrace_enabled_save();

        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
        hw_breakpoint_disable();

        if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
                /*
                 * We need to put APICs in legacy mode so that we can
                 * get timer interrupts in second kernel. kexec/kdump
                 * paths already have calls to disable_IO_APIC() in
                 * one form or other. kexec jump path also need
                 * one.
                 */
                disable_IO_APIC();
#endif
        }

        /*
         * The stub is copied one PAGE_SIZE past the start of
         * control_code_page; the start of that area itself is handed to
         * the stub as PA_TABLE_PAGE below.
         */
        control_page = page_address(image->control_code_page) + PAGE_SIZE;
        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

        /* Addresses the assembly stub picks up through page_list. */
        page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_TABLE_PAGE] =
          (unsigned long)__pa(page_address(image->control_code_page));

        /*
         * PA_SWAP_PAGE is only written for KEXEC_TYPE_DEFAULT images; for
         * any other type the slot keeps whatever the uninitialized local
         * array happened to contain.
         */
        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);

        /*
         * The segment registers are funny things, they have both a
         * visible and an invisible part.  Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * from a table in memory.  At no other time is the
         * descriptor table in memory accessed.
         *
         * I take advantage of this here by force loading the
         * segments, before I zap the gdt with an invalid value.
         */
        load_segments();
        /*
         * The gdt & idt are now invalid.
         * If you want to load them you must set up your own idt & gdt.
         */
        set_gdt(phys_to_virt(0), 0);
        set_idt(phys_to_virt(0), 0);

        /*
         * now call it
         *
         * The call targets the relocate_kernel symbol itself, not the copy
         * made above.  Whatever it returns is stored back into
         * image->start.
         */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
                                       image->preserve_context);

アセンブリのコードでは、レジスタの初期化を行い、カーネルのリロケーション(ロード先へカーネルをコピー)を行い、新しいカーネルへジャンプします。
これで新しいカーネルが起動してくるのでした。めでたしめでたし。
LXR linux/arch/x86/kernel/relocate_kernel_64.S

/*
 * relocate_kernel - entry point of the relocation stub.
 *
 * Saves the callee-saved registers, flags, stack pointer and CR0/CR3/CR4
 * so a later jump back can restore them, reads the physical addresses it
 * needs from page_list, switches to the page tables at PA_TABLE_PAGE and
 * continues at identity_mapped inside the physical control page.
 */
relocate_kernel:
	/*
	 * %rdi indirection_page
	 * %rsi page_list
	 * %rdx start address
	 * %rcx preserve_context
	 */

	/* Save the CPU context, used for jumping back */
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushf

	/*
	 * %r11 = virtual address of the control page; the stack pointer and
	 * the control registers are stored into its RSP/CR0/CR3/CR4 slots.
	 */
	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
	movq	%rsp, RSP(%r11)
	movq	%cr0, %rax
	movq	%rax, CR0(%r11)
	movq	%cr3, %rax
	movq	%rax, CR3(%r11)
	movq	%cr4, %rax
	movq	%rax, CR4(%r11)

	/* zero out flags, and disable interrupts */
	pushq $0
	popfq

	/*
	 * get physical address of control page now
	 * this is impossible after page table switch
	 */
	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8

	/* get physical address of page table now too */
	movq	PTR(PA_TABLE_PAGE)(%rsi), %r9

	/* get physical address of swap page now */
	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10

	/*
	 * save some information for jumping back:
	 * table page, swap page and the indirection page passed in %rdi
	 */
	movq	%r9, CP_PA_TABLE_PAGE(%r11)
	movq	%r10, CP_PA_SWAP_PAGE(%r11)
	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)

	/* Switch to the identity mapped page tables */
	movq	%r9, %cr3

	/* setup a new stack at the end of the physical control page */
	lea	PAGE_SIZE(%r8), %rsp

	/*
	 * jump to identity mapped page: compute the address of
	 * identity_mapped inside the physical control page, push it and
	 * "return" to it
	 */
	addq	$(identity_mapped - relocate_kernel), %r8
	pushq	%r8
	ret

/*
 * identity_mapped - runs from the physical control page after the page
 * table switch done by relocate_kernel.
 *
 * In:  %rdx start address, %rcx preserve_context, %rdi indirection page,
 *      %r9 physical address of the table page, %r10 physical address of
 *      the swap page.
 *
 * Puts CR0/CR4 into a known state, runs swap_pages, and then either
 *  - preserve_context == 0: zeroes every general purpose register except
 *    %rsp and "returns" to the start address pushed below, or
 *  - preserve_context != 0: calls the start address on a stack at the end
 *    of the swap page and, once that call returns, swaps the pages back
 *    and continues at virtual_mapped.
 */
identity_mapped:
	/* set return address to 0 if not preserving context */
	pushq	$0
	/* store the start address on the stack */
	pushq   %rdx

	/*
	 * Set cr0 to a known state:
	 *  - Paging enabled
	 *  - Alignment check disabled
	 *  - Write protect disabled
	 *  - No task switch
	 *  - Don't do FP software emulation.
	 *  - Protected mode enabled
	 */
	movq	%cr0, %rax
	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
	movq	%rax, %cr0

	/*
	 * Set cr4 to a known state:
	 *  - physical address extension enabled
	 */
	movq	$X86_CR4_PAE, %rax
	movq	%rax, %cr4

	jmp 1f
1:

	/* Flush the TLB (needed?) */
	movq	%r9, %cr3

	/* keep preserve_context in %r11: swap_pages overwrites %rcx */
	movq	%rcx, %r11
	call	swap_pages

	/*
	 * To be certain of avoiding problems with self-modifying code
	 * I need to execute a serializing instruction here.
	 * So I flush the TLB by reloading %cr3 here, it's handy,
	 * and not processor dependent.
	 */
	movq	%cr3, %rax
	movq	%rax, %cr3

	/*
	 * set all of the registers to known values
	 * leave %rsp alone
	 */

	testq	%r11, %r11
	jnz 1f
	xorq	%rax, %rax
	xorq	%rbx, %rbx
	xorq    %rcx, %rcx
	xorq    %rdx, %rdx
	xorq    %rsi, %rsi
	xorq    %rdi, %rdi
	xorq    %rbp, %rbp
	xorq	%r8,  %r8
	xorq	%r9,  %r9
	/*
	 * %r10 must be xor-ed with itself.  Using %r9 as the destination
	 * would leave the swap page address in both %r9 and %r10.
	 */
	xorq	%r10, %r10
	xorq	%r11, %r11
	xorq	%r12, %r12
	xorq	%r13, %r13
	xorq	%r14, %r14
	xorq	%r15, %r15

	/* pops the start address pushed above and jumps to it */
	ret

1:
	/* preserve_context path: call the start address on a fresh stack */
	popq	%rdx
	leaq	PAGE_SIZE(%r10), %rsp
	call	*%rdx

	/* get the re-entry point of the peer system */
	movq	0(%rsp), %rbp
	/*
	 * call/pop yields the run-time address of the local label; from it
	 * compute the run-time base of relocate_kernel into %r8
	 */
	call	1f
1:
	popq	%r8
	subq	$(1b - relocate_kernel), %r8
	/* reload what relocate_kernel stored in the control page */
	movq	CP_PA_SWAP_PAGE(%r8), %r10
	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
	movq	CP_PA_TABLE_PAGE(%r8), %rax
	movq	%rax, %cr3
	lea	PAGE_SIZE(%r8), %rsp
	call	swap_pages
	/* continue at the link-time address of virtual_mapped */
	movq	$virtual_mapped, %rax
	pushq	%rax
	ret

/*
 * virtual_mapped - final step of the jump-back path.
 *
 * In:  %r8  base of the control page data written by relocate_kernel
 *      %rbp re-entry point of the peer system
 *
 * Restores the stack pointer and CR4/CR3/CR0 from the control page, puts
 * the re-entry point in %rax as the return value, pops the flags and
 * registers in the reverse order relocate_kernel pushed them, and returns
 * to relocate_kernel's caller.
 */
virtual_mapped:
	movq	RSP(%r8), %rsp
	movq	CR4(%r8), %rax
	movq	%rax, %cr4
	movq	CR3(%r8), %rax
	/*
	 * The saved CR0 is fetched into %r8 before CR3 is written;
	 * presumably %r8 is not usable as a pointer once the saved page
	 * tables are active -- TODO confirm.
	 */
	movq	CR0(%r8), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0
	/* return value: re-entry point of the peer system */
	movq	%rbp, %rax

	popf
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ret

	/*
	 * Do the copies
	 *
	 * swap_pages - walk the entry list and exchange page contents.
	 *
	 * In:  %rdi  first entry word of the list
	 *      %r10  address of the swap (scratch) page
	 *
	 * Each entry word has flag bits in its low bits and a page address
	 * in the remaining bits (masked with 0xfffffffffffff000):
	 *   0x1 destination, 0x2 indirection, 0x4 done, 0x8 source.
	 * For every source entry the source page and the current
	 * destination page are exchanged through the swap page.
	 *
	 * Clobbers %rax, %rbx, %rcx, %rdx, %rsi and %rdi.
	 * Each copy moves 512 quadwords; this assumes that equals PAGE_SIZE
	 * and that the direction flag is clear -- TODO confirm for callers.
	 */
swap_pages:
	movq	%rdi, %rcx 	/* first entry word becomes the current entry */
	xorq	%rdi, %rdi
	xorq	%rsi, %rsi
	jmp	1f

0:	/* top, read another word for the indirection page */

	movq	(%rbx), %rcx
	addq	$8,	%rbx
1:
	testq	$0x1,	%rcx  /* is it a destination page? */
	jz	2f
	movq	%rcx,	%rdi
	andq	$0xfffffffffffff000, %rdi
	jmp	0b
2:
	testq	$0x2,	%rcx  /* is it an indirection page? */
	jz	2f
	movq	%rcx,   %rbx
	andq	$0xfffffffffffff000, %rbx
	jmp	0b
2:
	testq	$0x4,	%rcx  /* is it the done indicator? */
	jz	2f
	jmp	3f
2:
	testq	$0x8,	%rcx  /* is it the source indicator? */
	jz	0b	      /* Ignore it otherwise */
	movq	%rcx,   %rsi  /* For every source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	/* remember destination in %rdx and source in %rax */
	movq	%rdi, %rdx
	movq	%rsi, %rax

	/* step 1: source page -> swap page */
	movq	%r10, %rdi
	movq	$512,   %rcx
	rep ; movsq

	/* step 2: destination page -> source page */
	movq	%rax, %rdi
	movq	%rdx, %rsi
	movq	$512,   %rcx
	rep ; movsq

	/* step 3: swap page -> destination page */
	movq	%rdx, %rdi
	movq	%r10, %rsi
	movq	$512,   %rcx
	rep ; movsq

	/*
	 * %rdi was advanced past the destination page by the last copy;
	 * point %rsi just past the source page as well
	 */
	lea	PAGE_SIZE(%rax), %rsi
	jmp	0b
3:
	ret