On the SMP implementation in Linux 2.6/MIPS

I barely know Linux, so let's start by tracing the kernel's boot sequence.

First, the kernel starts from kernel_entry.

	j		start_kernel
	END(kernel_entry)

At the end of kernel_entry it jumps to start_kernel().

asmlinkage void __init start_kernel(void)
{
	char * command_line;
	extern struct kernel_param __start___param[], __stop___param[];

	smp_setup_processor_id();

Right off the bat there's smp_setup_processor_id(), but it doesn't seem to be defined for MIPS.
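For reference, the generic fallback in init/main.c appears to be nothing but an empty weak function; quoted from memory, so treat it as a sketch:

void __init __attribute__((weak)) smp_setup_processor_id(void)
{
}

Architectures that need to learn the boot CPU's ID this early override the weak symbol; MIPS apparently just uses the empty default. Anyway, back to start_kernel():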

	/*
	 * Need to run as early as possible, to initialize the
	 * lockdep hash:
	 */
	unwind_init();
	lockdep_init();

	local_irq_disable();
	early_boot_irqs_off();
	early_init_irq_lock_class();

/*
 * Interrupts are still disabled. Do necessary setups, then
 * enable them
 */
	lock_kernel();
	tick_init();
	boot_cpu_init();
	page_address_init();
	printk(KERN_NOTICE);
	printk(linux_banner);
	setup_arch(&command_line);

A little further down there is the call into the architecture-specific initialization code, setup_arch().

void __init setup_arch(char **cmdline_p)
{
	cpu_probe();
	prom_init();

#ifdef CONFIG_EARLY_PRINTK
	{
		extern void setup_early_printk(void);

		setup_early_printk();
	}
#endif
	cpu_report();

#if defined(CONFIG_VT)
#if defined(CONFIG_VGA_CONSOLE)
	conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
	conswitchp = &dummy_con;
#endif
#endif

	arch_mem_init(cmdline_p);

	resource_init();
#ifdef CONFIG_SMP
	plat_smp_setup();
#endif
}

setup_arch() calls plat_smp_setup(), the board-specific SMP setup function.
Here I'll follow the SiByte CFE version.

/*
 * Use CFE to find out how many CPUs are available, setting up
 * phys_cpu_present_map and the logical/physical mappings.
 * XXXKW will the boot CPU ever not be physical 0?
 *
 * Common setup before any secondaries are started
 */
void __init plat_smp_setup(void)
{
	int i, num;

	cpus_clear(phys_cpu_present_map);
	cpu_set(0, phys_cpu_present_map);
	__cpu_number_map[0] = 0;
	__cpu_logical_map[0] = 0;

	for (i = 1, num = 0; i < NR_CPUS; i++) {
		if (cfe_cpu_stop(i) == 0) {
			cpu_set(i, phys_cpu_present_map);
			__cpu_number_map[i] = ++num;
			__cpu_logical_map[num] = i;
		}
	}
	printk(KERN_INFO "Detected %i available secondary CPU(s)\n", num);
}

cpus_clear() and cpu_set() are macros that clear or set a CPU's bit in phys_cpu_present_map, i.e. they record which CPUs exist.
cfe_cpu_stop() asks the firmware to stop the given CPU, but why stop it here of all places?
The answer seems to be that the return value tells you whether the CPU exists at all: if cfe_cpu_stop(i) returns 0, physical CPU i is present.
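To make the logical/physical mapping concrete, here is a small stand-alone sketch of what the loop above builds. cpu_exists() is a made-up stand-in for the "cfe_cpu_stop(i) == 0" probe; nothing below is kernel code:

#include <stdio.h>

#define NR_CPUS 4

/* pretend only physical CPUs 0 and 1 respond to the firmware probe */
static int cpu_exists(int i)
{
	return i < 2;
}

int main(void)
{
	int cpu_number_map[NR_CPUS] = { 0 };	/* physical -> logical */
	int cpu_logical_map[NR_CPUS] = { 0 };	/* logical -> physical */
	int i, num = 0;

	for (i = 1; i < NR_CPUS; i++) {
		if (cpu_exists(i)) {
			cpu_number_map[i] = ++num;
			cpu_logical_map[num] = i;
		}
	}

	for (i = 0; i <= num; i++)
		printf("logical %d <-> physical %d (number_map[%d]=%d)\n",
		       i, cpu_logical_map[i], cpu_logical_map[i],
		       cpu_number_map[cpu_logical_map[i]]);
	return 0;
}

When every CPU answers, the logical and physical numbers come out identical, but keeping both arrays lets the kernel cope with holes in the physical numbering. Now for cfe_cpu_stop() itself: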

#if defined(CFE_API_cpu_stop) || defined(CFE_API_ALL)
int cfe_cpu_stop(int cpu)
{
	cfe_xiocb_t xiocb;

	xiocb.xiocb_fcode = CFE_CMD_FW_CPUCTL;
	xiocb.xiocb_status = 0;
	xiocb.xiocb_handle = 0;
	xiocb.xiocb_flags = 0;
	xiocb.xiocb_psize = sizeof(xiocb_cpuctl_t);
	xiocb.plist.xiocb_cpuctl.cpu_number = cpu;
	xiocb.plist.xiocb_cpuctl.cpu_command = CFE_CPU_CMD_STOP;

	cfe_iocb_dispatch(&xiocb);

	return xiocb.xiocb_status;
}
#endif				/* CFE_API_cpu_stop || CFE_API_ALL */

This is cfe_cpu_stop(): a wrapper that fills in the command and its parameters, calls cfe_iocb_dispatch(), and passes back the resulting status.

	setup_command_line(command_line);
	unwind_setup();
	setup_per_cpu_areas();
	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */

Back to start_kernel().

#ifdef __GENERIC_PER_CPU
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;

EXPORT_SYMBOL(__per_cpu_offset);

static void __init setup_per_cpu_areas(void)
{
	unsigned long size, i;
	char *ptr;
	unsigned long nr_possible_cpus = num_possible_cpus();

	/* Copy section for each CPU (we discard the original) */
	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
#ifdef CONFIG_MODULES
	if (size < PERCPU_ENOUGH_ROOM)
		size = PERCPU_ENOUGH_ROOM;
#endif
	ptr = alloc_bootmem(size * nr_possible_cpus);

	for_each_possible_cpu(i) {
		__per_cpu_offset[i] = ptr - __per_cpu_start;
		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
		ptr += size;
	}
}
#endif /* !__GENERIC_PER_CPU */

There are some strange macros that I don't fully understand, but this looks like the allocation and initialization of the per-CPU data areas: the per-CPU section is copied once for every possible CPU, and each copy's offset from the original is recorded in __per_cpu_offset[].
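As a sanity check on that reading, here is a stand-alone toy version of the same idea (all names here are made up; this is not the kernel's per_cpu machinery): each "CPU" gets its own copy of a template variable, and accesses go through an offset table just like __per_cpu_offset[].

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 2

static long per_cpu_offset[NCPUS];

/* stands in for a variable placed in the per-CPU section */
static int counter = 42;

/* reach CPU n's private copy by adding its offset to the template's address */
#define my_per_cpu(var, cpu) \
	(*(typeof(&(var)))((char *)&(var) + per_cpu_offset[cpu]))

int main(void)
{
	size_t size = sizeof(counter);
	char *ptr = malloc(size * NCPUS);
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		per_cpu_offset[cpu] = ptr - (char *)&counter;
		memcpy(ptr, &counter, size);	/* copy the template */
		ptr += size;
	}

	my_per_cpu(counter, 1) = 100;
	printf("cpu0=%d cpu1=%d\n",
	       my_per_cpu(counter, 0), my_per_cpu(counter, 1));
	return 0;
}

Roughly speaking, the kernel's per_cpu() accessor is the same trick: take the address of the original variable and add that CPU's entry from __per_cpu_offset[]. On to smp_prepare_boot_cpu():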

/* preload SMP state for boot cpu */
void __devinit smp_prepare_boot_cpu(void)
{
	/*
	 * This assumes that bootup is always handled by the processor
	 * with the logic and physical number 0.
	 */
	__cpu_number_map[0] = 0;
	__cpu_logical_map[0] = 0;
	cpu_set(0, phys_cpu_present_map);
	cpu_set(0, cpu_online_map);
	cpu_set(0, cpu_callin_map);
}

I expected the other CPUs to be brought up at last, but this only initializes the flags for CPU 0.

	/*
	 * Set up the scheduler prior starting any interrupts (such as the
	 * timer interrupt). Full topology setup happens at smp_init()
	 * time - but meanwhile we still have a functioning scheduler.
	 */
	sched_init();
	/*
	 * Disable preemption - early bootup scheduling is extremely
	 * fragile until we cpu_idle() for the first time.
	 */
	preempt_disable();
	build_all_zonelists();
	page_alloc_init();
	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
	parse_early_param();
	parse_args("Booting kernel", static_command_line, __start___param,
		   __stop___param - __start___param,
		   &unknown_bootoption);
	if (!irqs_disabled()) {
		printk(KERN_WARNING "start_kernel(): bug: interrupts were "
				"enabled *very* early, fixing it\n");
		local_irq_disable();
	}
	sort_main_extable();
	trap_init();
	rcu_init();
	init_IRQ();
	pidhash_init();
	init_timers();
	hrtimers_init();
	softirq_init();
	timekeeping_init();
	time_init();
	profile_init();
	if (!irqs_disabled())
		printk("start_kernel(): bug: interrupts were enabled early\n");
	early_boot_irqs_on();
	local_irq_enable();

	/*
	 * HACK ALERT! This is early. We're enabling the console before
	 * we've done PCI setups etc, and console_init() must be aware of
	 * this. But we do want output early, in case something goes wrong.
	 */
	console_init();
	if (panic_later)
		panic(panic_later, panic_param);

	lockdep_info();

	/*
	 * Need to run this when irqs are enabled, because it wants
	 * to self-test [hard/soft]-irqs on/off lock inversion bugs
	 * too:
	 */
	locking_selftest();

#ifdef CONFIG_BLK_DEV_INITRD
	if (initrd_start && !initrd_below_start_ok &&
			initrd_start < min_low_pfn << PAGE_SHIFT) {
		printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
		    "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
		initrd_start = 0;
	}
#endif
	vfs_caches_init_early();
	cpuset_init_early();
	mem_init();
	kmem_cache_init();
	setup_per_cpu_pageset();
	numa_policy_init();
	if (late_time_init)
		late_time_init();
	calibrate_delay();
	pidmap_init();
	pgtable_cache_init();
	prio_tree_init();
	anon_vma_init();
#ifdef CONFIG_X86
	if (efi_enabled)
		efi_enter_virtual_mode();
#endif
	fork_init(num_physpages);
	proc_caches_init();
	buffer_init();
	unnamed_dev_init();
	key_init();
	security_init();
	vfs_caches_init(num_physpages);
	radix_tree_init();
	signals_init();
	/* rootfs populating might need page-writeback */
	page_writeback_init();
#ifdef CONFIG_PROC_FS
	proc_root_init();
#endif
	cpuset_init();
	taskstats_init_early();
	delayacct_init();

	check_bugs();

	acpi_early_init(); /* before LAPIC and SMP init */

	/* Do the rest non-__init'ed, we're now alive */
	rest_init();
}

Following it all the way down, we finally arrive at rest_init().

/*
 * We need to finalize in a non-__init function or else race conditions
 * between the root thread and the init thread may cause start_kernel to
 * be reaped by free_initmem before the root thread has proceeded to
 * cpu_idle.
 *
 * gcc-3.4 accidentally inlines this function, so use noinline.
 */

static void noinline rest_init(void)
	__releases(kernel_lock)
{
	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
	numa_default_policy();
	unlock_kernel();

	/*
	 * The boot idle thread must execute schedule()
	 * at least one to get things moving:
	 */
	preempt_enable_no_resched();
	schedule();
	preempt_disable();

	/* Call into cpu_idle with preempt disabled */
	cpu_idle();
} 

It starts init() as a kernel thread.

static int __init init(void * unused)
{
	lock_kernel();
	/*
	 * init can run on any cpu.
	 */
	set_cpus_allowed(current, CPU_MASK_ALL);
	/*
	 * Tell the world that we're going to be the grim
	 * reaper of innocent orphaned children.
	 *
	 * We don't want people to have to make incorrect
	 * assumptions about where in the task array this
	 * can be found.
	 */
	init_pid_ns.child_reaper = current;

	cad_pid = task_pid(current);

	smp_prepare_cpus(max_cpus);

	do_pre_smp_initcalls();

	smp_init();
	sched_init_smp();

	cpuset_init_smp();

	do_basic_setup();

	/*
	 * check if there is an early userspace init.  If yes, let it do all
	 * the work
	 */

	if (!ramdisk_execute_command)
		ramdisk_execute_command = "/init";

	if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
		ramdisk_execute_command = NULL;
		prepare_namespace();
	}

	/*
	 * Ok, we have completed the initial bootup, and
	 * we're essentially up and running. Get rid of the
	 * initmem segments and start the user-mode stuff..
	 */
	init_post();
	return 0;
}

It looks like all of the SMP initialization is done here.
Let's go through it in order.

/* called from main before smp_init() */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
	init_new_context(current, &init_mm);
	current_thread_info()->cpu = 0;
	smp_tune_scheduling();
	plat_prepare_cpus(max_cpus);
#ifndef CONFIG_HOTPLUG_CPU
	cpu_present_map = cpu_possible_map;
#endif
}

smp_prepare_cpus() apparently hands the work off to the board-specific plat_prepare_cpus().

void __init plat_prepare_cpus(unsigned int max_cpus)
{
}

...except that, when you actually look at it, it does nothing at all.
Next.

static void __init do_pre_smp_initcalls(void)
{
	extern int spawn_ksoftirqd(void);
#ifdef CONFIG_SMP
	extern int migration_init(void);

	migration_init();
#endif
	spawn_ksoftirqd();
	spawn_softlockup_task();
}

This is machine-independent code: presumably the setup of the SMP scheduler's migration threads, ksoftirqd, and the softlockup watchdog.
Next.

/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
	unsigned int cpu;
	unsigned highest = 0;

	for_each_cpu_mask(cpu, cpu_possible_map)
		highest = cpu;
	nr_cpu_ids = highest + 1;

	/* FIXME: This should be done in userspace --RR */
	for_each_present_cpu(cpu) {
		if (num_online_cpus() >= max_cpus)
			break;
		if (!cpu_online(cpu))
			cpu_up(cpu);
	}

	/* Any cleanup work */
	printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
	smp_cpus_done(max_cpus);
}

#endif

It walks the present CPUs and calls cpu_up() on every one that is not yet online.

int __cpuinit cpu_up(unsigned int cpu)
{
	int err = 0;

	mutex_lock(&cpu_add_remove_lock);
	if (cpu_hotplug_disabled)
		err = -EBUSY;
	else
		err = _cpu_up(cpu);

	mutex_unlock(&cpu_add_remove_lock);
	return err;
}
/* Requires cpu_add_remove_lock to be held */
static int __cpuinit _cpu_up(unsigned int cpu)
{
	int ret;
	void *hcpu = (void *)(long)cpu;

	if (cpu_online(cpu) || !cpu_present(cpu))
		return -EINVAL;

	ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
	if (ret == NOTIFY_BAD) {
		printk("%s: attempt to bring up CPU %u failed\n",
				__FUNCTION__, cpu);
		ret = -EINVAL;
		goto out_notify;
	}

	/* Arch-specific enabling code. */
	mutex_lock(&cpu_bitmask_lock);
	ret = __cpu_up(cpu);
	mutex_unlock(&cpu_bitmask_lock);
	if (ret != 0)
		goto out_notify;
	BUG_ON(!cpu_online(cpu));

	/* Now call notifier in preparation. */
	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);

out_notify:
	if (ret != 0)
		raw_notifier_call_chain(&cpu_chain,
				CPU_UP_CANCELED, hcpu);

	return ret;
}

The function names are something else... cpu_up() takes a mutex and calls _cpu_up(), which takes yet another mutex and calls __cpu_up().
From __cpu_up() onward we are in architecture-specific code.

/*
 * Called once for each "cpu_possible(cpu)".  Needs to spin up the cpu
 * and keep control until "cpu_online(cpu)" is set.  Note: cpu is
 * physical, not logical.
 */
int __cpuinit __cpu_up(unsigned int cpu)
{
	struct task_struct *idle;

	/*
	 * Processor goes to start_secondary(), sets online flag
	 * The following code is purely to make sure
	 * Linux can schedule processes on this slave.
	 */
	idle = fork_idle(cpu);
	if (IS_ERR(idle))
		panic(KERN_ERR "Fork failed for CPU %d", cpu);

	prom_boot_secondary(cpu, idle);

	/*
	 * Trust is futile.  We should really have timeouts ...
	 */
	while (!cpu_isset(cpu, cpu_callin_map))
		udelay(100);

	cpu_set(cpu, cpu_online_map);

	return 0;
}

In the end this one also delegates most of the work to board-specific code: it calls prom_boot_secondary() and then spins until the new CPU announces itself in cpu_callin_map.

/*
 * Setup the PC, SP, and GP of a secondary processor and start it
 * running!
 */
void prom_boot_secondary(int cpu, struct task_struct *idle)
{
	int retval;

	retval = cfe_cpu_start(cpu_logical_map(cpu), &smp_bootstrap,
			       __KSTK_TOS(idle),
			       (unsigned long)task_thread_info(idle), 0);
	if (retval != 0)
		printk("cfe_start_cpu(%i) returned %i\n" , cpu, retval);
}

It brings the CPU up by calling cfe_cpu_start(). The &smp_bootstrap passed in becomes the secondary CPU's start address, with sp set to the top of the idle task's kernel stack and gp to its thread_info.

#if defined(CFE_API_cpu_start) || defined(CFE_API_ALL)
int cfe_cpu_start(int cpu, void (*fn) (void), long sp, long gp, long a1)
{
	cfe_xiocb_t xiocb;

	xiocb.xiocb_fcode = CFE_CMD_FW_CPUCTL;
	xiocb.xiocb_status = 0;
	xiocb.xiocb_handle = 0;
	xiocb.xiocb_flags = 0;
	xiocb.xiocb_psize = sizeof(xiocb_cpuctl_t);
	xiocb.plist.xiocb_cpuctl.cpu_number = cpu;
	xiocb.plist.xiocb_cpuctl.cpu_command = CFE_CPU_CMD_START;
	xiocb.plist.xiocb_cpuctl.gp_val = gp;
	xiocb.plist.xiocb_cpuctl.sp_val = sp;
	xiocb.plist.xiocb_cpuctl.a1_val = a1;
	xiocb.plist.xiocb_cpuctl.start_addr = (long) fn;

	cfe_iocb_dispatch(&xiocb);

	return xiocb.xiocb_status;
}
#endif				/* CFE_API_cpu_start || CFE_API_ALL */

This is cfe_cpu_start(). Like cfe_cpu_stop(), it is just a wrapper around cfe_iocb_dispatch().

#ifdef CONFIG_SMP
/*
 * SMP slave cpus entry point.  Board specific code for bootstrap calls this
 * function after setting up the stack and gp registers.
 */
NESTED(smp_bootstrap, 16, sp)
#ifdef CONFIG_MIPS_MT_SMTC
	/*
	 * Read-modify-writes of Status must be atomic, and this
	 * is one case where CLI is invoked without EXL being
	 * necessarily set. The CLI and setup_c0_status will
	 * in fact be redundant for all but the first TC of
	 * each VPE being booted.
	 */
	DMT	10	# dmt t2 /* t0, t1 are used by CLI and setup_c0_status() */
	jal	mips_ihb
#endif /* CONFIG_MIPS_MT_SMTC */
	setup_c0_status_sec
	smp_slave_setup
#ifdef CONFIG_MIPS_MT_SMTC
	andi	t2, t2, VPECONTROL_TE
	beqz	t2, 2f
	EMT		# emt
2:
#endif /* CONFIG_MIPS_MT_SMTC */
	j	start_secondary
	END(smp_bootstrap)
#endif /* CONFIG_SMP */

	__FINIT

So this is where the secondary CPUs start executing.

#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_MIPS_MT)

/*
 * MIPS32R2 Instruction Hazard Barrier - must be called
 *
 * For C code use the inline version named instruction_hazard().
 */
LEAF(mips_ihb)
	.set	mips32r2
	jr.hb	ra
	nop
	END(mips_ihb)

What on earth is this?
It is, in fact, the whole trick: jr.hb is MIPS32R2's "jump register with hazard barrier", so calling mips_ihb and returning through jr.hb acts as an instruction hazard barrier.

/*
 * Do SMP slave processor setup necessary before we can savely execute C code.
 */
	.macro	smp_slave_setup
	.endm

For some reason it's a completely empty macro; apparently nothing extra is needed here before C code can run on this platform.

/*
 * First C code run on the secondary CPUs after being started up by
 * the master.
 */
asmlinkage void start_secondary(void)
{
	unsigned int cpu;

#ifdef CONFIG_MIPS_MT_SMTC
	/* Only do cpu_probe for first TC of CPU */
	if ((read_c0_tcbind() & TCBIND_CURTC) == 0)
#endif /* CONFIG_MIPS_MT_SMTC */
	cpu_probe();
	cpu_report();
	per_cpu_trap_init();
	prom_init_secondary();

	/*
	 * XXX parity protection should be folded in here when it's converted
	 * to an option instead of something based on .cputype
	 */

	calibrate_delay();
	preempt_disable();
	cpu = smp_processor_id();
	cpu_data[cpu].udelay_val = loops_per_jiffy;

	prom_smp_finish();

	cpu_set(cpu, cpu_callin_map);

	cpu_idle();
}

This is the C-side initialization for a secondary CPU.
It probes the CPU, initializes the traps, runs the platform-specific init, calibrates the delay loop, and then sets its bit in cpu_callin_map, which is what releases the boot CPU from the wait loop in __cpu_up(). Finally it runs cpu_idle() as this CPU's idle thread.

/*
 * Code to run on secondary just after probing the CPU
 */
void prom_init_secondary(void)
{
#if defined(CONFIG_SIBYTE_BCM1x55) || defined(CONFIG_SIBYTE_BCM1x80)
	extern void bcm1480_smp_init(void);
	bcm1480_smp_init();
#elif defined(CONFIG_SIBYTE_SB1250)
	extern void sb1250_smp_init(void);
	sb1250_smp_init();
#else
#error invalid SMP configuration
#endif
}

The CFE version of prom_init_secondary() turns out to call further chip-specific code.

/*
 * SMP init and finish on secondary CPUs
 */
void sb1250_smp_init(void)
{
	unsigned int imask = STATUSF_IP4 | STATUSF_IP3 | STATUSF_IP2 |
		STATUSF_IP1 | STATUSF_IP0;

	/* Set interrupt mask, but don't enable */
	change_c0_status(ST0_IM, imask);
}

After coming all this way I expected something difficult, but it merely programs the interrupt mask bits in the CP0 Status register, without actually enabling interrupts yet.
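As far as I understand the helpers generated in asm/mipsregs.h, change_c0_status(mask, bits) is a read-modify-write that replaces only the masked bits of CP0 Status. A rough stand-alone paraphrase, not the real macro-generated code:

/* what change_c0_status(ST0_IM, imask) boils down to, conceptually:
 * clear the IM field of Status, then set the bits requested in imask.
 * The IE bit is outside ST0_IM, so interrupts stay disabled for now. */
static unsigned int change_status(unsigned int status,
				  unsigned int mask, unsigned int bits)
{
	status &= ~mask;
	status |= (bits & mask);
	return status;
}

Finally, prom_smp_finish():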

/*
 * Do any tidying up before marking online and running the idle
 * loop
 */
void prom_smp_finish(void)
{
#if defined(CONFIG_SIBYTE_BCM1x55) || defined(CONFIG_SIBYTE_BCM1x80)
	extern void bcm1480_smp_finish(void);
	bcm1480_smp_finish();
#elif defined(CONFIG_SIBYTE_SB1250)
	extern void sb1250_smp_finish(void);
	sb1250_smp_finish();
#else
#error invalid SMP configuration
#endif
}
void sb1250_smp_finish(void)
{
	extern void sb1250_time_init(void);
	sb1250_time_init();
	local_irq_enable();
}

This one initializes the per-CPU timer via sb1250_time_init() and then enables local interrupts.