Linux 2.6/mipsのSMP実装について
linuxは全然知らんので、カーネルの起動シーケンスから確認。
まず、kernel_entryから起動する。
j start_kernel END(kernel_entry)
kernel_entryからstart_kernel()が呼ばれる。
asmlinkage void __init start_kernel(void) { char * command_line; extern struct kernel_param __start___param[], __stop___param[]; smp_setup_processor_id();
いきなりsmp_setup_processor_id()とかあるが、mipsでは定義されていないようだ。
/* * Need to run as early as possible, to initialize the * lockdep hash: */ unwind_init(); lockdep_init(); local_irq_disable(); early_boot_irqs_off(); early_init_irq_lock_class(); /* * Interrupts are still disabled. Do necessary setups, then * enable them */ lock_kernel(); tick_init(); boot_cpu_init(); page_address_init(); printk(KERN_NOTICE); printk(linux_banner); setup_arch(&command_line);
もう少し進めてみていくと、アーキテクチャ毎の初期化コードを呼ぶ部分があった。
/*
 * Architecture-specific boot-time setup.
 *
 * Runs cpu_probe() and prom_init(), optionally brings up early printk,
 * reports the CPU, selects the default console switch, then calls
 * arch_mem_init() and resource_init().  On CONFIG_SMP kernels it finishes
 * by calling the platform hook plat_smp_setup().
 *
 * @cmdline_p: passed through unchanged to arch_mem_init().
 */
void __init setup_arch(char **cmdline_p)
{
	cpu_probe();
	prom_init();

#ifdef CONFIG_EARLY_PRINTK
	{
		extern void setup_early_printk(void);

		setup_early_printk();
	}
#endif
	cpu_report();

	/* Console selection: the VGA console wins over the dummy console. */
#if defined(CONFIG_VT) && defined(CONFIG_VGA_CONSOLE)
	conswitchp = &vga_con;
#elif defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE)
	conswitchp = &dummy_con;
#endif

	arch_mem_init(cmdline_p);
	resource_init();

#ifdef CONFIG_SMP
	plat_smp_setup();
#endif
}
setup_arch()から、ボード毎に固有のSMP初期化関数であるplat_smp_setup()が呼ばれている。
ここではsibyteのcfeを見ていく事にする。
/*
 * Common SMP setup, run before any secondary CPU is started.
 *
 * Uses CFE to find out which CPUs are available and records the result in
 * phys_cpu_present_map and in the logical/physical CPU number tables.
 * XXXKW will the boot CPU ever not be physical 0?
 */
void __init plat_smp_setup(void)
{
	int phys;
	int nr_secondary = 0;

	/* The boot CPU is recorded as physical 0 == logical 0. */
	cpus_clear(phys_cpu_present_map);
	cpu_set(0, phys_cpu_present_map);
	__cpu_number_map[0] = 0;
	__cpu_logical_map[0] = 0;

	for (phys = 1; phys < NR_CPUS; phys++) {
		/* Only a zero status from the stop request counts as "present". */
		if (cfe_cpu_stop(phys) != 0)
			continue;

		nr_secondary++;
		cpu_set(phys, phys_cpu_present_map);
		__cpu_number_map[phys] = nr_secondary;
		__cpu_logical_map[nr_secondary] = phys;
	}

	printk(KERN_INFO "Detected %i available secondary CPU(s)\n",
	       nr_secondary);
}
cpus_clear(), cpu_set()というのは、phys_cpu_present_mapにCPUが存在するというフラグを立てる(或いは消す)マクロらしい。
cfe_cpu_stop()は指定されたCPUを停止するようファームウェアに指示する関数だと思うんだが、なぜ今ここで止めるのか?
恐らく、返り値でそのCPUが存在するかどうかが判別出来るから、というのが答えだと思われる。
#if defined(CFE_API_cpu_stop) || defined(CFE_API_ALL)
/*
 * Send a CFE_CPU_CMD_STOP CPU-control request for @cpu to the firmware.
 *
 * Returns the xiocb_status field as found after cfe_iocb_dispatch().
 */
int cfe_cpu_stop(int cpu)
{
	cfe_xiocb_t req;

	/* Request header. */
	req.xiocb_fcode = CFE_CMD_FW_CPUCTL;
	req.xiocb_status = 0;
	req.xiocb_handle = 0;
	req.xiocb_flags = 0;
	req.xiocb_psize = sizeof(xiocb_cpuctl_t);

	/* CPU-control parameters. */
	req.plist.xiocb_cpuctl.cpu_number = cpu;
	req.plist.xiocb_cpuctl.cpu_command = CFE_CPU_CMD_STOP;

	cfe_iocb_dispatch(&req);

	return req.xiocb_status;
}
#endif /* CFE_API_cpu_stop || CFE_API_ALL */
これがcfe_cpu_stop()。関数とその引数を設定してcfe_iocb_dispatch()を呼び、返り値を受け取って返すラッパー関数。
setup_command_line(command_line);
unwind_setup();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
再びstart_kernel()を見ていく。
#ifdef __GENERIC_PER_CPU
/* Per-CPU offset table, indexed by CPU number. */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;

EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Give every possible CPU its own copy of the per-CPU section.
 *
 * One bootmem allocation holds all copies.  Each copy is initialised from
 * the range __per_cpu_start..__per_cpu_end, and the distance from
 * __per_cpu_start to the copy is stored in __per_cpu_offset[].
 */
static void __init setup_per_cpu_areas(void)
{
	unsigned long unit, cpu;
	char *area;

	/* Copy section for each CPU (we discard the original) */
	unit = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
#ifdef CONFIG_MODULES
	/* With modules enabled, never go below PERCPU_ENOUGH_ROOM per CPU. */
	if (unit < PERCPU_ENOUGH_ROOM)
		unit = PERCPU_ENOUGH_ROOM;
#endif

	area = alloc_bootmem(unit * num_possible_cpus());

	for_each_possible_cpu(cpu) {
		memcpy(area, __per_cpu_start, __per_cpu_end - __per_cpu_start);
		__per_cpu_offset[cpu] = area - __per_cpu_start;
		area += unit;
	}
}
#endif /* __GENERIC_PER_CPU */
変なマクロとかもあって良く分からないのだが、多分CPU毎の構造体を入れるメモリ領域の確保及び初期化かな?
/*
 * Preload the SMP bookkeeping for the boot CPU.
 *
 * This assumes that bootup is always handled by the processor with the
 * logical and physical number 0.
 */
void __devinit smp_prepare_boot_cpu(void)
{
	/* Identity mapping for CPU 0. */
	__cpu_logical_map[0] = 0;
	__cpu_number_map[0] = 0;

	/* CPU 0 is present, online and has called in. */
	cpu_set(0, phys_cpu_present_map);
	cpu_set(0, cpu_online_map);
	cpu_set(0, cpu_callin_map);
}
いよいよCPUを立ち上げるのかとおもいきや、ここでは単にCPU0のフラグを初期化しただけのようだ。
/* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. */ sched_init(); /* * Disable preemption - early bootup scheduling is extremely * fragile until we cpu_idle() for the first time. */ preempt_disable(); build_all_zonelists(); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); if (!irqs_disabled()) { printk(KERN_WARNING "start_kernel(): bug: interrupts were " "enabled *very* early, fixing it\n"); local_irq_disable(); } sort_main_extable(); trap_init(); rcu_init(); init_IRQ(); pidhash_init(); init_timers(); hrtimers_init(); softirq_init(); timekeeping_init(); time_init(); profile_init(); if (!irqs_disabled()) printk("start_kernel(): bug: interrupts were enabled early\n"); early_boot_irqs_on(); local_irq_enable(); /* * HACK ALERT! This is early. We're enabling the console before * we've done PCI setups etc, and console_init() must be aware of * this. But we do want output early, in case something goes wrong. 
*/ console_init(); if (panic_later) panic(panic_later, panic_param); lockdep_info(); /* * Need to run this when irqs are enabled, because it wants * to self-test [hard/soft]-irqs on/off lock inversion bugs * too: */ locking_selftest(); #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && initrd_start < min_low_pfn << PAGE_SHIFT) { printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); initrd_start = 0; } #endif vfs_caches_init_early(); cpuset_init_early(); mem_init(); kmem_cache_init(); setup_per_cpu_pageset(); numa_policy_init(); if (late_time_init) late_time_init(); calibrate_delay(); pidmap_init(); pgtable_cache_init(); prio_tree_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); #endif fork_init(num_physpages); proc_caches_init(); buffer_init(); unnamed_dev_init(); key_init(); security_init(); vfs_caches_init(num_physpages); radix_tree_init(); signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); #ifdef CONFIG_PROC_FS proc_root_init(); #endif cpuset_init(); taskstats_init_early(); delayacct_init(); check_bugs(); acpi_early_init(); /* before LAPIC and SMP init */ /* Do the rest non-__init'ed, we're now alive */ rest_init(); }
そのままずっと下までみていくと、最後にrest_init()に行き当たる。
/*
 * We need to finalize in a non-__init function or else race conditions
 * between the root thread and the init thread may cause start_kernel to
 * be reaped by free_initmem before the root thread has proceeded to
 * cpu_idle.
 *
 * gcc-3.4 accidentally inlines this function, so use noinline.
 */
static void noinline rest_init(void)
	__releases(kernel_lock)
{
	/* Spawn init() as a kernel thread. */
	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
	numa_default_policy();
	unlock_kernel();

	/*
	 * The boot idle thread must execute schedule()
	 * at least once to get things moving:
	 */
	preempt_enable_no_resched();
	schedule();
	preempt_disable();

	/* Call into cpu_idle with preempt disabled */
	cpu_idle();
}
init()をカーネルスレッドとして起動している。
/*
 * Body of the kernel thread started by rest_init().
 *
 * Prepares and brings up SMP, runs the basic setup, decides whether an
 * early userspace init exists, and finally calls init_post().
 *
 * @unused: not used.
 * Returns 0.
 */
static int __init init(void * unused)
{
	lock_kernel();

	/*
	 * init can run on any cpu.
	 */
	set_cpus_allowed(current, CPU_MASK_ALL);

	/*
	 * Tell the world that we're going to be the grim
	 * reaper of innocent orphaned children.
	 *
	 * We don't want people to have to make incorrect
	 * assumptions about where in the task array this
	 * can be found.
	 */
	init_pid_ns.child_reaper = current;

	cad_pid = task_pid(current);

	/* SMP bring-up, followed by the setup steps that follow it. */
	smp_prepare_cpus(max_cpus);
	do_pre_smp_initcalls();
	smp_init();
	sched_init_smp();
	cpuset_init_smp();
	do_basic_setup();

	/*
	 * check if there is an early userspace init.  If yes, let it do all
	 * the work
	 */
	if (ramdisk_execute_command == NULL)
		ramdisk_execute_command = "/init";

	if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
		ramdisk_execute_command = NULL;
		prepare_namespace();
	}

	/*
	 * Ok, we have completed the initial bootup, and
	 * we're essentially up and running. Get rid of the
	 * initmem segments and start the user-mode stuff..
	 */
	init_post();

	return 0;
}
どうも、SMPの初期化は全部ここでやっているように見える。
順にみていこう。
/*
 * Called from main before smp_init().
 *
 * Sets up the context of the current task, pins its thread_info to CPU 0,
 * tunes the scheduler and hands over to the platform hook.  Without CPU
 * hotplug, every possible CPU is also marked present.
 *
 * @max_cpus: forwarded to plat_prepare_cpus().
 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
	init_new_context(current, &init_mm);
	current_thread_info()->cpu = 0;

	smp_tune_scheduling();
	plat_prepare_cpus(max_cpus);

#ifndef CONFIG_HOTPLUG_CPU
	cpu_present_map = cpu_possible_map;
#endif
}
prepare_cpusはボード依存のコードに丸投げらしい。
/*
 * Platform hook for CPU preparation.  This implementation has nothing to
 * do, so @max_cpus is ignored.
 */
void __init plat_prepare_cpus(unsigned int max_cpus)
{
	/* intentionally empty */
}
が、見てみるとなんと何もしてない。
次。
/*
 * Initialisation calls made before smp_init(): migration_init() on SMP
 * kernels, then spawn_ksoftirqd() and spawn_softlockup_task().
 */
static void __init do_pre_smp_initcalls(void)
{
	extern int spawn_ksoftirqd(void);
#ifdef CONFIG_SMP
	extern int migration_init(void);

	migration_init();
#endif

	spawn_ksoftirqd();
	spawn_softlockup_task();
}
これは機種非依存コード。SMP対応スケジューラやソフト割り込みなどの初期化だろう。
次。
/* Called by boot processor to activate the rest. */ static void __init smp_init(void) { unsigned int cpu; unsigned highest = 0; for_each_cpu_mask(cpu, cpu_possible_map) highest = cpu; nr_cpu_ids = highest + 1; /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= max_cpus) break; if (!cpu_online(cpu)) cpu_up(cpu); } /* Any cleanup work */ printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); smp_cpus_done(max_cpus); } #endif
cpuを順にみていって、!cpu_online()ならcpu_up()をかけている。
/*
 * Bring @cpu up while holding cpu_add_remove_lock.
 *
 * Returns -EBUSY when cpu_hotplug_disabled is set, otherwise whatever
 * _cpu_up() returns.
 */
int __cpuinit cpu_up(unsigned int cpu)
{
	int err;

	mutex_lock(&cpu_add_remove_lock);
	err = cpu_hotplug_disabled ? -EBUSY : _cpu_up(cpu);
	mutex_unlock(&cpu_add_remove_lock);

	return err;
}
/*
 * Bring one CPU online and run the cpu_chain notifiers around it.
 * Requires cpu_add_remove_lock to be held.
 *
 * Returns 0 on success.  Returns -EINVAL if @cpu is already online, is not
 * present, or a CPU_UP_PREPARE notifier answered NOTIFY_BAD.  Otherwise
 * returns the non-zero result of __cpu_up().
 */
static int __cpuinit _cpu_up(unsigned int cpu)
{
	void *cookie = (void *)(long)cpu;
	int ret;

	if (!cpu_present(cpu) || cpu_online(cpu))
		return -EINVAL;

	ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, cookie);
	if (ret == NOTIFY_BAD) {
		printk("%s: attempt to bring up CPU %u failed\n",
		       __FUNCTION__, cpu);
		ret = -EINVAL;
		goto out_notify;
	}

	/* Arch-specific enabling code. */
	mutex_lock(&cpu_bitmask_lock);
	ret = __cpu_up(cpu);
	mutex_unlock(&cpu_bitmask_lock);
	if (ret)
		goto out_notify;
	BUG_ON(!cpu_online(cpu));

	/* The CPU is up: tell the CPU_ONLINE listeners. */
	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, cookie);

out_notify:
	/* Any failure after CPU_UP_PREPARE is reported as CPU_UP_CANCELED. */
	if (ret)
		raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, cookie);
	return ret;
}
なんか関数名がすごいなぁ、、、、cpu_up()はmutexをかけてから_cpu_up()を呼び、更にmutexをかけて__cpu_up()を呼んでいる。
__cpu_up()からが機種依存コード。
/*
 * Called once for each "cpu_possible(cpu)".  Needs to spin up the cpu
 * and keep control until "cpu_online(cpu)" is set.  Note: cpu is
 * physical, not logical.
 *
 * Returns 0 once @cpu has appeared in cpu_callin_map and has been added to
 * cpu_online_map.  A failure to fork the idle task panics rather than
 * returning.
 */
int __cpuinit __cpu_up(unsigned int cpu)
{
	struct task_struct *idle;

	/*
	 * Processor goes to start_secondary(), sets online flag
	 * The following code is purely to make sure
	 * Linux can schedule processes on this slave.
	 */
	idle = fork_idle(cpu);
	if (IS_ERR(idle)) {
		/*
		 * No KERN_* prefix here: the panic text is not the start of
		 * a log line, so a level marker would show up literally in
		 * the middle of the panic message.
		 */
		panic("Fork failed for CPU %d", cpu);
	}

	prom_boot_secondary(cpu, idle);

	/*
	 * Trust is futile.  We should really have timeouts ...
	 */
	while (!cpu_isset(cpu, cpu_callin_map))
		udelay(100);

	cpu_set(cpu, cpu_online_map);

	return 0;
}
結局これもボード固有のコードに丸投げかな?prom_boot_secondary()を呼んでいる。
/*
 * Setup the PC, SP, and GP of a secondary processor and start it
 * running!
 *
 * @cpu:  CPU number; translated with cpu_logical_map() for the CFE call.
 * @idle: idle task for that CPU; supplies the stack top and thread_info
 *        pointer handed to the firmware.
 *
 * A non-zero CFE status is only logged, not returned to the caller.
 */
void prom_boot_secondary(int cpu, struct task_struct *idle)
{
	int retval;

	retval = cfe_cpu_start(cpu_logical_map(cpu), &smp_bootstrap,
			       __KSTK_TOS(idle),
			       (unsigned long)task_thread_info(idle), 0);
	if (retval != 0) {
		/* Name the function that was actually called. */
		printk("cfe_cpu_start(%i) returned %i\n", cpu, retval);
	}
}
cfe_cpu_start()を呼んでCPUを立ち上げている。引数に積んでいる&smp_bootstrapはセカンダリCPUの実行開始アドレス(エントリポイント)になる。
#if defined(CFE_API_cpu_start) || defined(CFE_API_ALL)
/*
 * Send a CFE_CPU_CMD_START CPU-control request to the firmware.
 *
 * @cpu: CPU number placed in the request.
 * @fn:  start address for the CPU.
 * @sp:  value stored in the request's sp_val field.
 * @gp:  value stored in the request's gp_val field.
 * @a1:  value stored in the request's a1_val field.
 *
 * Returns the xiocb_status field as found after cfe_iocb_dispatch().
 */
int cfe_cpu_start(int cpu, void (*fn) (void), long sp, long gp, long a1)
{
	cfe_xiocb_t req;

	/* Request header. */
	req.xiocb_fcode = CFE_CMD_FW_CPUCTL;
	req.xiocb_status = 0;
	req.xiocb_handle = 0;
	req.xiocb_flags = 0;
	req.xiocb_psize = sizeof(xiocb_cpuctl_t);

	/* CPU-control parameters. */
	req.plist.xiocb_cpuctl.cpu_number = cpu;
	req.plist.xiocb_cpuctl.cpu_command = CFE_CPU_CMD_START;
	req.plist.xiocb_cpuctl.start_addr = (long) fn;
	req.plist.xiocb_cpuctl.sp_val = sp;
	req.plist.xiocb_cpuctl.gp_val = gp;
	req.plist.xiocb_cpuctl.a1_val = a1;

	cfe_iocb_dispatch(&req);

	return req.xiocb_status;
}
#endif /* CFE_API_cpu_start || CFE_API_ALL */
cfe_cpu_start()のコード。cfe_cpu_stop()のコードと同じく単なるラッパー関数。
#ifdef CONFIG_SMP
/*
 * Entry point for SMP slave CPUs.  Board-specific bootstrap code calls
 * this after setting up the stack and gp registers.
 */
NESTED(smp_bootstrap, 16, sp)
#ifdef CONFIG_MIPS_MT_SMTC
	/*
	 * Read-modify-writes of Status must be atomic, and this
	 * is one case where CLI is invoked without EXL being
	 * necessarily set. The CLI and setup_c0_status will
	 * in fact be redundant for all but the first TC of
	 * each VPE being booted.
	 */
	DMT	10	# dmt t2 /* t0, t1 are used by CLI and setup_c0_status() */
	jal	mips_ihb
#endif /* CONFIG_MIPS_MT_SMTC */
	setup_c0_status_sec
	smp_slave_setup
#ifdef CONFIG_MIPS_MT_SMTC
	/* Issue EMT only if the TE bit was set in the value saved in t2. */
	andi	t2, t2, VPECONTROL_TE
	beqz	t2, 2f
	EMT		# emt
2:
#endif /* CONFIG_MIPS_MT_SMTC */
	/* Continue in C. */
	j	start_secondary
	END(smp_bootstrap)
#endif /* CONFIG_SMP */

	__FINIT
セカンダリCPUはここから開始するらしい。
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_MIPS_MT)
/*
 * MIPS32R2 Instruction Hazard Barrier - must be called
 *
 * The routine consists only of a return through "jr.hb ra".
 * For C code use the inline version named instruction_hazard().
 */
LEAF(mips_ihb)
	.set	mips32r2
	jr.hb	ra
	nop
	END(mips_ihb)
なんじゃこれわ。
何かのトリックなのは確か。
/*
 * SMP slave processor setup that must happen before C code can be
 * executed safely.  This variant of the macro expands to nothing.
 */
	.macro	smp_slave_setup
	.endm
なぜか全く中身のないマクロ。
/* * First C code run on the secondary CPUs after being started up by * the master. */ asmlinkage void start_secondary(void) { unsigned int cpu; #ifdef CONFIG_MIPS_MT_SMTC /* Only do cpu_probe for first TC of CPU */ if ((read_c0_tcbind() & TCBIND_CURTC) == 0) #endif /* CONFIG_MIPS_MT_SMTC */ cpu_probe(); cpu_report(); per_cpu_trap_init(); prom_init_secondary(); /* * XXX parity protection should be folded in here when it's converted * to an option instead of something based on .cputype */ calibrate_delay(); preempt_disable(); cpu = smp_processor_id(); cpu_data[cpu].udelay_val = loops_per_jiffy; prom_smp_finish(); cpu_set(cpu, cpu_callin_map); cpu_idle(); }
C側の初期化コード。
CPUの認識、トラップの初期化、機種依存初期化処理、スケジューラ上でのCPUの有効化などを行い最後にidleスレッドとしてcpu_idle()を実行している。
/* * Code to run on secondary just after probing the CPU */ void prom_init_secondary(void) { #if defined(CONFIG_SIBYTE_BCM1x55) || defined(CONFIG_SIBYTE_BCM1x80) extern void bcm1480_smp_init(void); bcm1480_smp_init(); #elif defined(CONFIG_SIBYTE_SB1250) extern void sb1250_smp_init(void); sb1250_smp_init(); #else #error invalid SMP configuration #endif }
cfeのprom_init_secondary()は更にチップ毎のコードを呼ぶようになっていた。
/*
 * SMP init and finish on secondary CPUs
 */
void sb1250_smp_init(void)
{
	/* Interrupt lines IP0..IP4. */
	const unsigned int irq_lines = STATUSF_IP0 | STATUSF_IP1 |
				       STATUSF_IP2 | STATUSF_IP3 |
				       STATUSF_IP4;

	/* Set interrupt mask, but don't enable */
	change_c0_status(ST0_IM, irq_lines);
}
ここまで来てどんな難しい事をするのかとおもいきや、単に割り込みマスクを設定しているだけだったり(割り込み自体はまだ有効化していない)。
/* * Do any tidying up before marking online and running the idle * loop */ void prom_smp_finish(void) { #if defined(CONFIG_SIBYTE_BCM1x55) || defined(CONFIG_SIBYTE_BCM1x80) extern void bcm1480_smp_finish(void); bcm1480_smp_finish(); #elif defined(CONFIG_SIBYTE_SB1250) extern void sb1250_smp_finish(void); sb1250_smp_finish(); #else #error invalid SMP configuration #endif }
/*
 * Final per-CPU step on the SB1250: run sb1250_time_init(), then enable
 * local interrupts.
 */
void sb1250_smp_finish(void)
{
	extern void sb1250_time_init(void);

	sb1250_time_init();
	local_irq_enable();
}
こちらは、タイマの初期化と割り込みの有効化(local_irq_enable())をやっている。