NetBSD-current/Linuxのpowerpc実装 - キャッシュまわり

NetBSDではcacheの同期にipiを使っていないのだろうか?と思い、powerpcソースコードを眺めてみたが、それらしい所は見当たらなかった。

以下はpowerpc/ibm4xx/cpu.cのキャッシュ操作部分と思われるコードだが、特にipiは使っていない:

/*
 * These small routines may have to be replaced,
 * if/when we support processors other that the 604.
 */

void
dcache_flush_page(vaddr_t va)
{
	int i;

	if (curcpu()->ci_ci.dcache_line_size)
		for (i = 0; i < PAGE_SIZE;
		     i += curcpu()->ci_ci.dcache_line_size)
			__asm volatile("dcbf %0,%1" : : "r" (va), "r" (i));
	__asm volatile("sync;isync" : : );
}

void
icache_flush_page(vaddr_t va)
{
	int i;

	if (curcpu()->ci_ci.icache_line_size)
		for (i = 0; i < PAGE_SIZE;
		     i += curcpu()->ci_ci.icache_line_size)
			__asm volatile("icbi %0,%1" : : "r" (va), "r" (i));
	__asm volatile("sync;isync" : : );
}

void
dcache_flush(vaddr_t va, vsize_t len)
{
	int i;

	if (len == 0)
		return;

	/* Make sure we flush all cache lines */
	len += va & (curcpu()->ci_ci.dcache_line_size-1);
	if (curcpu()->ci_ci.dcache_line_size)
		for (i = 0; i < len; i += curcpu()->ci_ci.dcache_line_size)
			__asm volatile("dcbf %0,%1" : : "r" (va), "r" (i));
	__asm volatile("sync;isync" : : );
}

void
icache_flush(vaddr_t va, vsize_t len)
{
	int i;

	if (len == 0)
		return;

	/* Make sure we flush all cache lines */
	len += va & (curcpu()->ci_ci.icache_line_size-1);
	if (curcpu()->ci_ci.icache_line_size)
		for (i = 0; i < len; i += curcpu()->ci_ci.icache_line_size)
			__asm volatile("icbi %0,%1" : : "r" (va), "r" (i));
	__asm volatile("sync;isync" : : );
}

そもそもLinuxでもpowerpcのコードでそういう事はしてるんだろうか?と思い、確認してみると、やってなかったりした:

/*
 * Flush instruction cache.
 * This is a no-op on the 601.
 */
_GLOBAL(flush_instruction_cache)
#if defined(CONFIG_8xx)
	isync
	lis	r5, IDC_INVALL@h
	mtspr	SPRN_IC_CST, r5
#elif defined(CONFIG_4xx)
#ifdef CONFIG_403GCX
	li      r3, 512
	mtctr   r3
	lis     r4, KERNELBASE@h
1:	iccci   0, r4
	addi    r4, r4, 16
	bdnz    1b
#else
	lis	r3, KERNELBASE@h
	iccci	0,r3
#endif
#elif CONFIG_FSL_BOOKE
BEGIN_FTR_SECTION
	mfspr   r3,SPRN_L1CSR0
	ori     r3,r3,L1CSR0_CFI|L1CSR0_CLFC
	/* msync; isync recommended here */
	mtspr   SPRN_L1CSR0,r3
	isync
	blr
END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE)
	mfspr	r3,SPRN_L1CSR1
	ori	r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR
	mtspr	SPRN_L1CSR1,r3
#else
	mfspr	r3,SPRN_PVR
	rlwinm	r3,r3,16,16,31
	cmpwi	0,r3,1
	beqlr			/* for 601, do nothing */
	/* 603/604 processor - use invalidate-all bit in HID0 */
	mfspr	r3,SPRN_HID0
	ori	r3,r3,HID0_ICFI
	mtspr	SPRN_HID0,r3
#endif /* CONFIG_8xx/4xx */
	isync
	blr

/*
 * Write any modified data cache blocks out to memory
 * and invalidate the corresponding instruction cache blocks.
 * This is a no-op on the 601.
 *
 * flush_icache_range(unsigned long start, unsigned long stop)
 */
_GLOBAL(__flush_icache_range)
BEGIN_FTR_SECTION
	blr				/* for 601, do nothing */
END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE)
	li	r5,L1_CACHE_BYTES-1
	andc	r3,r3,r5
	subf	r4,r3,r4
	add	r4,r4,r5
	srwi.	r4,r4,L1_CACHE_SHIFT
	beqlr
	mtctr	r4
	mr	r6,r3
1:	dcbst	0,r3
	addi	r3,r3,L1_CACHE_BYTES
	bdnz	1b
	sync				/* wait for dcbst's to get to ram */
	mtctr	r4
2:	icbi	0,r6
	addi	r6,r6,L1_CACHE_BYTES
	bdnz	2b
	sync				/* additional sync needed on g4 */
	isync
	blr
/*
 * Write any modified data cache blocks out to memory.
 * Does not invalidate the corresponding cache lines (especially for
 * any corresponding instruction cache).
 *
 * clean_dcache_range(unsigned long start, unsigned long stop)
 */
_GLOBAL(clean_dcache_range)
	li	r5,L1_CACHE_BYTES-1
	andc	r3,r3,r5
	subf	r4,r3,r4
	add	r4,r4,r5
	srwi.	r4,r4,L1_CACHE_SHIFT
	beqlr
	mtctr	r4

1:	dcbst	0,r3
	addi	r3,r3,L1_CACHE_BYTES
	bdnz	1b
	sync				/* wait for dcbst's to get to ram */
	blr

/*
 * Write any modified data cache blocks out to memory and invalidate them.
 * Does not invalidate the corresponding instruction cache blocks.
 *
 * flush_dcache_range(unsigned long start, unsigned long stop)
 */
_GLOBAL(flush_dcache_range)
	li	r5,L1_CACHE_BYTES-1
	andc	r3,r3,r5
	subf	r4,r3,r4
	add	r4,r4,r5
	srwi.	r4,r4,L1_CACHE_SHIFT
	beqlr
	mtctr	r4

1:	dcbf	0,r3
	addi	r3,r3,L1_CACHE_BYTES
	bdnz	1b
	sync				/* wait for dcbst's to get to ram */
	blr

/*
 * Like above, but invalidate the D-cache.  This is used by the 8xx
 * to invalidate the cache so the PPC core doesn't get stale data
 * from the CPM (no cache snooping here :-).
 *
 * invalidate_dcache_range(unsigned long start, unsigned long stop)
 */
_GLOBAL(invalidate_dcache_range)
	li	r5,L1_CACHE_BYTES-1
	andc	r3,r3,r5
	subf	r4,r3,r4
	add	r4,r4,r5
	srwi.	r4,r4,L1_CACHE_SHIFT
	beqlr
	mtctr	r4

1:	dcbi	0,r3
	addi	r3,r3,L1_CACHE_BYTES
	bdnz	1b
	sync				/* wait for dcbi's to get to ram */
	blr

/*
 * Flush a particular page from the data cache to RAM.
 * Note: this is necessary because the instruction cache does *not*
 * snoop from the data cache.
 * This is a no-op on the 601 which has a unified cache.
 *
 *	void __flush_dcache_icache(void *page)
 */
_GLOBAL(__flush_dcache_icache)
BEGIN_FTR_SECTION
	blr					/* for 601, do nothing */
END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE)
	rlwinm	r3,r3,0,0,19			/* Get page base address */
	li	r4,4096/L1_CACHE_BYTES	/* Number of lines in a page */
	mtctr	r4
	mr	r6,r3
0:	dcbst	0,r3				/* Write line to ram */
	addi	r3,r3,L1_CACHE_BYTES
	bdnz	0b
	sync
	mtctr	r4
1:	icbi	0,r6
	addi	r6,r6,L1_CACHE_BYTES
	bdnz	1b
	sync
	isync
	blr

/*
 * Flush a particular page from the data cache to RAM, identified
 * by its physical address.  We turn off the MMU so we can just use
 * the physical address (this may be a highmem page without a kernel
 * mapping).
 *
 *	void __flush_dcache_icache_phys(unsigned long physaddr)
 */
_GLOBAL(__flush_dcache_icache_phys)
BEGIN_FTR_SECTION
	blr					/* for 601, do nothing */
END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE)
	mfmsr	r10
	rlwinm	r0,r10,0,28,26			/* clear DR */
	mtmsr	r0
	isync
	rlwinm	r3,r3,0,0,19			/* Get page base address */
	li	r4,4096/L1_CACHE_BYTES	/* Number of lines in a page */
	mtctr	r4
	mr	r6,r3
0:	dcbst	0,r3				/* Write line to ram */
	addi	r3,r3,L1_CACHE_BYTES
	bdnz	0b
	sync
	mtctr	r4
1:	icbi	0,r6
	addi	r6,r6,L1_CACHE_BYTES
	bdnz	1b
	sync
	mtmsr	r10				/* restore DR */
	isync
	blr

ここらへんの書き方はCPUのキャッシュ機構の構造に依存するんだろう。

とはいえ、timebaseの同期など別の用途においてはきちんと使われていた(NetBSDの話。):

void
md_start_timebase(volatile struct cpu_hatch_data *h)
{
	int i;
#ifdef OPENPIC
	if (!openpic_base) {
#endif
		/*
		 * wait for secondary spin up (1.5ms @ 604/200MHz)
		 * XXX we cannot use delay() here because timebase is not
		 * running.
		 */
		for (i = 0; i < 100000; i++)
			if (h->running)
				break;

		/* Start timebase. */
		out32(0xf2800000, 0x100);
		ppc_send_ipi(1, PPC_IPI_NOMESG);
#ifdef OPENPIC
	}
#endif
}