[PATCH v1 0/5] mips: Improve initial Octeon MIPS64 support

This patchset improves the Octeon base support by adding a very early copy of the U-Boot image from bootspace (e.g. parallel NOR CFI flash) to the L2 cache and continuing execution there.
This is done by introducing the optional hook mips_mach_early_init() to start.S for very early machine specific code. This code can be selected via the new Kconfig symbol CONFIG_MIPS_MACH_EARLY_INIT.
Additionally, the common invalidate_dcache_range() implementation is also changed to a weak function, allowing Octeon to add its own no-op cache function (Octeon is cache coherent).
This patchset is based on the base Octeon patchset in v3.
Thanks, Stefan
Stefan Roese (5): mips: Add CONFIG_MIPS_MACH_EARLY_INIT for very early mach init code mips: octeon: use mips_mach_early_init() to copy to L2 cache mips: octeon: octeon_ebb7304: Change TEXT_BASE to L2 cache mips: cache: Make invalidate_dcache_range() weak to enable overwrite mips: octeon: Add empty invalidate_dcache_range()
arch/mips/Kconfig | 10 +++++ arch/mips/cpu/start.S | 5 +++ arch/mips/lib/cache.c | 2 +- arch/mips/mach-octeon/cache.c | 4 ++ arch/mips/mach-octeon/lowlevel_init.S | 56 +++++++++++++++++++++++++++ configs/octeon_ebb7304_defconfig | 3 +- 6 files changed, 78 insertions(+), 2 deletions(-)

This patch adds the optional call to mips_mach_early_init() to start.S at a very early stage. It's disabled by default. It can be used for very early machine / platform specific init code. It's called very early, and at this stage the PC is allowed to differ from the linking address (CONFIG_TEXT_BASE), as no absolute jump has been performed until this call.
It will be used by the Octeon platform.
Signed-off-by: Stefan Roese sr@denx.de ---
arch/mips/Kconfig | 9 +++++++++ arch/mips/cpu/start.S | 5 +++++ 2 files changed, 14 insertions(+)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index dd56da6dae..327fd4848a 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -296,6 +296,15 @@ config MIPS_CACHE_INDEX_BASE Normally this is CKSEG0. If the MIPS system needs to move this block to some SRAM or ScratchPad RAM, adapt this option accordingly.
+config MIPS_MACH_EARLY_INIT + bool "Enable mach specific very early init code" + help + Use this to enable the call to mips_mach_early_init() very early + from start.S. This function can be used e.g. to do some very early + CPU / SoC intitialization or image copying. Its called very early + and at this stage the PC might not match the linking address + (CONFIG_TEXT_BASE) - no absolute jump done until this call. + config MIPS_CACHE_SETUP bool "Enable startup code to initialize and setup caches" default n if SKIP_LOWLEVEL_INIT diff --git a/arch/mips/cpu/start.S b/arch/mips/cpu/start.S index 08dddbdf5f..a7190ec3b2 100644 --- a/arch/mips/cpu/start.S +++ b/arch/mips/cpu/start.S @@ -195,6 +195,11 @@ wr_done: /* Clear timer interrupt (CP0_COUNT cleared on branch to 'reset') */ mtc0 zero, CP0_COMPARE
+#ifdef CONFIG_MIPS_MACH_EARLY_INIT + bal mips_mach_early_init + nop +#endif + #ifdef CONFIG_MIPS_CACHE_SETUP /* Disable caches */ PTR_LA t9, mips_cache_disable

This patch adds the optional call to mips_mach_early_init() to start.S at a very early stage. It's disabled by default. It can be used for very early machine / platform specific init code. It's called very early, and at this stage the PC is allowed to differ from the linking address (CONFIG_TEXT_BASE), as no absolute jump has been performed until this call.
It will be used by the Octeon platform.
Signed-off-by: Stefan Roese sr@denx.de
arch/mips/Kconfig | 9 +++++++++ arch/mips/cpu/start.S | 5 +++++ 2 files changed, 14 insertions(+)
Reviewed-by: Daniel Schwierzeck daniel.schwierzeck@gmail.com

This patch adds the code to copy itself from the bootrom location to a different location (TEXT_BASE) to the Octeon platform. It's used in this case to copy the complete U-Boot image into L2 cache, which greatly improves the bootup time - especially in regard to the very long and complex DDR4 init code.
The Kconfig symbol CONFIG_MIPS_MACH_EARLY_INIT is enabled with this patch for Octeon.
Signed-off-by: Stefan Roese sr@denx.de ---
arch/mips/Kconfig | 1 + arch/mips/mach-octeon/lowlevel_init.S | 56 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 327fd4848a..bcf6f26457 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -114,6 +114,7 @@ config ARCH_OCTEON select DM select DM_SERIAL select MIPS_L2_CACHE + select MIPS_MACH_EARLY_INIT select MIPS_TUNE_OCTEON3 select ROM_EXCEPTION_VECTORS select SUPPORTS_BIG_ENDIAN diff --git a/arch/mips/mach-octeon/lowlevel_init.S b/arch/mips/mach-octeon/lowlevel_init.S index d9aab38cde..2d9257b1ed 100644 --- a/arch/mips/mach-octeon/lowlevel_init.S +++ b/arch/mips/mach-octeon/lowlevel_init.S @@ -17,3 +17,59 @@ LEAF(lowlevel_init) jr ra nop END(lowlevel_init) + +LEAF(mips_mach_early_init) + + move s0, ra + + bal __dummy + nop + +__dummy: + /* Get the actual address that we are running at */ + PTR_LA a6, _start /* Linked address of _start */ + PTR_LA a7, __dummy + dsubu t1, a7, a6 /* offset of __dummy label from _start*/ + dsubu t0, ra, t1 /* t0 now has actual address of _start*/ + + PTR_LI t1, CONFIG_SYS_TEXT_BASE + + /* Calculate end address of copy loop */ + PTR_LI s5, CONFIG_BOARD_SIZE_LIMIT + daddu t2, s5, t0 /* t2 = end address */ + daddiu t2, t2, 127 + ins t2, zero, 0, 7 /* Round up to cache line for memcpy */ + + /* Copy ourself to the L2 cache from flash, 32 bytes at a time */ +1: + ld a0, 0(t0) + ld a1, 8(t0) + ld a2, 16(t0) + ld a3, 24(t0) + sd a0, 0(t1) + sd a1, 8(t1) + sd a2, 16(t1) + sd a3, 24(t1) + addiu t0, 32 + bne t0, t2, 1b + addiu t1, 32 + + sync + synci 0(zero) + + PTR_LA t9, uboot_in_cache + j t9 + nop + +uboot_in_cache: + + /* + * Return to start.S now running from TEXT_BASE, which points + * to DRAM address space, which effectively is L2 cache now. + * This speeds up the init process extremely, especially the + * DDR init code. + */ + jr s0 + nop + + END(mips_mach_early_init)

This patch adds the code to copy itself from the bootrom location to a different location (TEXT_BASE) to the Octeon platform. It's used in this case to copy the complete U-Boot image into L2 cache, which greatly improves the bootup time - especially in regard to the very long and complex DDR4 init code.
The Kconfig symbol CONFIG_MIPS_MACH_EARLY_INIT is enabled with this patch for Octeon.
Signed-off-by: Stefan Roese sr@denx.de
arch/mips/Kconfig | 1 + arch/mips/mach-octeon/lowlevel_init.S | 56 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 327fd4848a..bcf6f26457 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -114,6 +114,7 @@ config ARCH_OCTEON select DM select DM_SERIAL select MIPS_L2_CACHE
- select MIPS_MACH_EARLY_INIT select MIPS_TUNE_OCTEON3 select ROM_EXCEPTION_VECTORS select SUPPORTS_BIG_ENDIAN
diff --git a/arch/mips/mach-octeon/lowlevel_init.S b/arch/mips/mach-octeon/lowlevel_init.S index d9aab38cde..2d9257b1ed 100644 --- a/arch/mips/mach-octeon/lowlevel_init.S +++ b/arch/mips/mach-octeon/lowlevel_init.S @@ -17,3 +17,59 @@ LEAF(lowlevel_init) jr ra nop END(lowlevel_init)
+LEAF(mips_mach_early_init)
move s0, ra
indentation
- bal __dummy
nop
+__dummy:
- /* Get the actual address that we are running at */
- PTR_LA a6, _start /* Linked address of _start */
- PTR_LA a7, __dummy
- dsubu t1, a7, a6 /* offset of __dummy label from _start*/
- dsubu t0, ra, t1 /* t0 now has actual address of _start*/
including _start in the calculation makes it a little bit hard to understand. Wouldn't it be enough to just calculate the difference between a7 and ra to get the relocation offset? You can use that offset later to calculate the destination address based on _start.
- PTR_LI t1, CONFIG_SYS_TEXT_BASE
isn't that the same address as _start?
- /* Calculate end address of copy loop */
- PTR_LI s5, CONFIG_BOARD_SIZE_LIMIT
couldn't you use __image_copy_end to get the real binary size without BSS? Or _end if you need to copy the relocation table as well.
- daddu t2, s5, t0 /* t2 = end address */
- daddiu t2, t2, 127
- ins t2, zero, 0, 7 /* Round up to cache line for memcpy */
- /* Copy ourself to the L2 cache from flash, 32 bytes at a time */
+1:
- ld a0, 0(t0)
- ld a1, 8(t0)
- ld a2, 16(t0)
- ld a3, 24(t0)
- sd a0, 0(t1)
- sd a1, 8(t1)
- sd a2, 16(t1)
- sd a3, 24(t1)
- addiu t0, 32
- bne t0, t2, 1b
- addiu t1, 32
the instruction in the delay slot should be indented by an extra space character
- sync
- synci 0(zero)
- PTR_LA t9, uboot_in_cache
- j t9
nop
Why the extra jump? If you have the relocation offset as suggested above, you could simply add that to s0 and do one jr instruction.
Also, instead of synci you could use jr.hb to automatically add an instruction hazard barrier during the jump (if that's implemented on Octeon).
+uboot_in_cache:
- /*
* Return to start.S now running from TEXT_BASE, which points
* to DRAM address space, which effectively is L2 cache now.
* This speeds up the init process extremely, especially the
* DDR init code.
*/
- jr s0
nop
- END(mips_mach_early_init)

On 30.06.20 00:08, Daniel Schwierzeck wrote:
This patch adds the code to copy itself from the bootrom location to a different location (TEXT_BASE) to the Octeon platform. It's used in this case to copy the complete U-Boot image into L2 cache, which greatly improves the bootup time - especially in regard to the very long and complex DDR4 init code.
The Kconfig symbol CONFIG_MIPS_MACH_EARLY_INIT is enabled with this patch for Octeon.
Signed-off-by: Stefan Roese sr@denx.de
arch/mips/Kconfig | 1 + arch/mips/mach-octeon/lowlevel_init.S | 56 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 327fd4848a..bcf6f26457 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -114,6 +114,7 @@ config ARCH_OCTEON select DM select DM_SERIAL select MIPS_L2_CACHE
- select MIPS_MACH_EARLY_INIT select MIPS_TUNE_OCTEON3 select ROM_EXCEPTION_VECTORS select SUPPORTS_BIG_ENDIAN
diff --git a/arch/mips/mach-octeon/lowlevel_init.S b/arch/mips/mach-octeon/lowlevel_init.S index d9aab38cde..2d9257b1ed 100644 --- a/arch/mips/mach-octeon/lowlevel_init.S +++ b/arch/mips/mach-octeon/lowlevel_init.S @@ -17,3 +17,59 @@ LEAF(lowlevel_init) jr ra nop END(lowlevel_init)
+LEAF(mips_mach_early_init)
move s0, ra
indentation
Thanks for spotting.
- bal __dummy
nop
+__dummy:
- /* Get the actual address that we are running at */
- PTR_LA a6, _start /* Linked address of _start */
- PTR_LA a7, __dummy
- dsubu t1, a7, a6 /* offset of __dummy label from _start*/
- dsubu t0, ra, t1 /* t0 now has actual address of _start*/
including _start in the calculation makes it a little bit hard to understand. Wouldn't it be enough to just calculate the difference between a7 and ra to get the relocation offset?
Yes. Thanks for the suggestion. I'll update this part accordingly.
You can use that offset later to calculate the destination address based on _start.
Yep.
- PTR_LI t1, CONFIG_SYS_TEXT_BASE
isn't that the same address as _start?
Yes.
- /* Calculate end address of copy loop */
- PTR_LI s5, CONFIG_BOARD_SIZE_LIMIT
couldn't you use __image_copy_end to get the real binary size without BSS? Or _end if you need to copy the relocation table as well.
I tried this before. And even _end is not enough, as the appended DTB also needs to be copied. I will use _end plus some additional size for the DTB in the next version and will drop CONFIG_BOARD_SIZE_LIMIT.
- daddu t2, s5, t0 /* t2 = end address */
- daddiu t2, t2, 127
- ins t2, zero, 0, 7 /* Round up to cache line for memcpy */
- /* Copy ourself to the L2 cache from flash, 32 bytes at a time */
+1:
- ld a0, 0(t0)
- ld a1, 8(t0)
- ld a2, 16(t0)
- ld a3, 24(t0)
- sd a0, 0(t1)
- sd a1, 8(t1)
- sd a2, 16(t1)
- sd a3, 24(t1)
- addiu t0, 32
- bne t0, t2, 1b
- addiu t1, 32
the instruction in the delay slot should be indented by an extra space character
Right, will change in v2.
- sync
- synci 0(zero)
- PTR_LA t9, uboot_in_cache
- j t9
nop
Why the extra jump? If you have the relocation offset as suggested above, you could simply add that to s0 and do one jr instruction.
Good idea, thanks.
Also, instead of synci you could use jr.hb to automatically add an instruction hazard barrier during the jump (if that's implemented on Octeon).
I tried it and it works as you suggested. Will update on v2.
+uboot_in_cache:
- /*
* Return to start.S now running from TEXT_BASE, which points
* to DRAM address space, which effectively is L2 cache now.
* This speeds up the init process extremely, especially the
* DDR init code.
*/
- jr s0
nop
- END(mips_mach_early_init)
Thanks for all your suggestions. The resulting code now looks better (easier to understand and smaller).
Thanks, Stefan

Change the linking address (TEXT_BASE) to point to the L2 cache. This way, mips_mach_early_init() will copy itself into L2 cache and run from there to improve the bootup speed.
Also, CONFIG_MIPS_CACHE_SETUP needs to be disabled, as the cache is already in use at this point and can't be reset.
Signed-off-by: Stefan Roese sr@denx.de ---
configs/octeon_ebb7304_defconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/configs/octeon_ebb7304_defconfig b/configs/octeon_ebb7304_defconfig index 0304b1ef8d..dc80fba848 100644 --- a/configs/octeon_ebb7304_defconfig +++ b/configs/octeon_ebb7304_defconfig @@ -1,5 +1,5 @@ CONFIG_MIPS=y -CONFIG_SYS_TEXT_BASE=0xffffffffbfc00000 +CONFIG_SYS_TEXT_BASE=0xffffffff80000000 CONFIG_SYS_MALLOC_F_LEN=0x4000 CONFIG_ENV_SIZE=0x2000 CONFIG_ENV_SECT_SIZE=0x10000 @@ -7,6 +7,7 @@ CONFIG_NR_DRAM_BANKS=2 CONFIG_DEBUG_UART_BASE=0x8001180000000800 CONFIG_DEBUG_UART_CLOCK=1200000000 CONFIG_ARCH_OCTEON=y +# CONFIG_MIPS_CACHE_SETUP is not set CONFIG_DEBUG_UART=y CONFIG_SYS_CONSOLE_INFO_QUIET=y CONFIG_HUSH_PARSER=y

This patch adds __weak to invalidate_dcache_range() in lib/cache.c. This makes it possible to override this function with a platform-specific version, which will be done for Octeon.
Signed-off-by: Stefan Roese sr@denx.de ---
arch/mips/lib/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/mips/lib/cache.c b/arch/mips/lib/cache.c index ad37f05802..cf29994a7a 100644 --- a/arch/mips/lib/cache.c +++ b/arch/mips/lib/cache.c @@ -159,7 +159,7 @@ void __weak flush_dcache_range(ulong start_addr, ulong stop) sync(); }
-void invalidate_dcache_range(ulong start_addr, ulong stop) +void __weak invalidate_dcache_range(ulong start_addr, ulong stop) { unsigned long lsize = dcache_line_size(); unsigned long slsize = scache_line_size();

As Octeon is cache coherent, let's add an empty version of invalidate_dcache_range(). With this, all global cache functions are replaced by no-ops on Octeon.
Signed-off-by: Stefan Roese sr@denx.de
---
arch/mips/mach-octeon/cache.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/arch/mips/mach-octeon/cache.c b/arch/mips/mach-octeon/cache.c index bea846d757..9a88bb97c7 100644 --- a/arch/mips/mach-octeon/cache.c +++ b/arch/mips/mach-octeon/cache.c @@ -18,3 +18,7 @@ void flush_dcache_range(ulong start_addr, ulong stop) void flush_cache(ulong start_addr, ulong size) { } + +void invalidate_dcache_range(ulong start_addr, ulong stop) +{ +}
participants (2)
-
Daniel Schwierzeck
-
Stefan Roese