[U-Boot] [PATCH v3] Fix board init code to use a valid C runtime environment

board_init_f_mem() alters the C runtime environment's stack it ls actually already using. This is not a valid C runtime environment.
Split board_init_f_mem into C functions which do not alter their own stack and therefore run in a valid C runtime environment.
Signed-off-by: Albert ARIBAUD albert.u.boot@aribaud.net --- For NIOS2, this patch contains all manual v1 and v2 fixes by Thomas.
For x86, this patch contains all manual v2 fixes by Bin.
Changes in v3: - Rebase after commit 9ac4fc82 - Simplify malloc conditional as per commit 9ac4fc82 - Fix NIOS2 return value register (r2, not r4) - Fix x86 subl argument inversion - Group allocations in a single function - Group initializations in a single function
Changes in v2: - Fix all checkpatch issues - Fix board_init_f_malloc prototype mismatch - Fix board_init_[f_]xxx typo in NIOS2 - Fix aarch64 asm 'sub' syntax error
arch/arc/lib/start.S | 12 +++++--- arch/arm/lib/crt0.S | 6 ++-- arch/arm/lib/crt0_64.S | 6 ++-- arch/microblaze/cpu/start.S | 4 +-- arch/nios2/cpu/start.S | 13 +++++--- arch/powerpc/cpu/ppc4xx/start.S | 14 ++++++--- arch/x86/cpu/start.S | 7 +++-- arch/x86/lib/fsp/fsp_common.c | 4 +-- common/init/board_init.c | 67 +++++++++++++++++++++++++++++++++-------- include/common.h | 33 ++++++++------------ 10 files changed, 108 insertions(+), 58 deletions(-)
diff --git a/arch/arc/lib/start.S b/arch/arc/lib/start.S index 26a5934..2a99193 100644 --- a/arch/arc/lib/start.S +++ b/arch/arc/lib/start.S @@ -54,14 +54,16 @@ ENTRY(_start) mov %sp, CONFIG_SYS_INIT_SP_ADDR mov %fp, %sp
- /* Allocate and zero GD, update SP */ - mov %r0, %sp - bl board_init_f_mem - + /* Get reserved area size, update SP and FP */ + bl board_init_f_get_reserve_size /* Update stack- and frame-pointers */ - mov %sp, %r0 + sub %sp, %sp, %r0 mov %fp, %sp
+ /* Initialize reserved area */ + mov %r0, %sp + bl board_init_f_init_reserve + /* Zero the one and only argument of "board_init_f" */ mov_s %r0, 0 j board_init_f diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S index 80548eb..4ec89a4 100644 --- a/arch/arm/lib/crt0.S +++ b/arch/arm/lib/crt0.S @@ -82,9 +82,11 @@ ENTRY(_main) #else bic sp, sp, #7 /* 8-byte alignment for ABI compliance */ #endif + bl board_init_f_get_reserve_size + sub sp, sp, r0 + mov r9, sp mov r0, sp - bl board_init_f_mem - mov sp, r0 + bl board_init_f_init_reserve
mov r0, #0 bl board_init_f diff --git a/arch/arm/lib/crt0_64.S b/arch/arm/lib/crt0_64.S index cef1c71..d0f8db8 100644 --- a/arch/arm/lib/crt0_64.S +++ b/arch/arm/lib/crt0_64.S @@ -75,8 +75,10 @@ ENTRY(_main) ldr x0, =(CONFIG_SYS_INIT_SP_ADDR) #endif bic sp, x0, #0xf /* 16-byte alignment for ABI compliance */ - bl board_init_f_mem - mov sp, x0 + bl board_init_f_reserve_size + sub sp, sp, x0 + mov x0, sp + bl board_init_f_init_reserve
mov x0, #0 bl board_init_f diff --git a/arch/microblaze/cpu/start.S b/arch/microblaze/cpu/start.S index 14f46a8..206be3e 100644 --- a/arch/microblaze/cpu/start.S +++ b/arch/microblaze/cpu/start.S @@ -25,7 +25,7 @@ _start:
addi r8, r0, __end mts rslr, r8 - /* TODO: Redo this code to call board_init_f_mem() */ + /* TODO: Redo this code to call board_init_f_*() */ #if defined(CONFIG_SPL_BUILD) addi r1, r0, CONFIG_SPL_STACK_ADDR mts rshr, r1 @@ -142,7 +142,7 @@ _start: ori r12, r12, 0x1a0 mts rmsr, r12
- /* TODO: Redo this code to call board_init_f_mem() */ + /* TODO: Redo this code to call board_init_f_*() */ clear_bss: /* clear BSS segments */ addi r5, r0, __bss_start diff --git a/arch/nios2/cpu/start.S b/arch/nios2/cpu/start.S index 54787c5..c95f1f4 100644 --- a/arch/nios2/cpu/start.S +++ b/arch/nios2/cpu/start.S @@ -106,14 +106,17 @@ _reloc: stw r0, 4(sp) mov fp, sp
- /* Allocate and zero GD, update SP */ + /* Allocate and initialize reserved area, update SP */ + movhi r2, %hi(board_init_f_reserve_size@h) + ori r2, r2, %lo(board_init_f_reserve_size@h) + callr r2 + sub sp, sp, r2 mov r4, sp - movhi r2, %hi(board_init_f_mem@h) - ori r2, r2, %lo(board_init_f_mem@h) + movhi r2, %hi(board_init_f_init_reserve@h) + ori r2, r2, %lo(board_init_f_init_reserve@h) callr r2
- /* Update stack- and frame-pointers */ - mov sp, r2 + /* Update frame-pointer */ mov fp, sp
/* Call board_init_f -- never returns */ diff --git a/arch/powerpc/cpu/ppc4xx/start.S b/arch/powerpc/cpu/ppc4xx/start.S index 77d4040..cc6e4ec 100644 --- a/arch/powerpc/cpu/ppc4xx/start.S +++ b/arch/powerpc/cpu/ppc4xx/start.S @@ -761,9 +761,10 @@ _start:
bl cpu_init_f /* run low-level CPU init code (from Flash) */ #ifdef CONFIG_SYS_GENERIC_BOARD + bl board_init_f_reserve_size + sub r1, r1, r3 mr r3, r1 - bl board_init_f_mem - mr r1, r3 + bl board_init_f_init_reserve li r0,0 stwu r0, -4(r1) stwu r0, -4(r1) @@ -1037,9 +1038,14 @@ _start:
bl cpu_init_f /* run low-level CPU init code (from Flash) */ #ifdef CONFIG_SYS_GENERIC_BOARD + bl board_init_f_gd_size + sub r1, r1, r3 mr r3, r1 - bl board_init_f_mem - mr r1, r3 + bl board_init_f_gd + bl board_init_f_malloc_size + sub r1, r1, r3 + mr r3, r1 + bl board_init_f_malloc stwu r0, -4(r1) stwu r0, -4(r1) #endif diff --git a/arch/x86/cpu/start.S b/arch/x86/cpu/start.S index 5b4ee79..b30383f 100644 --- a/arch/x86/cpu/start.S +++ b/arch/x86/cpu/start.S @@ -121,10 +121,11 @@ car_init_ret: * with esi holding the HOB list address returned by the FSP. */ #endif - /* Set up global data */ + /* Reserve 'globals' on the stack */ + call board_init_f_reserve_size + subl %eax, %esp mov %esp, %eax - call board_init_f_mem - mov %eax, %esp + call board_init_f_init_reserve
#ifdef CONFIG_DEBUG_UART call debug_uart_init diff --git a/arch/x86/lib/fsp/fsp_common.c b/arch/x86/lib/fsp/fsp_common.c index c78df94..5249455 100644 --- a/arch/x86/lib/fsp/fsp_common.c +++ b/arch/x86/lib/fsp/fsp_common.c @@ -95,8 +95,8 @@ int x86_fsp_init(void) /* * The second time we enter here, adjust the size of malloc() * pool before relocation. Given gd->malloc_base was adjusted - * after the call to board_init_f_mem() in arch/x86/cpu/start.S, - * we should fix up gd->malloc_limit here. + * after the call to board_init_f_malloc() in arch/x86/cpu/ + * start.S, we should fix up gd->malloc_limit here. */ gd->malloc_limit += CONFIG_FSP_SYS_MALLOC_F_LEN; } diff --git a/common/init/board_init.c b/common/init/board_init.c index 1c6126d..e033253 100644 --- a/common/init/board_init.c +++ b/common/init/board_init.c @@ -21,39 +21,80 @@ DECLARE_GLOBAL_DATA_PTR; #define _USE_MEMCPY #endif
-/* Unfortunately x86 can't compile this code as gd cannot be assigned */ -#ifndef CONFIG_X86 +/* Unfortunately x86 or ARM can't compile this code as gd cannot be assigned */ +#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD) __weak void arch_setup_gd(struct global_data *gd_ptr) { gd = gd_ptr; } -#endif /* !CONFIG_X86 */ +#endif /* !CONFIG_X86 && !CONFIG_SYS_THUMB_BUILD */
-ulong board_init_f_mem(ulong top) +/* + * Compute size of space to reserve on stack for use as 'globals' + * and return size request for reserved area. + * + * Notes: + * + * Reservation cannot be actually done from within a C function + * as it implies modifying the C stack. + * + */ + +ulong board_init_f_get_reserve_size(void) +{ + ulong size; + + /* Reserve GD (rounded up to a multiple of 16 bytes) */ + size = roundup(sizeof(struct global_data), 16); + + /* Reserve malloc arena */ +#if defined(CONFIG_SYS_MALLOC_F) + size += CONFIG_SYS_MALLOC_F_LEN; +#endif + + return size; +} + +/* + * Initialize reserved space (which has been safely allocated on the C + * stack from the C runtime environment handling code). + * + * Do not use 'gd->' until arch_setup_gd() has been called! + */ + +void board_init_f_init_reserve(ulong base) { struct global_data *gd_ptr; #ifndef _USE_MEMCPY int *ptr; #endif
- /* Leave space for the stack we are running with now */ - top -= 0x40; + /* + * clear GD entirely + */
- top -= sizeof(struct global_data); - top = ALIGN(top, 16); - gd_ptr = (struct global_data *)top; + gd_ptr = (struct global_data *)base; + /* go down one GD plus 16-byte alignment */ + base += roundup(sizeof(struct global_data), 16); + /* zero the area */ #ifdef _USE_MEMCPY memset(gd_ptr, '\0', sizeof(*gd)); #else for (ptr = (int *)gd_ptr; ptr < (int *)(gd_ptr + 1); ) *ptr++ = 0; #endif + /* set GD unless architecture did it already */ +#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD) arch_setup_gd(gd_ptr); +#endif + + /* + * record malloc arena start + */
#if defined(CONFIG_SYS_MALLOC_F) - top -= CONFIG_SYS_MALLOC_F_LEN; - gd->malloc_base = top; + /* go down one malloc arena */ + gd->malloc_base = base; + base += CONFIG_SYS_MALLOC_F_LEN; #endif - - return top; } diff --git a/include/common.h b/include/common.h index 09a131d..988d67a 100644 --- a/include/common.h +++ b/include/common.h @@ -225,32 +225,25 @@ void board_init_f(ulong); void board_init_r(gd_t *, ulong) __attribute__ ((noreturn));
/** - * board_init_f_mem() - Allocate global data and set stack position + * ulong board_init_f_get_reserve_size - return size of reserved area * * This function is called by each architecture very early in the start-up - * code to set up the environment for board_init_f(). It allocates space for - * global_data (see include/asm-generic/global_data.h) and places the stack - * below this. + * code to allow the C runtime to reserve space on the stack for writable + * 'globals' such as GD and the malloc arena. * - * This function requires a stack[1] Normally this is at @top. The function - * starts allocating space from 64 bytes below @top. First it creates space - * for global_data. Then it calls arch_setup_gd() which sets gd to point to - * the global_data space and can reserve additional bytes of space if - * required). Finally it allocates early malloc() memory - * (CONFIG_SYS_MALLOC_F_LEN). The new top of the stack is just below this, - * and it returned by this function. + * @return: size of reserved area + */ +ulong board_init_f_get_reserve_size(void); + +/** + * board_init_f_init_reserve - initialize the reserved area(s) * - * [1] Strictly speaking it would be possible to implement this function - * in C on many archs such that it does not require a stack. However this - * does not seem hugely important as only 64 byte are wasted. The 64 bytes - * are used to handle the calling standard which generally requires pushing - * addresses or registers onto the stack. We should be able to get away with - * less if this becomes important. + * This function is called once the C runtime has allocated the reserved + * area on the stack. It must initialize the GD at the base of that area. * - * @top: Top of available memory, also normally the top of the stack - * @return: New stack location + * @base: top from which reservation was done */ -ulong board_init_f_mem(ulong top); +void board_init_f_init_reserve(ulong base);
/** * arch_setup_gd() - Set up the global_data pointer

Hi Albert,
On 13 November 2015 at 07:43, Albert ARIBAUD albert.u.boot@aribaud.net wrote:
board_init_f_mem() alters the C runtime environment's stack it ls actually already using. This is not a valid C runtime environment.
I think it is doing something like alloca() and reserving space on the stack. It does not actually change the stack pointer in C code. It changes data that is below the stack pointer, and therefore by definition not part of the stack, I think.
What actually goes wrong here? I read your info in the other thread but I'm afraid I still don't understand it.
Split board_init_f_mem into C functions which do not alter their own stack and therefore run in a valid C runtime environment.
Signed-off-by: Albert ARIBAUD albert.u.boot@aribaud.net
For NIOS2, this patch contains all manual v1 and v2 fixes by Thomas.
For x86, this patch contains all manual v2 fixes by Bin.
Changes in v3:
- Rebase after commit 9ac4fc82
- Simplify malloc conditional as per commit 9ac4fc82
- Fix NIOS2 return value register (r2, not r4)
- Fix x86 subl argument inversion
- Group allocations in a single function
- Group initializations in a single function
Changes in v2:
- Fix all checkpatch issues
- Fix board_init_f_malloc prototype mismatch
- Fix board_init_[f_]xxx typo in NIOS2
- Fix aarch64 asm 'sub' syntax error
arch/arc/lib/start.S | 12 +++++--- arch/arm/lib/crt0.S | 6 ++-- arch/arm/lib/crt0_64.S | 6 ++-- arch/microblaze/cpu/start.S | 4 +-- arch/nios2/cpu/start.S | 13 +++++--- arch/powerpc/cpu/ppc4xx/start.S | 14 ++++++--- arch/x86/cpu/start.S | 7 +++-- arch/x86/lib/fsp/fsp_common.c | 4 +-- common/init/board_init.c | 67 +++++++++++++++++++++++++++++++++-------- include/common.h | 33 ++++++++------------ 10 files changed, 108 insertions(+), 58 deletions(-)
diff --git a/arch/arc/lib/start.S b/arch/arc/lib/start.S index 26a5934..2a99193 100644 --- a/arch/arc/lib/start.S +++ b/arch/arc/lib/start.S @@ -54,14 +54,16 @@ ENTRY(_start) mov %sp, CONFIG_SYS_INIT_SP_ADDR mov %fp, %sp
/* Allocate and zero GD, update SP */
mov %r0, %sp
bl board_init_f_mem
/* Get reserved area size, update SP and FP */
bl board_init_f_get_reserve_size /* Update stack- and frame-pointers */
mov %sp, %r0
sub %sp, %sp, %r0 mov %fp, %sp
/* Initialize reserved area */
mov %r0, %sp
bl board_init_f_init_reserve
/* Zero the one and only argument of "board_init_f" */ mov_s %r0, 0 j board_init_f
diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S index 80548eb..4ec89a4 100644 --- a/arch/arm/lib/crt0.S +++ b/arch/arm/lib/crt0.S @@ -82,9 +82,11 @@ ENTRY(_main) #else bic sp, sp, #7 /* 8-byte alignment for ABI compliance */ #endif
bl board_init_f_get_reserve_size
sub sp, sp, r0
mov r9, sp mov r0, sp
bl board_init_f_mem
mov sp, r0
bl board_init_f_init_reserve mov r0, #0 bl board_init_f
diff --git a/arch/arm/lib/crt0_64.S b/arch/arm/lib/crt0_64.S index cef1c71..d0f8db8 100644 --- a/arch/arm/lib/crt0_64.S +++ b/arch/arm/lib/crt0_64.S @@ -75,8 +75,10 @@ ENTRY(_main) ldr x0, =(CONFIG_SYS_INIT_SP_ADDR) #endif bic sp, x0, #0xf /* 16-byte alignment for ABI compliance */
bl board_init_f_mem
mov sp, x0
bl board_init_f_reserve_size
sub sp, sp, x0
mov x0, sp
bl board_init_f_init_reserve mov x0, #0 bl board_init_f
diff --git a/arch/microblaze/cpu/start.S b/arch/microblaze/cpu/start.S index 14f46a8..206be3e 100644 --- a/arch/microblaze/cpu/start.S +++ b/arch/microblaze/cpu/start.S @@ -25,7 +25,7 @@ _start:
addi r8, r0, __end mts rslr, r8
/* TODO: Redo this code to call board_init_f_mem() */
/* TODO: Redo this code to call board_init_f_*() */
#if defined(CONFIG_SPL_BUILD) addi r1, r0, CONFIG_SPL_STACK_ADDR mts rshr, r1 @@ -142,7 +142,7 @@ _start: ori r12, r12, 0x1a0 mts rmsr, r12
/* TODO: Redo this code to call board_init_f_mem() */
/* TODO: Redo this code to call board_init_f_*() */
clear_bss: /* clear BSS segments */ addi r5, r0, __bss_start diff --git a/arch/nios2/cpu/start.S b/arch/nios2/cpu/start.S index 54787c5..c95f1f4 100644 --- a/arch/nios2/cpu/start.S +++ b/arch/nios2/cpu/start.S @@ -106,14 +106,17 @@ _reloc: stw r0, 4(sp) mov fp, sp
/* Allocate and zero GD, update SP */
/* Allocate and initialize reserved area, update SP */
movhi r2, %hi(board_init_f_reserve_size@h)
ori r2, r2, %lo(board_init_f_reserve_size@h)
callr r2
sub sp, sp, r2 mov r4, sp
movhi r2, %hi(board_init_f_mem@h)
ori r2, r2, %lo(board_init_f_mem@h)
movhi r2, %hi(board_init_f_init_reserve@h)
ori r2, r2, %lo(board_init_f_init_reserve@h) callr r2
/* Update stack- and frame-pointers */
mov sp, r2
/* Update frame-pointer */ mov fp, sp /* Call board_init_f -- never returns */
diff --git a/arch/powerpc/cpu/ppc4xx/start.S b/arch/powerpc/cpu/ppc4xx/start.S index 77d4040..cc6e4ec 100644 --- a/arch/powerpc/cpu/ppc4xx/start.S +++ b/arch/powerpc/cpu/ppc4xx/start.S @@ -761,9 +761,10 @@ _start:
bl cpu_init_f /* run low-level CPU init code (from Flash) */
#ifdef CONFIG_SYS_GENERIC_BOARD
bl board_init_f_reserve_size
sub r1, r1, r3 mr r3, r1
bl board_init_f_mem
mr r1, r3
bl board_init_f_init_reserve li r0,0 stwu r0, -4(r1) stwu r0, -4(r1)
@@ -1037,9 +1038,14 @@ _start:
bl cpu_init_f /* run low-level CPU init code (from Flash) */
#ifdef CONFIG_SYS_GENERIC_BOARD
bl board_init_f_gd_size
sub r1, r1, r3 mr r3, r1
bl board_init_f_mem
mr r1, r3
bl board_init_f_gd
bl board_init_f_malloc_size
sub r1, r1, r3
mr r3, r1
bl board_init_f_malloc stwu r0, -4(r1) stwu r0, -4(r1)
#endif diff --git a/arch/x86/cpu/start.S b/arch/x86/cpu/start.S index 5b4ee79..b30383f 100644 --- a/arch/x86/cpu/start.S +++ b/arch/x86/cpu/start.S @@ -121,10 +121,11 @@ car_init_ret: * with esi holding the HOB list address returned by the FSP. */ #endif
/* Set up global data */
/* Reserve 'globals' on the stack */
call board_init_f_reserve_size
subl %eax, %esp mov %esp, %eax
call board_init_f_mem
mov %eax, %esp
call board_init_f_init_reserve
#ifdef CONFIG_DEBUG_UART call debug_uart_init diff --git a/arch/x86/lib/fsp/fsp_common.c b/arch/x86/lib/fsp/fsp_common.c index c78df94..5249455 100644 --- a/arch/x86/lib/fsp/fsp_common.c +++ b/arch/x86/lib/fsp/fsp_common.c @@ -95,8 +95,8 @@ int x86_fsp_init(void) /* * The second time we enter here, adjust the size of malloc() * pool before relocation. Given gd->malloc_base was adjusted
* after the call to board_init_f_mem() in arch/x86/cpu/start.S,
* we should fix up gd->malloc_limit here.
* after the call to board_init_f_malloc() in arch/x86/cpu/
* start.S, we should fix up gd->malloc_limit here. */ gd->malloc_limit += CONFIG_FSP_SYS_MALLOC_F_LEN; }
diff --git a/common/init/board_init.c b/common/init/board_init.c index 1c6126d..e033253 100644 --- a/common/init/board_init.c +++ b/common/init/board_init.c @@ -21,39 +21,80 @@ DECLARE_GLOBAL_DATA_PTR; #define _USE_MEMCPY #endif
-/* Unfortunately x86 can't compile this code as gd cannot be assigned */ -#ifndef CONFIG_X86 +/* Unfortunately x86 or ARM can't compile this code as gd cannot be assigned */ +#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD) __weak void arch_setup_gd(struct global_data *gd_ptr) { gd = gd_ptr; } -#endif /* !CONFIG_X86 */ +#endif /* !CONFIG_X86 && !CONFIG_SYS_THUMB_BUILD */
-ulong board_init_f_mem(ulong top) +/*
- Compute size of space to reserve on stack for use as 'globals'
- and return size request for reserved area.
- Notes:
- Reservation cannot be actually done from within a C function
- as it implies modifying the C stack.
- */
+ulong board_init_f_get_reserve_size(void) +{
ulong size;
/* Reserve GD (rounded up to a multiple of 16 bytes) */
size = roundup(sizeof(struct global_data), 16);
/* Reserve malloc arena */
+#if defined(CONFIG_SYS_MALLOC_F)
size += CONFIG_SYS_MALLOC_F_LEN;
+#endif
return size;
+}
+/*
- Initialize reserved space (which has been safely allocated on the C
- stack from the C runtime environment handling code).
- Do not use 'gd->' until arch_setup_gd() has been called!
- */
+void board_init_f_init_reserve(ulong base) { struct global_data *gd_ptr; #ifndef _USE_MEMCPY int *ptr; #endif
/* Leave space for the stack we are running with now */
top -= 0x40;
/*
* clear GD entirely
*/
top -= sizeof(struct global_data);
top = ALIGN(top, 16);
gd_ptr = (struct global_data *)top;
gd_ptr = (struct global_data *)base;
/* go down one GD plus 16-byte alignment */
base += roundup(sizeof(struct global_data), 16);
But base is passed in, and on ARM sp and r9 are already set to this address. So I don't think you can potentially adjust it here.
/* zero the area */
#ifdef _USE_MEMCPY memset(gd_ptr, '\0', sizeof(*gd)); #else for (ptr = (int *)gd_ptr; ptr < (int *)(gd_ptr + 1); ) *ptr++ = 0; #endif
/* set GD unless architecture did it already */
+#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD) arch_setup_gd(gd_ptr); +#endif
/*
* record malloc arena start
*/
#if defined(CONFIG_SYS_MALLOC_F)
top -= CONFIG_SYS_MALLOC_F_LEN;
gd->malloc_base = top;
/* go down one malloc arena */
gd->malloc_base = base;
base += CONFIG_SYS_MALLOC_F_LEN;
#endif
Somewhat subtle, but you are allocating the memory in board_init_f_get_reserve_size() in this order, from top to bottom:
global_data CONIG_SYS_MALLOC_F_LEN
and in board_init_f_init_reserve(), from bottom to top:
global_data CONIG_SYS_MALLOC_F_LEN
Now we put malloc() above global_data. That's fine I think. We now have two pieces of code to keep in sync, which is probably unavoidable.
return top;
} diff --git a/include/common.h b/include/common.h index 09a131d..988d67a 100644 --- a/include/common.h +++ b/include/common.h @@ -225,32 +225,25 @@ void board_init_f(ulong); void board_init_r(gd_t *, ulong) __attribute__ ((noreturn));
/**
- board_init_f_mem() - Allocate global data and set stack position
- ulong board_init_f_get_reserve_size - return size of reserved area
- This function is called by each architecture very early in the start-up
- code to set up the environment for board_init_f(). It allocates space for
- global_data (see include/asm-generic/global_data.h) and places the stack
- below this.
- code to allow the C runtime to reserve space on the stack for writable
- 'globals' such as GD and the malloc arena.
- This function requires a stack[1] Normally this is at @top. The function
- starts allocating space from 64 bytes below @top. First it creates space
- for global_data. Then it calls arch_setup_gd() which sets gd to point to
- the global_data space and can reserve additional bytes of space if
- required). Finally it allocates early malloc() memory
- (CONFIG_SYS_MALLOC_F_LEN). The new top of the stack is just below this,
- and it returned by this function.
- @return: size of reserved area
- */
+ulong board_init_f_get_reserve_size(void);
+/**
- board_init_f_init_reserve - initialize the reserved area(s)
- [1] Strictly speaking it would be possible to implement this function
- in C on many archs such that it does not require a stack. However this
- does not seem hugely important as only 64 byte are wasted. The 64 bytes
- are used to handle the calling standard which generally requires pushing
- addresses or registers onto the stack. We should be able to get away with
- less if this becomes important.
- This function is called once the C runtime has allocated the reserved
- area on the stack. It must initialize the GD at the base of that area.
- @top: Top of available memory, also normally the top of the stack
- @return: New stack location
*/
- @base: top from which reservation was done
-ulong board_init_f_mem(ulong top); +void board_init_f_init_reserve(ulong base);
/**
- arch_setup_gd() - Set up the global_data pointer
-- 2.5.0
Regards, Simon

Hello Simon,
On Fri, 13 Nov 2015 19:29:21 -0700, Simon Glass sjg@chromium.org wrote:
Hi Albert,
On 13 November 2015 at 07:43, Albert ARIBAUD albert.u.boot@aribaud.net wrote:
board_init_f_mem() alters the C runtime environment's stack it ls actually already using. This is not a valid C runtime environment.
I think it is doing something like alloca() and reserving space on the stack.
AFAICT, board_init_f_mem does *not* "do something like alloca()", it does something that alloca() precisely does *not* do.
What alloca() does is, it creates a local variable on the stack, only the size of that local is determined at run time; but that is still a *local* variable, that will be freed like all locals when the current function returns. The location of the alloca reservation is determined by the compiler, which knows how which way the stack works and how much of it is being used by the calling function already; and the resulting pointer cannot be accessed beyond the size that was allocated.
What board_init_f_mem does is *completely* different: it wild-guesses how much of the stack it is using, it assumes which way the stack grows, and it purposefully accesses space that it was not given access to, and it expects that allocation to survive upon returning to the calling context.
Now, if you look at alloca's definition in glibc, you'll see it is defined to be a gcc built-in, not a C function. There is a reason for this: stack allocation within a C function requires the compiler's help, because the compiler must be made aware of the alloca use in order to properly free the alloca()ed space on every exit point in the function. Only a built-in can get that level of compiler support.
board_init_f_mem's case is worse yet, as, contrary to alloca() which only expects /local/ effects, board_init_f_mem expects the allocated space to survive the return to the calling context. Doing this would require *more* compiler help than alloca does, because it is not more a matter of managing the stack pointer; it is a matter of moving things around so that the allocated space ends up *above* the call frame for board_init_f.
It does not actually change the stack pointer in C code.
Non, it does not indeed; it could not, in fact, do this at all without resorting to the same technique that is resorted to for implementing alloca(), that is, it would have to be a built-in of gcc, as alloca() is -- and, as I have just explained, it would require extensive help from the compiler, as all compiler-generated epilog code would have to account for that potential allocation.
It changes data that is below the stack pointer, and therefore by definition not part of the stack, I think.
It changes data that it /assumes/ is below the stack pointer and that it /assumes/ it can write to; but these two assumptions are unfounded, see below.
What actually goes wrong here? I read your info in the other thread but I'm afraid I still don't understand it.
At least three things are formally wrong here with board_init_f_mem:
- it assumes the stack grows downward.
- it assumes it never uses more than 0x40 bytes of the stack.
- it assumes it can write to the stack beyond what it was allocated for local variables, temporaries and arguments.
Each of these assumptions is unfounded in code which is supposed to be architecture-agnostic. Not all stacks grow downward. There is no way to ensure that only 0x40 bytes are used by board_init_f_mem. There is no guarantee that a C function can write to its own stack beyond what it was allocated -- in fact, there is no guarantee that a C function can write to its stack anywhere else than its declared or alloca()ed locals. Such writes are undefined behavior.
These assumptions are just as bad as assuming any pointer-to-long value is valid when we could ensure it is; and we would fix such a situation, would we not?
Same here. This patch ensures that the code in board_init_f_mem becomes compliant with C runtime environment constraints. This requires splitting the C code in two functions so that the first one does not access its stack (except the allowed accesses generated by the compiler) and the second accesses an area which is in fact not the stack any more, thus is safe to access.
Regards, Simon
Amicalement,

Hi Albert,
On 14 November 2015 at 07:36, Albert ARIBAUD albert.u.boot@aribaud.net wrote:
Hello Simon,
On Fri, 13 Nov 2015 19:29:21 -0700, Simon Glass sjg@chromium.org wrote:
Hi Albert,
On 13 November 2015 at 07:43, Albert ARIBAUD albert.u.boot@aribaud.net wrote:
board_init_f_mem() alters the C runtime environment's stack it ls actually already using. This is not a valid C runtime environment.
I think it is doing something like alloca() and reserving space on the stack.
AFAICT, board_init_f_mem does *not* "do something like alloca()", it does something that alloca() precisely does *not* do.
What alloca() does is, it creates a local variable on the stack, only the size of that local is determined at run time; but that is still a *local* variable, that will be freed like all locals when the current function returns. The location of the alloca reservation is determined by the compiler, which knows how which way the stack works and how much of it is being used by the calling function already; and the resulting pointer cannot be accessed beyond the size that was allocated.
What board_init_f_mem does is *completely* different: it wild-guesses how much of the stack it is using, it assumes which way the stack grows, and it purposefully accesses space that it was not given access to, and it expects that allocation to survive upon returning to the calling context.
Now, if you look at alloca's definition in glibc, you'll see it is defined to be a gcc built-in, not a C function. There is a reason for this: stack allocation within a C function requires the compiler's help, because the compiler must be made aware of the alloca use in order to properly free the alloca()ed space on every exit point in the function. Only a built-in can get that level of compiler support.
board_init_f_mem's case is worse yet, as, contrary to alloca() which only expects /local/ effects, board_init_f_mem expects the allocated space to survive the return to the calling context. Doing this would require *more* compiler help than alloca does, because it is not more a matter of managing the stack pointer; it is a matter of moving things around so that the allocated space ends up *above* the call frame for board_init_f.
OK. It doesn't matter much - I think we both understand what is going on in the function.
It does not actually change the stack pointer in C code.
Non, it does not indeed; it could not, in fact, do this at all without resorting to the same technique that is resorted to for implementing alloca(), that is, it would have to be a built-in of gcc, as alloca() is -- and, as I have just explained, it would require extensive help from the compiler, as all compiler-generated epilog code would have to account for that potential allocation.
It changes data that is below the stack pointer, and therefore by definition not part of the stack, I think.
It changes data that it /assumes/ is below the stack pointer and that it /assumes/ it can write to; but these two assumptions are unfounded, see below.
What actually goes wrong here? I read your info in the other thread but I'm afraid I still don't understand it.
At least three things are formally wrong here with board_init_f_mem:
it assumes the stack grows downward.
it assumes it never uses more than 0x40 bytes of the stack.
it assumes it can write to the stack beyond what it was allocated for local variables, temporaries and arguments.
Each of these assumptions is unfounded in code which is supposed to be architecture-agnostic. Not all stacks grow downward. There is no way to ensure that only 0x40 bytes are used by board_init_f_mem. There is no guarantee that a C function can write to its own stack beyond what it was allocated -- in fact, there is no guarantee that a C function can write to its stack anywhere else than its declared or alloca()ed locals. Such writes are undefined behavior.
These assumptions are just as bad as assuming any pointer-to-long value is valid when we could ensure it is; and we would fix such a situation, would we not?
Same here. This patch ensures that the code in board_init_f_mem becomes compliant with C runtime environment constraints. This requires splitting the C code in two functions so that the first one does not access its stack (except the allowed accesses generated by the compiler) and the second accesses an area which is in fact not the stack any more, thus is safe to access.
Reading between the lines, there isn't actually a problem with this in practice, but it is doing things that C should not really be doing. Is that right? To me it seems OK provided we can assume that the stack needed to call the function is less than 64 bytes. But I'm sure there is a spec somewhere that the current function violates. Your idea of what constitutes the stack seems slightly different to mine - I think of it as the data already written, whereas you think of it as data yet to write, There is some associated uncertainty since the compiler is free to fiddle with the stack pointer. In practice in a boot loader we always make some architectural assumptions based on current operation. But I agree we should minimise this.
Anyway, I think your solution only adds one more function and still has most of the code in C rather than assembler, so that it is common to all archs. That was my main goal with the change.
I prefer to set up gd in the C function if we can, to avoid the oddness that gd has to be at the bottom and the associated 16-byte alignment issue. But let's see how that goes. If we need to do it in assembler, it might be better to require the caller to do it before calling board_init_f_reserve().
Regards, Simon

Hello Simon,
On Sat, 14 Nov 2015 08:41:01 -0700, Simon Glass sjg@chromium.org wrote:
Reading between the lines, there isn't actually a problem with this in practice, but it is doing things that C should not really be doing. Is that right?
You did not even need to read between the lines :) as the very first comment to the v1 of this patch was an explicit question whether I had seen actual failure cause by board_init_f_mem() -- and my explicit answer was that no, I hadn't, but as an analogy, I hadn't personally seen any car crash caused by worn tires either, and that did not deter me from changing worn tires as soon as I noticed them rather than wait for the tires to fail.
To me it seems OK provided we can assume that the stack needed to call the function is less than 64 bytes. But I'm sure there is a spec somewhere that the current function violates. Your idea of what constitutes the stack seems slightly different to mine - I think of it as the data already written, whereas you think of it as data yet to write, There is some associated uncertainty since the compiler is free to fiddle with the stack pointer. In practice in a boot loader we always make some architectural assumptions based on current operation. But I agree we should minimise this.
Ok.
Anyway, I think your solution only adds one more function and still has most of the code in C rather than assembler, so that it is common to all archs. That was my main goal with the change.
Just to clarify, keeping as much C as possible is my goal too. Especially, the patch keeps the size computation in a C function rather than reverting back to computing it in crt0.S as it was 'in olden days'; that is because computing the size in a C function is much more flexible than it would be in assembly language, and accessible to many more developers.
My general view of assembly vs C language is that assembly language should only be used to perform things that cannot be done in C language without violating the C runtime environment, or for which a C runtime environment is not available, or which may fail depending on the compiler implementation. This essentially covers:
- the entry point: when it is directly pointed at by the reset vector, it obviously does not have a C runtime environment, and it usually does not have one either even when it it entered from ROM code. In fact, the general goal of the entry point is to set up the C runtime environment(s).
- exception vectors: except for rare architectures for which the compiler provides a way to directly write interrupt routines in C, exception vectors have to be written in assembly language; again, their role *is* to set up the C runtime environment for the C language exception handler.
- exceptional cases where any implementation of some design in C conflicts with the compiler implementation. Of course, these case are specific to an architecture and frequently even a compiler version (see below for the r9-as-GD issue) and must only resort to assembly language for said architecture, and use as little of it as possible. Typically, such designs are related to an architecture's system ABI, which a compiler does not necessarily handle -- again, r9 as a system wide constant register comes to the mind of the ARM custodian -- or the need for specific instructions for which the compiler does not provide support in C (mcr and mrc are ARM examples).
Also, while the patch does add one C function, it does not change the C code functionally overall, and adds very little assembly code, thereby resulting in a negligible global code increase (actually, on ARM, I see more boards experiencing a slight global code size *decrease* than boards experiencing an *increase*).
I prefer to set up gd in the C function if we can, to avoid the oddness that gd has to be at the bottom and the associated 16-byte alignment issue. But let's see how that goes. If we need to do it in assembler, it might be better to require the caller to do it before calling board_init_f_reserve().
Right now I only see two architectures which cannot set GD in arch_setup_gd(): x86 and ARM; my patch takes care not to affect other arches, nor do I have an intent to do so. They can set GD from C, and that's fine.
Note about the ARM case:
Regarding ARM, I do change the behaviour for all of ARM, whereas the issue of arch_setup_gd() failing to set r9 only happens for Thumb-1 code generation and only with gcc 5.2.1 (and possibly a few other versions).
This is because, as I said elsewhere, I fear that the gcc -ffixed-reg behaviour change is compliant with a strict interpretation of the option (all accesses to r9 from user C code will return the same value) and therefore I expect later gcc versions to not change the Thumb-1 behaviour; it might even happen that this behaviour change be 'ported over' to Thumb-2 or even ARM code generation.
As the ARM custodian, I do not view tracking gcc behaviour changes as a rewarding task, nor do I consider it a sane solution to pepper gcc-version conditionals around some C code because its success happens to depend on which implementation of gcc it was compiled with.
Plus, I do not know right now how LLVM handles r9 but I do remember some hours of... fun... with it, and I don't want to discover in later times that LLVM suddenly changed its mind about r9 handling too.
This is why, for ARM, I prefer cutting the problem short; taking the strictest option and only trusting -ffixed-reg to provide a constant r9 for reading, not writing, from within C code; therefore, setting r9 to GD from outside any C context; and doing all of this regardless of ARM instruction set or compiler version or even compiler choice.
Regards, Simon
Amicalement,

Hi Albert,
On 2015年11月13日 22:43, Albert ARIBAUD wrote:
+/*
- Initialize reserved space (which has been safely allocated on the C
- stack from the C runtime environment handling code).
- Do not use 'gd->' until arch_setup_gd() has been called!
- */
+void board_init_f_init_reserve(ulong base) { struct global_data *gd_ptr; #ifndef _USE_MEMCPY int *ptr; #endif
- /* Leave space for the stack we are running with now */
- top -= 0x40;
- /*
* clear GD entirely
*/
- top -= sizeof(struct global_data);
- top = ALIGN(top, 16);
- gd_ptr = (struct global_data *)top;
- gd_ptr = (struct global_data *)base;
- /* go down one GD plus 16-byte alignment */
- base += roundup(sizeof(struct global_data), 16);
Maybe it can keep the original order, top--gd--malloc--base.
#if defined(CONFIG_SYS_MALLOC_F) base += CONFIG_SYS_MALLOC_F_LEN; #endif gd_ptr = (struct global_data *)base;
- /* zero the area */ #ifdef _USE_MEMCPY memset(gd_ptr, '\0', sizeof(*gd)); #else for (ptr = (int *)gd_ptr; ptr < (int *)(gd_ptr + 1); ) *ptr++ = 0; #endif
- /* set GD unless architecture did it already */
+#if !defined(CONFIG_X86) && !defined(CONFIG_SYS_THUMB_BUILD) arch_setup_gd(gd_ptr); +#endif
/*
* record malloc arena start
*/
#if defined(CONFIG_SYS_MALLOC_F)
- top -= CONFIG_SYS_MALLOC_F_LEN;
- gd->malloc_base = top;
- /* go down one malloc arena */
- gd->malloc_base = base;
#if defined(CONFIG_SYS_MALLOC_F) base -= CONFIG_SYS_MALLOC_F_LEN; gd->malloc_base = base; #endif
- base += CONFIG_SYS_MALLOC_F_LEN;
Useless statement.
Best regards, Thomas

Hello Thomas,
On Sat, 14 Nov 2015 14:23:31 +0800, Thomas Chou thomas@wytron.com.tw wrote:
Hi Albert,
On 2015年11月13日 22:43, Albert ARIBAUD wrote:
+/*
- Initialize reserved space (which has been safely allocated on the C
- stack from the C runtime environment handling code).
- Do not use 'gd->' until arch_setup_gd() has been called!
- */
+void board_init_f_init_reserve(ulong base) { struct global_data *gd_ptr; #ifndef _USE_MEMCPY int *ptr; #endif
- /* Leave space for the stack we are running with now */
- top -= 0x40;
- /*
* clear GD entirely
*/
- top -= sizeof(struct global_data);
- top = ALIGN(top, 16);
- gd_ptr = (struct global_data *)top;
- gd_ptr = (struct global_data *)base;
- /* go down one GD plus 16-byte alignment */
- base += roundup(sizeof(struct global_data), 16);
Maybe it can keep the original order, top--gd--malloc--base.
Actually, I inverted the two between v2 and v3 for a reason, but I should have made it more explicit.
This reason is, for architectures which may not be able to write to gd from arch_setup_gd(), the C runtime environment handling code (crt0.S for ARM, etc) needs a way to easily find our where gd_ptr was allocated.
So, it was either:
- keeping *gd_ptr above the malloc arena and having to pass gd_ptr back to board_init_f_reserve_size's caller; but board_init_f_reserve_size already returns a value back to its caller, and returning two values from a C function takes resources (a pointer argument and a memory location at least);
or:
- putting *gd_ptr at the base of the allocated space, below the malloc arena, so that the C runtime environment handling code knows where to point gd to without incurring the cost of passing additional results back from board_init_f_reserve_init.
This is the reason why I placed the global data below the malloc arena.
[besides, considering that our malloc implementation starts from the base of the arena and allocates upward, it is (admittedly very slightly) safer to have GD placed below it than above.]
I will add a note in v4 about the reason for this (re)ordering of allocations.
- base += CONFIG_SYS_MALLOC_F_LEN;
Useless statement.
Right now it is indeed useless. However, if and when some other space gets reserved and initialized besides GD and the malloc arena, this statement will be needed before allocating / accessing the space allocated above the malloc arena.
Note that this statement will not result in any useless *code*, as the compiler's optimizer detects that base remains unused after it, and thus removes it from the generated object code.
So I'll insist on keeping this statement, although I will add a comment next to it explaining why it matters despite not having any visible effect.
Best regards, Thomas
Amicalement,

Hi Albert,
On 2015年11月14日 23:06, Albert ARIBAUD wrote:
Hello Thomas,
On Sat, 14 Nov 2015 14:23:31 +0800, Thomas Chou thomas@wytron.com.tw wrote:
Hi Albert,
On 2015年11月13日 22:43, Albert ARIBAUD wrote:
+/*
- Initialize reserved space (which has been safely allocated on the C
- stack from the C runtime environment handling code).
- Do not use 'gd->' until arch_setup_gd() has been called!
- */
+void board_init_f_init_reserve(ulong base) { struct global_data *gd_ptr; #ifndef _USE_MEMCPY int *ptr; #endif
- /* Leave space for the stack we are running with now */
- top -= 0x40;
- /*
* clear GD entirely
*/
- top -= sizeof(struct global_data);
- top = ALIGN(top, 16);
- gd_ptr = (struct global_data *)top;
- gd_ptr = (struct global_data *)base;
- /* go down one GD plus 16-byte alignment */
- base += roundup(sizeof(struct global_data), 16);
Maybe it can keep the original order, top--gd--malloc--base.
Actually, I inverted the two between v2 and v3 for a reason, but I should have made it more explicit.
This reason is, for architectures which may not be able to write to gd from arch_setup_gd(), the C runtime environment handling code (crt0.S for ARM, etc) needs a way to easily find our where gd_ptr was allocated.
So, it was either:
- keeping *gd_ptr above the malloc arena and having to pass gd_ptr back to board_init_f_reserve_size's caller; but board_init_f_reserve_size already returns a value back to its caller, and returning two values from a C function takes resources (a pointer argument and a memory location at least);
or:
- putting *gd_ptr at the base of the allocated space, below the malloc arena, so that the C runtime environment handling code knows where to point gd to without incurring the cost of passing additional results back from board_init_f_reserve_init.
This is the reason why I placed the global data below the malloc arena.
[besides, considering that our malloc implementation starts from the base of the arena and allocates upward, it is (admittedly very slightly) safer to have GD placed below it than above.]
Agree.
I will add a note in v4 about the reason for this (re)ordering of allocations.
- base += CONFIG_SYS_MALLOC_F_LEN;
Useless statement.
Right now it is indeed useless. However, if and when some other space gets reserved and initialized besides GD and the malloc arena, this statement will be needed before allocating / accessing the space allocated above the malloc arena.
Note that this statement will not result in any useless *code*, as the compiler's optimizer detects that base remains unused after it, and thus removes it from the generated object code.
So I'll insist on keeping this statement, although I will add a comment next to it explaining why it matters despite not having any visible effect.
OK.
Best regards, Thomas
participants (3)
-
Albert ARIBAUD
-
Simon Glass
-
Thomas Chou