[PATCH v3 0/4] malloc: Reduce size by initializing data at runtime

In my efforts to get SPL to fit into flash after some changes I made, I noticed that av_ is one of the largest variables in SPL. As it turns out, we can generate it at runtime, and the code is already there. This has the potential to save 1-2k across the board, for some (very) minor boot time increase.
This series is based on [1], since this makes checking for SYS_MALLOC_F easier. Passing CI at [2].
To measure the boot time difference, I applied the following patch:
--- common/board_r.c | 5 +++++ common/spl/spl.c | 4 ++++ 2 files changed, 9 insertions(+)
diff --git a/common/board_r.c b/common/board_r.c index 58a5986aa54..ca624b20d46 100644 --- a/common/board_r.c +++ b/common/board_r.c @@ -194,6 +194,7 @@ static int initr_barrier(void) return 0; }
+static ulong malloc_begin, malloc_end; static int initr_malloc(void) { ulong malloc_start; @@ -208,8 +209,10 @@ static int initr_malloc(void) * reserve_noncached(). */ malloc_start = gd->relocaddr - TOTAL_MALLOC_LEN; + malloc_begin = timer_get_boot_us(); mem_malloc_init((ulong)map_sysmem(malloc_start, TOTAL_MALLOC_LEN), TOTAL_MALLOC_LEN); + malloc_end = timer_get_boot_us(); gd->flags |= GD_FLG_FULL_MALLOC_INIT; return 0; } @@ -570,6 +573,8 @@ static int dm_announce(void)
static int run_main_loop(void) { + printf("malloc_init took %luus (%lu %lu)\n", malloc_end - malloc_begin, + malloc_begin, malloc_end); #ifdef CONFIG_SANDBOX sandbox_main_loop_init(); #endif diff --git a/common/spl/spl.c b/common/spl/spl.c index d74acec10b5..b34d1f4b4e6 100644 --- a/common/spl/spl.c +++ b/common/spl/spl.c @@ -755,7 +755,9 @@ void board_init_r(gd_t *dummy1, ulong dummy2) spl_set_bd();
#if defined(CONFIG_SYS_SPL_MALLOC) + ulong malloc_begin = timer_get_boot_us(); mem_malloc_init(SYS_SPL_MALLOC_START, CONFIG_SYS_SPL_MALLOC_SIZE); + ulong malloc_end = timer_get_boot_us(); gd->flags |= GD_FLG_FULL_MALLOC_INIT; #endif if (!(gd->flags & GD_FLG_SPL_INIT)) { @@ -817,6 +819,8 @@ void board_init_r(gd_t *dummy1, ulong dummy2) spl_image.boot_device = BOOT_DEVICE_NONE; board_boot_order(spl_boot_list);
+ printf("malloc_init took %luus (%lu %lu)\n", malloc_end - malloc_begin, + malloc_begin, malloc_end); ret = boot_from_devices(&spl_image, spl_boot_list, ARRAY_SIZE(spl_boot_list)); if (ret) { -- 2.25.1
I found that MALLOC_CLEAR_ON_INIT dominated the mem_malloc_init time (taking around 150 ms in SPL on my board). After disabling it, I found that MALLOC_RUNTIME_INIT took around 5 us on average.
[1] https://lore.kernel.org/u-boot/20230926141514.2101787-1-sjg@chromium.org/ [2] https://source.denx.de/u-boot/custodians/u-boot-clk/-/pipelines/17900
Changes in v3: - Use CONFIG_IS_ENABLED in conditionals - Don't enable SPL_SYS_MALLOC_RUNTIME_INIT if we are short on BSS
Changes in v2: - Only mark malloc initialized after mem_malloc_init - Fix cALLOc condition
Sean Anderson (4): common: Only mark malloc initialized after mem_malloc_init malloc: Don't use ifdefs for SYS_MALLOC_DEFAULT_TO_INIT malloc: Don't statically initialize av_ if using malloc_init malloc: Enable SYS_MALLOC_RUNTIME_INIT by default in SPL
Kconfig | 27 +++++++++++++++++---------- common/board_r.c | 3 ++- common/dlmalloc.c | 16 ++++++++-------- 3 files changed, 27 insertions(+), 19 deletions(-)

Instead of marking malloc as initialized as soon as relocation is done, defer it until after we call mem_malloc_init. This ensures that malloc initialization is done before we switch away from simple_malloc, and matches the SPL behavior.
Fixes: c9356be3074 ("dm: Split the simple malloc() implementation into its own file") Signed-off-by: Sean Anderson <sean.anderson@seco.com> Reviewed-by: Simon Glass <sjg@chromium.org> ---
(no changes since v2)
Changes in v2: - New
common/board_r.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/common/board_r.c b/common/board_r.c index 52786901be5..cd1e5a3a4c7 100644 --- a/common/board_r.c +++ b/common/board_r.c @@ -98,7 +98,7 @@ static int initr_trace(void) static int initr_reloc(void) { /* tell others: relocation done */ - gd->flags |= GD_FLG_RELOC | GD_FLG_FULL_MALLOC_INIT; + gd->flags |= GD_FLG_RELOC;
return 0; } @@ -204,6 +204,7 @@ static int initr_malloc(void) gd_set_malloc_start(start); mem_malloc_init((ulong)map_sysmem(start, TOTAL_MALLOC_LEN), TOTAL_MALLOC_LEN); + gd->flags |= GD_FLG_FULL_MALLOC_INIT; return 0; }

With CONFIG_IS_ENABLED we can eliminate some ifdefs.
Signed-off-by: Sean Anderson <sean.anderson@seco.com> Reviewed-by: Heinrich Schuchardt <xypron.glpk@gmx.de> Reviewed-by: Simon Glass <sjg@chromium.org> ---
(no changes since v1)
common/dlmalloc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/common/dlmalloc.c b/common/dlmalloc.c index c2f5a7347b8..c7cd7815a27 100644 --- a/common/dlmalloc.c +++ b/common/dlmalloc.c @@ -575,9 +575,7 @@ static mbinptr av_[NAV * 2 + 2] = { IAV(120), IAV(121), IAV(122), IAV(123), IAV(124), IAV(125), IAV(126), IAV(127) };
-#ifdef CONFIG_SYS_MALLOC_DEFAULT_TO_INIT static void malloc_init(void); -#endif
ulong mem_malloc_start = 0; ulong mem_malloc_end = 0; @@ -612,9 +610,8 @@ void mem_malloc_init(ulong start, ulong size) mem_malloc_end = start + size; mem_malloc_brk = start;
-#ifdef CONFIG_SYS_MALLOC_DEFAULT_TO_INIT - malloc_init(); -#endif + if (CONFIG_IS_ENABLED(SYS_MALLOC_DEFAULT_TO_INIT)) + malloc_init();
debug("using memory %#lx-%#lx for malloc()\n", mem_malloc_start, mem_malloc_end); @@ -719,7 +716,6 @@ static unsigned int max_n_mmaps = 0; static unsigned long max_mmapped_mem = 0; #endif
-#ifdef CONFIG_SYS_MALLOC_DEFAULT_TO_INIT static void malloc_init(void) { int i, j; @@ -748,7 +744,6 @@ static void malloc_init(void) memset((void *)&current_mallinfo, 0, sizeof(struct mallinfo)); #endif } -#endif
/* Debugging support

When we enable malloc_init, there is no need to statically initialize av_, since we are going to do it manually. This lets us move av_ to .bss, saving around 1-2k of data (depending on the pointer size).
cALLOc must be adjusted to not access top before malloc_init.
While we're at it, rename/reword the Kconfig to better describe what this option does.
Signed-off-by: Sean Anderson <sean.anderson@seco.com> Reviewed-by: Simon Glass <sjg@chromium.org> ---
Changes in v3: - Use CONFIG_IS_ENABLED in conditionals
Changes in v2: - Fix cALLOc condition
Kconfig | 18 +++++++----------- common/dlmalloc.c | 9 +++++++-- 2 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/Kconfig b/Kconfig index a3061c86124..3967613b232 100644 --- a/Kconfig +++ b/Kconfig @@ -405,18 +405,14 @@ if EXPERT When disabling this, please check if malloc calls, maybe should be replaced by calloc - if one expects zeroed memory.
-config SYS_MALLOC_DEFAULT_TO_INIT - bool "Default malloc to init while reserving the memory for it" +config SYS_MALLOC_RUNTIME_INIT + bool "Initialize malloc's internal data at runtime" help - It may happen that one needs to move the dynamic allocation - from one to another memory range, eg. when moving the malloc - from the limited static to a potentially large dynamic (DDR) - memory. - - If so then on top of setting the updated memory aside one - needs to bring the malloc init. - - If such a scenario is sought choose yes. + Initialize malloc's internal data structures at runtime, rather than + at compile-time. This is necessary if relocating the malloc arena + from a smaller static memory to a large DDR memory. It can also + reduce the size of U-Boot by letting malloc's data reside in .bss + instead of .data.
config TOOLS_DEBUG bool "Enable debug information for tools" diff --git a/common/dlmalloc.c b/common/dlmalloc.c index c7cd7815a27..ec1a20fc3d7 100644 --- a/common/dlmalloc.c +++ b/common/dlmalloc.c @@ -556,6 +556,7 @@ typedef struct malloc_chunk* mbinptr; #define IAV(i) bin_at(i), bin_at(i)
static mbinptr av_[NAV * 2 + 2] = { +#if !CONFIG_IS_ENABLED(SYS_MALLOC_RUNTIME_INIT) NULL, NULL, IAV(0), IAV(1), IAV(2), IAV(3), IAV(4), IAV(5), IAV(6), IAV(7), IAV(8), IAV(9), IAV(10), IAV(11), IAV(12), IAV(13), IAV(14), IAV(15), @@ -573,6 +574,7 @@ static mbinptr av_[NAV * 2 + 2] = { IAV(104), IAV(105), IAV(106), IAV(107), IAV(108), IAV(109), IAV(110), IAV(111), IAV(112), IAV(113), IAV(114), IAV(115), IAV(116), IAV(117), IAV(118), IAV(119), IAV(120), IAV(121), IAV(122), IAV(123), IAV(124), IAV(125), IAV(126), IAV(127) +#endif };
static void malloc_init(void); @@ -610,7 +612,7 @@ void mem_malloc_init(ulong start, ulong size) mem_malloc_end = start + size; mem_malloc_brk = start;
- if (CONFIG_IS_ENABLED(SYS_MALLOC_DEFAULT_TO_INIT)) + if (CONFIG_IS_ENABLED(SYS_MALLOC_RUNTIME_INIT)) malloc_init();
debug("using memory %#lx-%#lx for malloc()\n", mem_malloc_start, @@ -2137,7 +2139,10 @@ Void_t* cALLOc(n, elem_size) size_t n; size_t elem_size; #ifdef CONFIG_SYS_MALLOC_CLEAR_ON_INIT #if MORECORE_CLEARS mchunkptr oldtop = top; - INTERNAL_SIZE_T oldtopsize = chunksize(top); + INTERNAL_SIZE_T oldtopsize; + if (!CONFIG_IS_ENABLED(SYS_MALLOC_F) || + (gd->flags & GD_FLG_FULL_MALLOC_INIT)) + oldtopsize = chunksize(top); #endif #endif Void_t* mem = mALLOc (sz);

On boards with size restrictions, 1-2k can be a significant fraction of the binary size. Add a new SPL version of SYS_MALLOC_RUNTIME_INIT. As this trades text size for BSS size, enable it by default only for boards with at least 16k of BSS.
Signed-off-by: Sean Anderson <sean.anderson@seco.com> ---
Changes in v3: - Don't enable SPL_SYS_MALLOC_RUNTIME_INIT if we are short on BSS
Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/Kconfig b/Kconfig index 3967613b232..da5986e2fe8 100644 --- a/Kconfig +++ b/Kconfig @@ -414,6 +414,17 @@ config SYS_MALLOC_RUNTIME_INIT reduce the size of U-Boot by letting malloc's data reside in .bss instead of .data.
+config SPL_SYS_MALLOC_RUNTIME_INIT + bool "Initialize malloc's internal data at runtime in SPL" + default y if !SPL_BSS_LIMIT || SPL_BSS_MAX_SIZE >= 0x1000 + depends on SPL + help + Initialize malloc's internal data structures at SPL runtime, rather + than at compile-time. This is necessary if relocating the malloc arena + from a smaller static memory to a large DDR memory. It can also reduce + the size of U-Boot by letting malloc's data reside in .bss instead of + .data. + config TOOLS_DEBUG bool "Enable debug information for tools" help

On Thu, 28 Sept 2023 at 08:45, Sean Anderson sean.anderson@seco.com wrote:
On boards with size restrictions, 1-2k can be a significant fraction of the binary size. Add a new SPL version of SYS_MALLOC_RUNTIME_INIT. As this trades text size for BSS size, enable it by default only for boards with at least 16k of BSS.
Signed-off-by: Sean Anderson sean.anderson@seco.com
Changes in v3:
- Don't enable SPL_SYS_MALLOC_RUNTIME_INIT if we are short on BSS
Kconfig | 11 +++++++++++ 1 file changed, 11 insertions(+)
Reviewed-by: Simon Glass sjg@chromium.org

Hi Sean,
On Thu, 28 Sept 2023 at 08:45, Sean Anderson sean.anderson@seco.com wrote:
In my efforts to get SPL to fit into flash after some changes I made, I noticed that av_ is one of the largest variables in SPL. As it turns out, we can generate it at runtime, and the code is already there. This has the potential to save 1-2k across the board, for some (very) minor boot time increase.
This series is based on [1], since this makes checking for SYS_MALLOC_F easier. Passing CI at [2].
To measure the boot time difference, I applied the following patch:
common/board_r.c | 5 +++++ common/spl/spl.c | 4 ++++ 2 files changed, 9 insertions(+)
diff --git a/common/board_r.c b/common/board_r.c index 58a5986aa54..ca624b20d46 100644 --- a/common/board_r.c +++ b/common/board_r.c @@ -194,6 +194,7 @@ static int initr_barrier(void) return 0; }
+static ulong malloc_begin, malloc_end; static int initr_malloc(void) { ulong malloc_start; @@ -208,8 +209,10 @@ static int initr_malloc(void) * reserve_noncached(). */ malloc_start = gd->relocaddr - TOTAL_MALLOC_LEN;
malloc_begin = timer_get_boot_us();
Perhaps this would be better done with bootstage, since then the timing can be enabled / disabled, and reported along with other timings.
mem_malloc_init((ulong)map_sysmem(malloc_start, TOTAL_MALLOC_LEN), TOTAL_MALLOC_LEN);
malloc_end = timer_get_boot_us(); gd->flags |= GD_FLG_FULL_MALLOC_INIT; return 0;
} @@ -570,6 +573,8 @@ static int dm_announce(void)
static int run_main_loop(void) {
printf("malloc_init took %luus (%lu %lu)\n", malloc_end - malloc_begin,
malloc_begin, malloc_end);
#ifdef CONFIG_SANDBOX sandbox_main_loop_init(); #endif diff --git a/common/spl/spl.c b/common/spl/spl.c index d74acec10b5..b34d1f4b4e6 100644 --- a/common/spl/spl.c +++ b/common/spl/spl.c @@ -755,7 +755,9 @@ void board_init_r(gd_t *dummy1, ulong dummy2) spl_set_bd();
#if defined(CONFIG_SYS_SPL_MALLOC)
ulong malloc_begin = timer_get_boot_us(); mem_malloc_init(SYS_SPL_MALLOC_START, CONFIG_SYS_SPL_MALLOC_SIZE);
ulong malloc_end = timer_get_boot_us(); gd->flags |= GD_FLG_FULL_MALLOC_INIT;
#endif if (!(gd->flags & GD_FLG_SPL_INIT)) { @@ -817,6 +819,8 @@ void board_init_r(gd_t *dummy1, ulong dummy2) spl_image.boot_device = BOOT_DEVICE_NONE; board_boot_order(spl_boot_list);
printf("malloc_init took %luus (%lu %lu)\n", malloc_end - malloc_begin,
malloc_begin, malloc_end);
debug() ?
ret = boot_from_devices(&spl_image, spl_boot_list, ARRAY_SIZE(spl_boot_list)); if (ret) {
-- 2.25.1
I found that MALLOC_CLEAR_ON_INIT dominated the mem_malloc_init time (taking around 150 ms in SPL on my board). After disabling it, I found that MALLOC_RUNTIME_INIT took around 5 us on average.
[1] https://lore.kernel.org/u-boot/20230926141514.2101787-1-sjg@chromium.org/ [2] https://source.denx.de/u-boot/custodians/u-boot-clk/-/pipelines/17900
Changes in v3:
- Use CONFIG_IS_ENABLED in conditionals
- Don't enable SPL_SYS_MALLOC_RUNTIME_INIT if we are short on BSS
Changes in v2:
- Only mark malloc initialized after mem_malloc_init
- Fix cALLOc condition
Sean Anderson (4): common: Only mark malloc initialized after mem_malloc_init malloc: Don't use ifdefs for SYS_MALLOC_DEFAULT_TO_INIT malloc: Don't statically initialize av_ if using malloc_init malloc: Enable SYS_MALLOC_RUNTIME_INIT by default in SPL
Kconfig | 27 +++++++++++++++++---------- common/board_r.c | 3 ++- common/dlmalloc.c | 16 ++++++++-------- 3 files changed, 27 insertions(+), 19 deletions(-)
-- 2.35.1.1320.gc452695387.dirty
Regards, Simon

On 10/1/23 21:16, Simon Glass wrote:
Hi Sean,
On Thu, 28 Sept 2023 at 08:45, Sean Anderson sean.anderson@seco.com wrote:
In my efforts to get SPL to fit into flash after some changes I made, I noticed that av_ is one of the largest variables in SPL. As it turns out, we can generate it at runtime, and the code is already there. This has the potential to save 1-2k across the board, for some (very) minor boot time increase.
This series is based on [1], since this makes checking for SYS_MALLOC_F easier. Passing CI at [2].
To measure the boot time difference, I applied the following patch:
common/board_r.c | 5 +++++ common/spl/spl.c | 4 ++++ 2 files changed, 9 insertions(+)
diff --git a/common/board_r.c b/common/board_r.c index 58a5986aa54..ca624b20d46 100644 --- a/common/board_r.c +++ b/common/board_r.c @@ -194,6 +194,7 @@ static int initr_barrier(void) return 0; }
+static ulong malloc_begin, malloc_end; static int initr_malloc(void) { ulong malloc_start; @@ -208,8 +209,10 @@ static int initr_malloc(void) * reserve_noncached(). */ malloc_start = gd->relocaddr - TOTAL_MALLOC_LEN;
malloc_begin = timer_get_boot_us();
Perhaps this would be better done with bootstage, since then the timing can be enabled / disabled, and reported along with other timings.
I'll try that out next time.
mem_malloc_init((ulong)map_sysmem(malloc_start, TOTAL_MALLOC_LEN), TOTAL_MALLOC_LEN);
malloc_end = timer_get_boot_us(); gd->flags |= GD_FLG_FULL_MALLOC_INIT; return 0;
} @@ -570,6 +573,8 @@ static int dm_announce(void)
static int run_main_loop(void) {
printf("malloc_init took %luus (%lu %lu)\n", malloc_end - malloc_begin,
malloc_begin, malloc_end);
#ifdef CONFIG_SANDBOX sandbox_main_loop_init(); #endif
diff --git a/common/spl/spl.c b/common/spl/spl.c index d74acec10b5..b34d1f4b4e6 100644 --- a/common/spl/spl.c +++ b/common/spl/spl.c @@ -755,7 +755,9 @@ void board_init_r(gd_t *dummy1, ulong dummy2) spl_set_bd();
#if defined(CONFIG_SYS_SPL_MALLOC)
ulong malloc_begin = timer_get_boot_us(); mem_malloc_init(SYS_SPL_MALLOC_START, CONFIG_SYS_SPL_MALLOC_SIZE);
ulong malloc_end = timer_get_boot_us(); gd->flags |= GD_FLG_FULL_MALLOC_INIT;
#endif if (!(gd->flags & GD_FLG_SPL_INIT)) {
@@ -817,6 +819,8 @@ void board_init_r(gd_t *dummy1, ulong dummy2) spl_image.boot_device = BOOT_DEVICE_NONE; board_boot_order(spl_boot_list);
printf("malloc_init took %luus (%lu %lu)\n", malloc_end - malloc_begin,
malloc_begin, malloc_end);
debug() ?
Well, this is not going to be applied, so I used the easiest thing :)
--Sean
ret = boot_from_devices(&spl_image, spl_boot_list, ARRAY_SIZE(spl_boot_list)); if (ret) {
-- 2.25.1
I found that MALLOC_CLEAR_ON_INIT dominated the mem_malloc_init time (taking around 150 ms in SPL on my board). After disabling it, I found that MALLOC_RUNTIME_INIT took around 5 us on average.
[1] https://lore.kernel.org/u-boot/20230926141514.2101787-1-sjg@chromium.org/ [2] https://source.denx.de/u-boot/custodians/u-boot-clk/-/pipelines/17900
Changes in v3:
- Use CONFIG_IS_ENABLED in conditionals
- Don't enable SPL_SYS_MALLOC_RUNTIME_INIT if we are short on BSS
Changes in v2:
- Only mark malloc initialized after mem_malloc_init
- Fix cALLOc condition
Sean Anderson (4): common: Only mark malloc initialized after mem_malloc_init malloc: Don't use ifdefs for SYS_MALLOC_DEFAULT_TO_INIT malloc: Don't statically initialize av_ if using malloc_init malloc: Enable SYS_MALLOC_RUNTIME_INIT by default in SPL
Kconfig | 27 +++++++++++++++++---------- common/board_r.c | 3 ++- common/dlmalloc.c | 16 ++++++++-------- 3 files changed, 27 insertions(+), 19 deletions(-)
-- 2.35.1.1320.gc452695387.dirty
Regards, Simon
participants (3)
-
Sean Anderson
-
Sean Anderson
-
Simon Glass