[U-Boot] [PATCH 0/7] tegra20: enable thumb

This patch series enables Thumb compilation for the Tegra SPL and U-Boot. It is not ready for review yet as it still contains a few hacks, but I'm posting it in case anyone finds it useful. It reduces the size of the combined SPL/U-Boot image by about 20%. I haven't measured whether there's any change in boot time.
This patch series depends on v6 of the SPL patch series.
This patch series is also available from: git://github.com/arm000/u-boot.git branch: tegra-thumb-v1
[PATCH 1/7] tegra20: remove inline assembly for u32 cast
[PATCH 2/7] HACK: rearrange link order for thumb
[PATCH 3/7] tegra20: enable thumb build
[PATCH 4/7] arm: add _thumb1_case_uqi to libgcc
[PATCH 5/7] arm: add thumb compatible return instructions
[PATCH 6/7] arm: use thumb compatible return in arm720t
[PATCH 7/7] arm: change arm720t to armv4t
 Makefile                                   |  8 +++---
 arch/arm/cpu/arm720t/config.mk             |  2 +-
 arch/arm/cpu/arm720t/start.S               |  2 +-
 arch/arm/cpu/arm720t/tegra20/config.mk     |  7 +++++
 arch/arm/cpu/tegra20-common/warmboot_avp.c |  9 +-----
 arch/arm/lib/Makefile                      |  1 +
 arch/arm/lib/_thumb1_case_uqi.S            | 41 ++++++++++++++++++++++++++++
 arch/arm/lib/_udivsi3.S                    |  6 ++--
 include/configs/tegra20-common.h           |  1 +
 9 files changed, 60 insertions(+), 17 deletions(-)

This inline assembly just converts a function address to a u32. Replace it with equivalent C code since the assembly was not thumb compatible.
Signed-off-by: Allen Martin <amartin@nvidia.com>
---
 arch/arm/cpu/tegra20-common/warmboot_avp.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/arch/arm/cpu/tegra20-common/warmboot_avp.c b/arch/arm/cpu/tegra20-common/warmboot_avp.c
index cd01908..0a7f09f 100644
--- a/arch/arm/cpu/tegra20-common/warmboot_avp.c
+++ b/arch/arm/cpu/tegra20-common/warmboot_avp.c
@@ -51,14 +51,7 @@ void wb_start(void)
 	/* enable JTAG & TBE */
 	writel(CONFIG_CTL_TBE | CONFIG_CTL_JTAG, &pmt->pmt_cfg_ctl);
 
-	/* Are we running where we're supposed to be? */
-	asm volatile (
-		"adr	%0, wb_start;"	/* reg: wb_start address */
-		: "=r"(reg)		/* output */
-		/* no input, no clobber list */
-	);
-
-	if (reg != AP20_WB_RUN_ADDRESS)
+	if ((u32)wb_start != AP20_WB_RUN_ADDRESS)
 		goto do_reset;
/* Are we running with AVP? */

Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Signed-off-by: Allen Martin <amartin@nvidia.com>
---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/Makefile b/Makefile
index ff04503..75d1c96 100644
--- a/Makefile
+++ b/Makefile
@@ -232,9 +232,6 @@ LIBS += lib/zlib/libz.o
 LIBS += $(shell if [ -f board/$(VENDOR)/common/Makefile ]; then echo \
 	"board/$(VENDOR)/common/lib$(VENDOR).o"; fi)
 LIBS += $(CPUDIR)/lib$(CPU).o
-ifdef SOC
-LIBS += $(CPUDIR)/$(SOC)/lib$(SOC).o
-endif
 ifeq ($(CPU),ixp)
 LIBS += arch/arm/cpu/ixp/npe/libnpe.o
 endif
@@ -301,6 +298,9 @@ LIBS += common/libcommon.o
 LIBS += lib/libfdt/libfdt.o
 LIBS += api/libapi.o
 LIBS += post/libpost.o
+ifdef SOC
+LIBS += $(CPUDIR)/$(SOC)/lib$(SOC).o
+endif
 
 ifneq ($(CONFIG_AM33XX)$(CONFIG_OMAP34XX)$(CONFIG_OMAP44XX)$(CONFIG_OMAP54XX),)
 LIBS += $(CPUDIR)/omap-common/libomap-common.o
@@ -323,7 +323,7 @@ ifeq ($(SOC),tegra20)
 LIBS += arch/$(ARCH)/cpu/$(SOC)-common/lib$(SOC)-common.o
 endif
 
-LIBS := $(addprefix $(obj),$(sort $(LIBS)))
+LIBS := $(addprefix $(obj),$(LIBS))
 .PHONY : $(LIBS)
LIBBOARD = board/$(BOARDDIR)/lib$(BOARD).o

On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.

On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>:
   2:	2000      	movs	r0, #0
   4:	f04f 31ff 	mov.w	r1, #4294967295	; 0xffffffff
   8:	e7fe      	b.n	0 <__flush_cache>
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
-Allen
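For illustration, here is a minimal C sketch of the pattern under discussion (the names mirror arch/arm/lib/cache.c but the bodies are simplified, not the actual source): the caller makes a tail call to a weak alias, so with sibling-call optimization enabled the Thumb compiler can emit a short b.n to the alias target it sees locally, while the final link may resolve the symbol to a definition that is out of range for the 11-bit branch.

/* sketch only: simplified weak-alias tail-call pattern */
void __flush_cache(unsigned long start, unsigned long size)
{
	/* default implementation */
}

/* weak alias: a board/SoC file may provide a strong flush_cache() elsewhere */
void flush_cache(unsigned long start, unsigned long size)
	__attribute__((weak, alias("__flush_cache")));

void flush_dcache_all(void)
{
	/*
	 * Tail call: with -mthumb and sibling-call optimization gcc can
	 * emit a short "b.n" to the local alias target, but the linker may
	 * resolve flush_cache to a strong definition that is too far away
	 * for the short branch (hence the R_ARM_THM_JUMP11 truncation).
	 */
	flush_cache(0, ~0UL);
}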

On 07/06/2012 02:33 PM, Allen Martin wrote:
On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>: 2: 2000 movs r0, #0 4: f04f 31ff mov.w r1, #4294967295 ; 0xffffffff 8: e7fe b.n 0 <__flush_cache>
Ah, that explanation makes sense.
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
It seems like a bug that the b-vs-b.n optimization is applied to a weak symbol, since the compiler can't possibly know the range of the jump.
Also, I've seen ld for some architectures rewrite the equivalent of b.n to plain b when needing to expand the branch target range; IIRC a process known as "relaxing"? Perhaps gcc is expecting ld to do that, but ld isn't?

On Fri, Jul 06, 2012 at 01:44:32PM -0700, Stephen Warren wrote:
On 07/06/2012 02:33 PM, Allen Martin wrote:
On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>: 2: 2000 movs r0, #0 4: f04f 31ff mov.w r1, #4294967295 ; 0xffffffff 8: e7fe b.n 0 <__flush_cache>
Ah, that explanation makes sense.
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
It seems like a bug that the b-vs-b.n optimization is applied to a weak symbol, since the compiler can't possibly know the range of the jump.
Also, I've seen ld for some architectures rewrite the equivalent of b.n to plain b when needing to expand the branch target range; IIRC a process known as "relaxing"? Perhaps gcc is expecting ld to do that, but ld isn't?
I found this issue online, which if not the exact same problem, is very similar:
http://lists.linaro.org/pipermail/linaro-dev/2011-February/002886.html
-Allen

On Fri, Jul 06, 2012 at 01:44:32PM -0700, Stephen Warren wrote:
On 07/06/2012 02:33 PM, Allen Martin wrote:
On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>: 2: 2000 movs r0, #0 4: f04f 31ff mov.w r1, #4294967295 ; 0xffffffff 8: e7fe b.n 0 <__flush_cache>
Ah, that explanation makes sense.
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
It seems like a bug that the b-vs-b.n optimization is applied to a weak symbol, since the compiler can't possibly know the range of the jump.
Also, I've seen ld for some architectures rewrite the equivalent of b.n to plain b when needing to expand the branch target range; IIRC a process known as "relaxing"? Perhaps gcc is expecting ld to do that, but ld isn't?
I verified the "-fno-optimize-sibling-calls" compiler option works around this. So here's the improved patch:
Author: Allen Martin <amartin@nvidia.com>
Date:   Fri Jul 6 15:58:24 2012 -0700
tegra20: disable local branch optimization
This works around a bug when compiling in thumb mode when branching to a weak aliased function. The compiler will optimize the branch to a short branch (b.n instruction) but the linker may resolve the target to a function that is too far away for a short branch.
Signed-off-by: Allen Martin <amartin@nvidia.com>
diff --git a/arch/arm/cpu/armv7/tegra20/config.mk b/arch/arm/cpu/armv7/tegra20/config.mk
index 6432e75..56bb830 100644
--- a/arch/arm/cpu/armv7/tegra20/config.mk
+++ b/arch/arm/cpu/armv7/tegra20/config.mk
@@ -24,3 +24,8 @@
 # MA 02111-1307 USA
 #
 CONFIG_ARCH_DEVICE_TREE := tegra20
+
+# this works around a bug when compiling thumb where some branches
+# are incorrectly optimized to short branches when they shouldn't
+# be
+PLATFORM_RELFLAGS += -fno-optimize-sibling-calls

On Fri, Jul 06, 2012 at 01:44:32PM -0700, Stephen Warren wrote:
On 07/06/2012 02:33 PM, Allen Martin wrote:
On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>: 2: 2000 movs r0, #0 4: f04f 31ff mov.w r1, #4294967295 ; 0xffffffff 8: e7fe b.n 0 <__flush_cache>
Ah, that explanation makes sense.
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
It seems like a bug that the b-vs-b.n optimization is applied to a weak symbol, since the compiler can't possibly know the range of the jump.
Also, I've seen ld for some architectures rewrite the equivalent of b.n to plain b when needing to expand the branch target range; IIRC a process known as "relaxing"? Perhaps gcc is expecting ld to do that, but ld isn't?
And I forgot to mention, the code bloat from disabling the optimization is about 400 bytes (185136 -> 185540), so it's not bad, but it still seems a shame to disable all short branches because of one misoptimized one.
-Allen

Hi Allen,
On Fri, 6 Jul 2012 16:17:19 -0700, Allen Martin amartin@nvidia.com wrote:
On Fri, Jul 06, 2012 at 01:44:32PM -0700, Stephen Warren wrote:
On 07/06/2012 02:33 PM, Allen Martin wrote:
On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>: 2: 2000 movs r0, #0 4: f04f 31ff mov.w r1, #4294967295 ; 0xffffffff 8: e7fe b.n 0 <__flush_cache>
Ah, that explanation makes sense.
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
It seems like a bug that the b-vs-b.n optimization is applied to a weak symbol, since the compiler can't possibly know the range of the jump.
Also, I've seen ld for some architectures rewrite the equivalent of b.n to plain b when needing to expand the branch target range; IIRC a process known as "relaxing"? Perhaps gcc is expecting ld to do that, but ld isn't?
And I forgot to mention, the code bloat from disabling the optimization is about 400 bytes (185136 -> 185540), so it's not bad, but it still seems a shame to disable all short branches because of one misoptimized one.
Can this not be limited to compiling the object files which are known to be sensitive to the problem?
-Allen
Amicalement,

On Sat, Jul 07, 2012 at 03:15:36AM -0700, Albert ARIBAUD wrote:
Hi Allen,
On Fri, 6 Jul 2012 16:17:19 -0700, Allen Martin amartin@nvidia.com wrote:
On Fri, Jul 06, 2012 at 01:44:32PM -0700, Stephen Warren wrote:
On 07/06/2012 02:33 PM, Allen Martin wrote:
On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>: 2: 2000 movs r0, #0 4: f04f 31ff mov.w r1, #4294967295 ; 0xffffffff 8: e7fe b.n 0 <__flush_cache>
Ah, that explanation makes sense.
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
It seems like a bug that the b-vs-b.n optimization is applied to a weak symbol, since the compiler can't possibly know the range of the jump.
Also, I've seen ld for some architectures rewrite the equivalent of b.n to plain b when needing to expand the branch target range; IIRC a process known as "relaxing"? Perhaps gcc is expecting ld to do that, but ld isn't?
And I forgot to mention, the code bloat from disabling the optimization is about 400 bytes (185136 -> 185540), so it's not bad, but it still seems a shame to disable all short branches because of one misoptimized one.
Can this not be limited to compiling the object files which are known to be sensitive to the problem?
That's a possibility, although it does seem a little fragile, since I don't fully understand all the cases where the bad optimization can get inserted. I'll follow up on the toolchain lists to better understand the problem. It's still not clear to me if the linker was supposed to fix this up or not when it resolved the weak symbol.
-Allen

On Sat, Jul 07, 2012 at 03:15:36AM -0700, Albert ARIBAUD wrote:
Hi Allen,
On Fri, 6 Jul 2012 16:17:19 -0700, Allen Martin amartin@nvidia.com wrote:
On Fri, Jul 06, 2012 at 01:44:32PM -0700, Stephen Warren wrote:
On 07/06/2012 02:33 PM, Allen Martin wrote:
On Fri, Jul 06, 2012 at 12:09:43PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
Rearrange the link order of libraries to avoid out of bound relocations in thumb mode. I have no idea how to fix this for real.
Are the relocations branches or something else? It looks like unconditional jump range is +/-4MB for Thumb1 and +/-16MB for Thumb2, so I'm surprised we'd be exceeding that, considering the U-boot binary is on the order of 256KB on Tegra right now.
This is the relocation type:
arch/arm/lib/libarm.o: In function `__flush_dcache_all': /home/arm/u-boot/arch/arm/lib/cache.c:52: relocation truncated to fit: R_ARM_THM_JUMP11 against symbol `flush_cache' defined in .text section in arch/arm/cpu/armv7/libarmv7.o
The instruction is a "b.n" not a "b", which is what is causing the problem.
I think because of the weak alias the compiler used a short jump to the local function, but when it got linked it resolved to a function that was too far away for the short jump:
void flush_cache(unsigned long start, unsigned long size) __attribute__((weak, alias("__flush_cache")));
00000002 <__flush_dcache_all>: 2: 2000 movs r0, #0 4: f04f 31ff mov.w r1, #4294967295 ; 0xffffffff 8: e7fe b.n 0 <__flush_cache>
Ah, that explanation makes sense.
It looks like there's a "-fno-optimize-sibling-calls" option to gcc to avoid this problem. Seems a shame to disable all short jumps for this one case though.
It seems like a bug that the b-vs-b.n optimization is applied to a weak symbol, since the compiler can't possibly know the range of the jump.
Also, I've seen ld for some architectures rewrite the equivalent of b.n to plain b when needing to expand the branch target range; IIRC a process known as "relaxing"? Perhaps gcc is expecting ld to do that, but ld isn't?
And I forgot to mention, the code bloat from disabling the optimization is about 400 bytes (185136 -> 185540), so it's not bad, but it still seems a shame to disable all short branches because of one misoptimized one.
Can this not be limited to compiling the object files which are known to be sensitive to the problem?
I understand this issue fairly well now. It's a known bug in the assembler that has already been fixed:
http://sourceware.org/bugzilla/show_bug.cgi?id=12532
It only impacts preemptible symbols, and since U-Boot doesn't have any dynamically loadable objects, only explicitly defined weak symbols should trigger the bug.
I built a new toolchain with binutils 2.22 and verified the bug is no longer present there, and -fno-optimize-sibling-calls is the correct workaround for toolchains that do have the bug, so conditionally disabling the optimization for binutils < 2.22 seems like the right fix.
I ran a quick scrub of the u-boot tree and there's 195 instances of __attribute__((weak)) spread across 123 source files, so I think just disabling optimization on the failing object files may be too fragile, as code movement could cause others to crop up.
-Allen

Hi Allen
On Tue, Jul 10, 2012 at 10:45 AM, Allen Martin amartin@nvidia.com wrote:
On Sat, Jul 07, 2012 at 03:15:36AM -0700, Albert ARIBAUD wrote:
Hi Allen,
On Fri, 6 Jul 2012 16:17:19 -0700, Allen Martin amartin@nvidia.com wrote:
[snip]
And I forgot to mention, the code bloat from disabling the optimization is about 400 bytes (185136 -> 185540), so it's not bad, but it still seems a shame to disable all short branches because of one misoptimized one.
0.2% by my calcs
Can this not be limited to compiling the object files which are known to be sensitive to the problem?
I understand this issue fairly well now. It's a known bug in the assembler that has already been fixed:
http://sourceware.org/bugzilla/show_bug.cgi?id=12532
It only impacts preemptible symbols, and since U-Boot doesn't have any dynamically loadable objects, only explicitly defined weak symbols should trigger the bug.
I built a new toolchain with binutils 2.22 and verified the bug is no longer present there, and -fno-optimize-sibling-calls is the correct workaround for toolchains that do have the bug, so conditionally disabling the optimization for binutils < 2.22 seems like the right fix.
I ran a quick scrub of the u-boot tree and there's 195 instances of __attribute__((weak)) spread across 123 source files, so I think just disabling optimization on the failing object files may be too fragile, as code movement could cause others to crop up.
1. Adding -fno-optimize-sibling-calls for binutils < 2.22 - 0.2% code size increase for people using slightly older tools
2. Maintain the tweaking of a set of files - someone using binutils >= 2.22 adds __attribute__((weak)) to a single function and *BAM* three months later someone complains that something broke
I vote option 1
I do wonder, though, if we should spit out warnings when applying workarounds for older toolchains?
Regards,
Graeme

Hi Graeme,
On Tue, 10 Jul 2012 10:57:39 +1000, Graeme Russ graeme.russ@gmail.com wrote:
Hi Allen
On Tue, Jul 10, 2012 at 10:45 AM, Allen Martin amartin@nvidia.com wrote:
On Sat, Jul 07, 2012 at 03:15:36AM -0700, Albert ARIBAUD wrote:
Hi Allen,
On Fri, 6 Jul 2012 16:17:19 -0700, Allen Martin amartin@nvidia.com wrote:
[snip]
And I forgot to mention, the code bloat from disabling the optimization is about 400 bytes (185136 -> 185540), so it's not bad, but it still seems a shame to disable all short branches because of one misoptimized one.
0.2% by my calcs
Can this not be limited to compiling the object files which are known to be sensitive to the problem?
I understand this issue fairly well now. It's a known bug in the assembler that has already been fixed:
http://sourceware.org/bugzilla/show_bug.cgi?id=12532
It only impacts preemptible symbols, and since U-Boot doesn't have any dynamically loadable objects, only explicitly defined weak symbols should trigger the bug.
I built a new toolchain with binutils 2.22 and verified the bug is no longer present there, and -fno-optimize-sibling-calls is the correct workaround for toolchains that do have the bug, so conditionally disabling the optimization for binutils < 2.22 seems like the right fix.
I ran a quick scrub of the u-boot tree and there's 195 instances of __attribute__((weak)) spread across 123 source files, so I think just disabling optimization on the failing object files may be too fragile, as code movement could cause others to crop up.
1. Adding -fno-optimize-sibling-calls for binutils < 2.22 - 0.2% code size increase for people using slightly older tools
2. Maintain the tweaking of a set of files - someone using binutils >= 2.22 adds __attribute__((weak)) to a single function and *BAM* three months later someone complains that something broke
I vote option 1
I do wonder, though, if we should spit out warnings when applying workarounds for older toolchains?
I am against this idea: this persistent warning will either worry or annoy the reader, neither of which is good IMO. OTOH, when in the future the workaround is removed because the toolchain version it fixes is considered obsolete, *then* we shall add a warning to let developers know that they use an *unsupported* binutils version.
Meanwhile, we could mark the workaround with a FIXME note in the code, for present and future U-Boot *developers* to notice and remember what they should do with this workaround. :)
Regards,
Graeme
Amicalement,

On Thu, Jul 12, 2012 at 11:45:18AM -0700, Albert ARIBAUD wrote:
Hi Graeme,
On Tue, 10 Jul 2012 10:57:39 +1000, Graeme Russ graeme.russ@gmail.com wrote:
Hi Allen
On Tue, Jul 10, 2012 at 10:45 AM, Allen Martin amartin@nvidia.com wrote:
On Sat, Jul 07, 2012 at 03:15:36AM -0700, Albert ARIBAUD wrote:
Hi Allen,
On Fri, 6 Jul 2012 16:17:19 -0700, Allen Martin amartin@nvidia.com wrote:
[snip]
And I forgot to mention, the code bloat from disabling the optimization is about 400 bytes (185136 -> 185540), so it's not bad, but it still seems a shame to disable all short branches because of one misoptimized one.
0.2% by my calcs
Can this not be limited to compiling the object files which are known to be sensitive to the problem?
I understand this issue fairly well now. It's a known bug in the assembler that has already been fixed:
http://sourceware.org/bugzilla/show_bug.cgi?id=12532
It only impacts preemptible symbols, and since U-Boot doesn't have any dynamically loadable objects, only explicitly defined weak symbols should trigger the bug.
I built a new toolchain with binutils 2.22 and verified the bug is no longer present there, and -fno-optimize-sibling-calls is the correct workaround for toolchains that do have the bug, so conditionally disabling the optimization for binutils < 2.22 seems like the right fix.
I ran a quick scrub of the u-boot tree and there's 195 instances of __attribute__((weak)) spread across 123 source files, so I think just disabling optimization on the failing object files may be too fragile, as code movement could cause others to crop up.
1. Adding -fno-optimize-sibling-calls for binutils < 2.22 - 0.2% code size increase for people using slightly older tools
2. Maintain the tweaking of a set of files - someone using binutils >= 2.22 adds __attribute__((weak)) to a single function and *BAM* three months later someone complains that something broke
I vote option 1
I do wonder, though, if we should spit out warnings when applying workarounds for older toolchains?
I am against this idea: this persistent warning will either worry or annoy the reader, neither of which is good IMO. OTOH, when in the future the workaround is removed because the toolchain version it fixes is considered obsolete, *then* we shall add a warning to let developers know that they use an *unsupported* binutils version.
Meanwhile, we could mark the workaround with a FIXME note in the code, for present and future U-Boot *developers* to notice and remember what they should do with this workaround. :)
I'm OK either way; I added the warning at Graeme's suggestion. I'll send another patch series with a FIXME comment and the warning removed, and let you guys fight it out :^)
-Allen

Signed-off-by: Allen Martin <amartin@nvidia.com>
---
 include/configs/tegra20-common.h | 1 +
 1 file changed, 1 insertion(+)
diff --git a/include/configs/tegra20-common.h b/include/configs/tegra20-common.h
index bbb80d0..16466cc 100644
--- a/include/configs/tegra20-common.h
+++ b/include/configs/tegra20-common.h
@@ -191,6 +191,7 @@
 #define CONFIG_TEGRA_GPIO
 #define CONFIG_CMD_GPIO
 #define CONFIG_CMD_ENTERRCM
+#define CONFIG_SYS_THUMB_BUILD
 
 /* Defines for SPL */
 #define CONFIG_SPL

On 07/06/2012 12:08 PM, Allen Martin wrote:
git bisect probably requires this to be the final patch in the series?

On Fri, Jul 06, 2012 at 12:10:18PM -0700, Stephen Warren wrote:
On 07/06/2012 12:08 PM, Allen Martin wrote:
git bisect probably requires this to be the final patch in the series?
Yeah, I'm sure git bisect is completely broken on the series right now; I'll fix it up when I remove all the hacks and post it for real.

Add a function required by some Thumb switch statements.
Signed-off-by: Allen Martin <amartin@nvidia.com>
---
 arch/arm/lib/Makefile           |  1 +
 arch/arm/lib/_thumb1_case_uqi.S | 41 +++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 arch/arm/lib/_thumb1_case_uqi.S
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index bd3b77f..a54f831 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -33,6 +33,7 @@ GLSOBJS	+= _lshrdi3.o
 GLSOBJS	+= _modsi3.o
 GLSOBJS	+= _udivsi3.o
 GLSOBJS	+= _umodsi3.o
+GLSOBJS	+= _thumb1_case_uqi.o
 
 GLCOBJS	+= div0.o
diff --git a/arch/arm/lib/_thumb1_case_uqi.S b/arch/arm/lib/_thumb1_case_uqi.S
new file mode 100644
index 0000000..e4bb194
--- /dev/null
+++ b/arch/arm/lib/_thumb1_case_uqi.S
@@ -0,0 +1,41 @@
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+	.force_thumb
+	.syntax unified
+	.globl __gnu_thumb1_case_uqi
+__gnu_thumb1_case_uqi:
+	push	{r1}
+	mov	r1, lr
+	lsrs	r1, r1, #1
+	lsls	r1, r1, #1
+	ldrb	r1, [r1, r0]
+	lsls	r1, r1, #1
+	add	lr, lr, r1
+	pop	{r1}
+	bx	lr
+
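As a rough illustration of when the helper above gets pulled in (hypothetical example; the exact lowering depends on the gcc version, optimization level, and case density): a reasonably dense switch on an 8-bit value, compiled for Thumb-1, can be turned into a byte-offset jump table dispatched through __gnu_thumb1_case_uqi, which is why code built with -mthumb against U-Boot's private libgcc needs the routine.

/*
 * Hypothetical example: compiled with -mthumb for an ARMv4T target, gcc
 * may lower a dense switch like this to a byte-offset jump table that is
 * dispatched through __gnu_thumb1_case_uqi (uqi = unsigned byte).
 */
int decode_opcode(unsigned char op)
{
	switch (op) {
	case 0:
		return 1;
	case 1:
		return 2;
	case 2:
		return 4;
	case 3:
		return 8;
	case 4:
		return 16;
	case 5:
		return 32;
	default:
		return -1;
	}
}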

Convert return instructions to thumb compatible bx returns. Probably what's really needed here is a thumb version of all the libgcc assembly routines.
Signed-off-by: Allen Martin <amartin@nvidia.com>
---
 arch/arm/lib/_udivsi3.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/arm/lib/_udivsi3.S b/arch/arm/lib/_udivsi3.S
index 1309802..0b33b04 100644
--- a/arch/arm/lib/_udivsi3.S
+++ b/arch/arm/lib/_udivsi3.S
@@ -64,7 +64,7 @@ Loop3:
 	bne	Loop3
 Lgot_result:
 	mov	r0, result
-	mov	pc, lr
+	bx	lr
 Ldiv0:
 	str	lr, [sp, #-4]!
 	bl	__div0 (PLT)
@@ -80,7 +80,7 @@ __aeabi_uidivmod:
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	bx	lr
 
 	.globl	__aeabi_idivmod
 __aeabi_idivmod:
@@ -90,4 +90,4 @@ __aeabi_idivmod:
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	bx	lr

Convert return from relocate_code to a thumb compatible bx instruction.
Signed-off-by: Allen Martin <amartin@nvidia.com>
---
 arch/arm/cpu/arm720t/start.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm/cpu/arm720t/start.S b/arch/arm/cpu/arm720t/start.S
index 3371d3d..af33e51 100644
--- a/arch/arm/cpu/arm720t/start.S
+++ b/arch/arm/cpu/arm720t/start.S
@@ -260,7 +260,7 @@ clbss_l:str	r2, [r0]		/* clear loop... */
 	mov	r0, r5		/* gd_t		*/
 	mov	r1, r6		/* dest_addr	*/
 	/* jump to it ... */
-	mov	pc, lr
+	bx	lr
 
 _board_init_r_ofs:
 	.word board_init_r - _start

arm720t is an armv4t, not an armv4. Force some Tegra initialization functions to ARM mode because they contain ARM-only inline assembly.
Signed-off-by: Allen Martin <amartin@nvidia.com>
---
 arch/arm/cpu/arm720t/config.mk         | 2 +-
 arch/arm/cpu/arm720t/tegra20/config.mk | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/arm/cpu/arm720t/config.mk b/arch/arm/cpu/arm720t/config.mk
index 210c6dc..1f8aa95 100644
--- a/arch/arm/cpu/arm720t/config.mk
+++ b/arch/arm/cpu/arm720t/config.mk
@@ -24,7 +24,7 @@
 
 PLATFORM_RELFLAGS += -fno-common -ffixed-r8 -msoft-float
 
-PLATFORM_CPPFLAGS += -march=armv4 -mtune=arm7tdmi
+PLATFORM_CPPFLAGS += -march=armv4t -mtune=arm7tdmi
 # =========================================================================
 #
 # Supply options according to compiler version
diff --git a/arch/arm/cpu/arm720t/tegra20/config.mk b/arch/arm/cpu/arm720t/tegra20/config.mk
index 62a31d8..af63fcb 100644
--- a/arch/arm/cpu/arm720t/tegra20/config.mk
+++ b/arch/arm/cpu/arm720t/tegra20/config.mk
@@ -24,3 +24,10 @@
 # MA 02111-1307 USA
 #
 USE_PRIVATE_LIBGCC = yes
+
+#
+# THUMB1 doesn't allow mrc/mcr instructions, so need to force
+# these files to ARM mode
+#
+CFLAGS_arch/arm/cpu/tegra20-common/ap20.o += -marm
+CFLAGS_arch/arm/lib/cache-cp15.o += -marm
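For context, a sketch of the kind of code that forces ap20.c and cache-cp15.c into ARM mode (the function below is illustrative, not the actual U-Boot source): cp15 accesses are done with mrc/mcr coprocessor instructions, which have no Thumb-1 encoding, so the assembler rejects them under -mthumb and the containing file has to be built with -marm.

/*
 * Illustrative only: cp15 reads/writes use mrc/mcr, which Thumb-1 cannot
 * encode, so any file containing inline assembly like this must be
 * compiled with -marm.
 */
static inline unsigned int read_cp15_main_id(void)
{
	unsigned int val;

	asm volatile ("mrc p15, 0, %0, c0, c0, 0" : "=r" (val));
	return val;
}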