[U-Boot] [PATCH v2 0/9] enable thumb for tegra20

Add changes necessary to enable thumb compilation for tegra20. This series includes two patches to work around a bug in gas that incorrectly optimizes short jumps in thumb mode that were posted separately. They can be dropped from this series when they are applied.
Allen Martin (9): tools, config.mk: add binutils-version arm: work around assembler bug tegra20: remove inline assembly for u32 cast arm: add _thumb1_case_uqi to libgcc arm: use thumb compatible return in arm720t arm: change arm720t to armv4t arm720t: add linkage macro for relocate_code arm: use thumb interworking returns in libgcc tegra20: enable thumb build
arch/arm/config.mk | 18 +++++++++++++ arch/arm/cpu/arm720t/config.mk | 2 +- arch/arm/cpu/arm720t/start.S | 7 ++--- arch/arm/cpu/arm720t/tegra20/config.mk | 7 +++++ arch/arm/cpu/tegra20-common/warmboot_avp.c | 9 +------ arch/arm/include/asm/assembler.h | 10 +++++++ arch/arm/lib/Makefile | 1 + arch/arm/lib/_ashldi3.S | 3 ++- arch/arm/lib/_ashrdi3.S | 3 ++- arch/arm/lib/_divsi3.S | 15 ++++++++--- arch/arm/lib/_lshrdi3.S | 3 ++- arch/arm/lib/_modsi3.S | 9 ++++++- arch/arm/lib/{_ashrdi3.S => _thumb1_case_uqi.S} | 33 +++++++++-------------- arch/arm/lib/_udivsi3.S | 8 +++--- arch/arm/lib/_umodsi3.S | 18 ++++++++++++- arch/arm/lib/memcpy.S | 10 +++++++ arch/arm/lib/memset.S | 3 ++- config.mk | 1 + include/configs/tegra20-common.h | 2 ++ tools/binutils-version.sh | 20 ++++++++++++++ 20 files changed, 137 insertions(+), 45 deletions(-) copy arch/arm/lib/{_ashrdi3.S => _thumb1_case_uqi.S} (81%) create mode 100755 tools/binutils-version.sh

Modeled after gcc-version, add function to get binutils version.
Signed-off-by: Allen Martin amartin@nvidia.com --- config.mk | 1 + tools/binutils-version.sh | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100755 tools/binutils-version.sh
diff --git a/config.mk b/config.mk index 3dcea6a..919b77d 100644 --- a/config.mk +++ b/config.mk @@ -128,6 +128,7 @@ endif # cc-version # Usage gcc-ver := $(call cc-version) cc-version = $(shell $(SHELL) $(SRCTREE)/tools/gcc-version.sh $(CC)) +binutils-version = $(shell $(SHELL) $(SRCTREE)/tools/binutils-version.sh $(AS))
# # Include the make variables (CC, etc...) diff --git a/tools/binutils-version.sh b/tools/binutils-version.sh new file mode 100755 index 0000000..d4d9eb4 --- /dev/null +++ b/tools/binutils-version.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# +# binutils-version [-p] gas-command +# +# Prints the binutils version of `gas-command' in a canonical 4-digit form +# such as `0222' for binutils 2.22 +# + +gas="$*" + +if [ ${#gas} -eq 0 ]; then + echo "Error: No assembler specified." + printf "Usage:\n\t$0 <gas-command>\n" + exit 1 +fi + +MAJOR=$($gas --version | head -1 | awk '{print $NF}' | cut -d . -f 1) +MINOR=$($gas --version | head -1 | awk '{print $NF}' | cut -d . -f 2) + +printf "%02d%02d\n" $MAJOR $MINOR

Disable sibling call optimization based on binutils version. This is to work around a bug in the assember in binutils versions < 2.22. Branches to weak symbols can be incorrectly optimized in thumb mode to a short branch (b.n instruction) that won't reach when the symbol gets preempted.
http://sourceware.org/bugzilla/show_bug.cgi?id=12532
Signed-off-by: Allen Martin amartin@nvidia.com --- arch/arm/config.mk | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/arch/arm/config.mk b/arch/arm/config.mk index 3f4453a..24b9d7c 100644 --- a/arch/arm/config.mk +++ b/arch/arm/config.mk @@ -87,3 +87,21 @@ endif ifndef CONFIG_NAND_SPL LDFLAGS_u-boot += -pie endif + +# +# FIXME: binutils versions < 2.22 have a bug in the assembler where +# branches to weak symbols can be incorrectly optimized in thumb mode +# to a short branch (b.n instruction) that won't reach when the symbol +# gets preempted +# +# http://sourceware.org/bugzilla/show_bug.cgi?id=12532 +# +ifeq ($(CONFIG_SYS_THUMB_BUILD),y) +ifeq ($(GAS_BUG_12532),) +export GAS_BUG_12532:=$(shell if [ $(call binutils-version) -lt 0222 ] ; \ + then echo y; else echo n; fi) +endif +ifeq ($(GAS_BUG_12532),y) +PLATFORM_RELFLAGS += -fno-optimize-sibling-calls +endif +endif

This inline assembly just converts a function address to a u32. Replace it with equivalent C code since the assembly was not thumb compatible.
Signed-off-by: Allen Martin amartin@nvidia.com --- arch/arm/cpu/tegra20-common/warmboot_avp.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/arch/arm/cpu/tegra20-common/warmboot_avp.c b/arch/arm/cpu/tegra20-common/warmboot_avp.c index cd01908..0a7f09f 100644 --- a/arch/arm/cpu/tegra20-common/warmboot_avp.c +++ b/arch/arm/cpu/tegra20-common/warmboot_avp.c @@ -51,14 +51,7 @@ void wb_start(void) /* enable JTAG & TBE */ writel(CONFIG_CTL_TBE | CONFIG_CTL_JTAG, &pmt->pmt_cfg_ctl);
- /* Are we running where we're supposed to be? */ - asm volatile ( - "adr %0, wb_start;" /* reg: wb_start address */ - : "=r"(reg) /* output */ - /* no input, no clobber list */ - ); - - if (reg != AP20_WB_RUN_ADDRESS) + if ((u32)wb_start != AP20_WB_RUN_ADDRESS) goto do_reset;
/* Are we running with AVP? */

Add function required by some thumb switch statements
Signed-off-by: Allen Martin amartin@nvidia.com --- arch/arm/lib/Makefile | 1 + arch/arm/lib/_thumb1_case_uqi.S | 41 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 arch/arm/lib/_thumb1_case_uqi.S
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index bd3b77f..a54f831 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -33,6 +33,7 @@ GLSOBJS += _lshrdi3.o GLSOBJS += _modsi3.o GLSOBJS += _udivsi3.o GLSOBJS += _umodsi3.o +GLSOBJS += _thumb1_case_uqi.o
GLCOBJS += div0.o
diff --git a/arch/arm/lib/_thumb1_case_uqi.S b/arch/arm/lib/_thumb1_case_uqi.S new file mode 100644 index 0000000..e4bb194 --- /dev/null +++ b/arch/arm/lib/_thumb1_case_uqi.S @@ -0,0 +1,41 @@ +/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005 + Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2, or (at your option) any +later version. + +In addition to the permissions in the GNU General Public License, the +Free Software Foundation gives you unlimited permission to link the +compiled version of this file into combinations with other programs, +and to distribute those combinations without any restriction coming +from the use of this file. (The General Public License restrictions +do apply in other respects; for example, they cover modification of +the file, and distribution when not linked into a combine +executable.) + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING. If not, write to +the Free Software Foundation, 51 Franklin Street, Fifth Floor, +Boston, MA 02110-1301, USA. */ + + .force_thumb + .syntax unified + .globl __gnu_thumb1_case_uqi +__gnu_thumb1_case_uqi: + push {r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r1, r1, #1 + ldrb r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r1} + bx lr +

On 08/01/2012 02:32 PM, Allen Martin wrote:
Add function required by some thumb switch statements
diff --git a/arch/arm/lib/_thumb1_case_uqi.S b/arch/arm/lib/_thumb1_case_uqi.S
- .force_thumb
I believe that line should be removed.
The issue here is that when gcc emits Thumb code to call this function, it actually emits:
108af8: f000 f94a bl 108d90 <____gnu_thumb1_case_uqi_from_thumb>
which is implemented as:
00108d90 <____gnu_thumb1_case_uqi_from_thumb>: 108d90: 4778 bx pc 108d92: 46c0 nop ; (mov r8, r8) 108d94: eafffde1 b 108520 <__gnu_thumb1_case_uqi>
i.e. it switches to ARM mode then jumps to that function. Hence, __gnu_thumb1_case_uqi must be compiled as ARM, not as Thumb.
(renaming the function to ____gnu_thumb1_case_uqi_from_thumb in the hope it'll be called directly instead of going through a stub doesn't seem to work)
If I make that change, then this patch series starts working on Whistler, which for reference, uses UARTA, so triggers funcmux.c's funcmux_select() to enter case PERIPH_ID_UART1, which then uses ____gnu_thumb1_case_uqi_from_thumb to perform the nested switch (config).

On Mon, Aug 13, 2012 at 04:44:05PM -0700, Stephen Warren wrote:
On 08/01/2012 02:32 PM, Allen Martin wrote:
Add function required by some thumb switch statements
diff --git a/arch/arm/lib/_thumb1_case_uqi.S b/arch/arm/lib/_thumb1_case_uqi.S
- .force_thumb
I believe that line should be removed.
The issue here is that when gcc emits Thumb code to call this function, it actually emits:
108af8: f000 f94a bl 108d90 <____gnu_thumb1_case_uqi_from_thumb>
which is implemented as:
00108d90 <____gnu_thumb1_case_uqi_from_thumb>: 108d90: 4778 bx pc 108d92: 46c0 nop ; (mov r8, r8) 108d94: eafffde1 b 108520 <__gnu_thumb1_case_uqi>
i.e. it switches to ARM mode then jumps to that function. Hence, __gnu_thumb1_case_uqi must be compiled as ARM, not as Thumb.
The function is supposed to be thumb code, it's basically a thumb optimization of a switch statement, and in the ARM case the compiler just emits the code directly instead of calling into libgcc. So it doesn't really make sense for the function to be anything but thumb.
I think the real problem is the linker incorrectly thought the code was ARM so it generated a thumb to ARM interworking veneer.
Can you try adding a ".type __gnu_thumb1_case_uqi STT_FUNC" instead? I've seen cases where the assembler generates the wrong type of symbol without some explicit guidance which messes up the linker's interworking generator. This might be one of those.
(renaming the function to ____gnu_thumb1_case_uqi_from_thumb in the hope it'll be called directly instead of going through a stub doesn't seem to work)
If I make that change, then this patch series starts working on Whistler, which for reference, uses UARTA, so triggers funcmux.c's funcmux_select() to enter case PERIPH_ID_UART1, which then uses ____gnu_thumb1_case_uqi_from_thumb to perform the nested switch (config).
-Allen

On Mon, Aug 13, 2012 at 5:36 PM, Allen Martin amartin@nvidia.com wrote:
On Mon, Aug 13, 2012 at 04:44:05PM -0700, Stephen Warren wrote:
On 08/01/2012 02:32 PM, Allen Martin wrote:
Add function required by some thumb switch statements
diff --git a/arch/arm/lib/_thumb1_case_uqi.S b/arch/arm/lib/_thumb1_case_uqi.S
- .force_thumb
I believe that line should be removed.
The issue here is that when gcc emits Thumb code to call this function, it actually emits:
108af8: f000 f94a bl 108d90 <____gnu_thumb1_case_uqi_from_thumb>
which is implemented as:
00108d90 <____gnu_thumb1_case_uqi_from_thumb>: 108d90: 4778 bx pc 108d92: 46c0 nop ; (mov r8, r8) 108d94: eafffde1 b 108520 <__gnu_thumb1_case_uqi>
i.e. it switches to ARM mode then jumps to that function. Hence, __gnu_thumb1_case_uqi must be compiled as ARM, not as Thumb.
The function is supposed to be thumb code, it's basically a thumb optimization of a switch statement, and in the ARM case the compiler just emits the code directly instead of calling into libgcc. So it doesn't really make sense for the function to be anything but thumb.
I think the real problem is the linker incorrectly thought the code was ARM so it generated a thumb to ARM interworking veneer.
Can you try adding a ".type __gnu_thumb1_case_uqi STT_FUNC" instead? I've seen cases where the assembler generates the wrong type of symbol without some explicit guidance which messes up the linker's interworking generator. This might be one of those.
Yes, not having STT_FUNC or %function is troublesome. I have faced this before.
Maybe you should wrap the assembly functions with standard ENTRY, END_PROC macros from linkage.h. This will take care of STT_FUNC
best regards, Aneesh

On 08/13/2012 08:39 PM, V, Aneesh wrote:
On Mon, Aug 13, 2012 at 5:36 PM, Allen Martin amartin@nvidia.com wrote:
On Mon, Aug 13, 2012 at 04:44:05PM -0700, Stephen Warren wrote:
On 08/01/2012 02:32 PM, Allen Martin wrote:
Add function required by some thumb switch statements
diff --git a/arch/arm/lib/_thumb1_case_uqi.S b/arch/arm/lib/_thumb1_case_uqi.S
- .force_thumb
I believe that line should be removed.
The issue here is that when gcc emits Thumb code to call this function, it actually emits:
108af8: f000 f94a bl 108d90 <____gnu_thumb1_case_uqi_from_thumb>
which is implemented as:
00108d90 <____gnu_thumb1_case_uqi_from_thumb>: 108d90: 4778 bx pc 108d92: 46c0 nop ; (mov r8, r8) 108d94: eafffde1 b 108520 <__gnu_thumb1_case_uqi>
i.e. it switches to ARM mode then jumps to that function. Hence, __gnu_thumb1_case_uqi must be compiled as ARM, not as Thumb.
The function is supposed to be thumb code, it's basically a thumb optimization of a switch statement, and in the ARM case the compiler just emits the code directly instead of calling into libgcc. So it doesn't really make sense for the function to be anything but thumb.
I think the real problem is the linker incorrectly thought the code was ARM so it generated a thumb to ARM interworking veneer.
Can you try adding a ".type __gnu_thumb1_case_uqi STT_FUNC" instead? I've seen cases where the assembler generates the wrong type of symbol without some explicit guidance which messes up the linker's interworking generator. This might be one of those.
Yes, not having STT_FUNC or %function is troublesome. I have faced this before.
Maybe you should wrap the assembly functions with standard ENTRY, END_PROC macros from linkage.h. This will take care of STT_FUNC
Yes. In fact patch 7/9 of this series is doing exactly that for some other assembly function.
So, the patch below fixes this problem; I see the function being called directly without any compiler-synthesized "from_thumb" stub/trampoline being used.
diff --git a/arch/arm/lib/_thumb1_case_uqi.S b/arch/arm/lib/_thumb1_case_uqi.S index e4bb194..a4241f6 100644 --- a/arch/arm/lib/_thumb1_case_uqi.S +++ b/arch/arm/lib/_thumb1_case_uqi.S @@ -25,10 +25,11 @@ along with this program; see the file COPYING. If not, write to the Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+#include <linux/linkage.h>
.force_thumb .syntax unified
.globl __gnu_thumb1_case_uqi
-__gnu_thumb1_case_uqi: +ENTRY(__gnu_thumb1_case_uqi) push {r1} mov r1, lr lsrs r1, r1, #1 @@ -38,4 +39,4 @@ __gnu_thumb1_case_uqi: add lr, lr, r1 pop {r1} bx lr
+ENDPROC(__gnu_thumb1_case_uqi)

Convert return from relocate_code to a thumb compatible bx instruction.
Signed-off-by: Allen Martin amartin@nvidia.com --- arch/arm/cpu/arm720t/start.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm/cpu/arm720t/start.S b/arch/arm/cpu/arm720t/start.S index 9d02709..73b57c0 100644 --- a/arch/arm/cpu/arm720t/start.S +++ b/arch/arm/cpu/arm720t/start.S @@ -262,7 +262,7 @@ clbss_e: mov r0, r5 /* gd_t */ mov r1, r6 /* dest_addr */ /* jump to it ... */ - mov pc, lr + bx lr
_board_init_r_ofs: .word board_init_r - _start

arm720t is an armv4t not an armv4. Force some tegra initialization functions to arm mode because they contain arm only inline assembly.
Signed-off-by: Allen Martin amartin@nvidia.com --- arch/arm/cpu/arm720t/config.mk | 2 +- arch/arm/cpu/arm720t/tegra20/config.mk | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/arm/cpu/arm720t/config.mk b/arch/arm/cpu/arm720t/config.mk index 210c6dc..1f8aa95 100644 --- a/arch/arm/cpu/arm720t/config.mk +++ b/arch/arm/cpu/arm720t/config.mk @@ -24,7 +24,7 @@
PLATFORM_RELFLAGS += -fno-common -ffixed-r8 -msoft-float
-PLATFORM_CPPFLAGS += -march=armv4 -mtune=arm7tdmi +PLATFORM_CPPFLAGS += -march=armv4t -mtune=arm7tdmi # ========================================================================= # # Supply options according to compiler version diff --git a/arch/arm/cpu/arm720t/tegra20/config.mk b/arch/arm/cpu/arm720t/tegra20/config.mk index 62a31d8..af63fcb 100644 --- a/arch/arm/cpu/arm720t/tegra20/config.mk +++ b/arch/arm/cpu/arm720t/tegra20/config.mk @@ -24,3 +24,10 @@ # MA 02111-1307 USA # USE_PRIVATE_LIBGCC = yes + +# +# THUMB1 doesn't allow mrc/mcr instructions, so need to force +# these files to ARM mode +# +CFLAGS_arch/arm/cpu/tegra20-common/ap20.o += -marm +CFLAGS_arch/arm/lib/cache-cp15.o += -marm

The linker needs this to understand that the symbol is actually a function so it will generate correct thumb interworking code.
Signed-off-by: Allen Martin amartin@nvidia.com --- arch/arm/cpu/arm720t/start.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/arm720t/start.S b/arch/arm/cpu/arm720t/start.S index 73b57c0..11e9194 100644 --- a/arch/arm/cpu/arm720t/start.S +++ b/arch/arm/cpu/arm720t/start.S @@ -27,6 +27,7 @@ #include <config.h> #include <version.h> #include <asm/hardware.h> +#include <linux/linkage.h>
/* ************************************************************************* @@ -165,8 +166,7 @@ call_board_init_f: * after relocating the monitor code. * */ - .globl relocate_code -relocate_code: +ENTRY(relocate_code) mov r4, r0 /* save addr_sp */ mov r5, r1 /* save addr of gd */ mov r6, r2 /* save addr of destination */ @@ -273,6 +273,7 @@ _rel_dyn_end_ofs: .word __rel_dyn_end - _start _dynsym_start_ofs: .word __dynsym_start - _start +ENDPROC(relocate_code)
/* *************************************************************************

If CONFIG_SYS_THUMB_BUILD is enabled, use thumb interworking return instructions from libgcc routines, otherwise use ARM returns.
Signed-off-by: Allen Martin amartin@nvidia.com --- arch/arm/include/asm/assembler.h | 10 ++++++++++ arch/arm/lib/_ashldi3.S | 3 ++- arch/arm/lib/_ashrdi3.S | 3 ++- arch/arm/lib/_divsi3.S | 15 +++++++++++---- arch/arm/lib/_lshrdi3.S | 3 ++- arch/arm/lib/_modsi3.S | 9 ++++++++- arch/arm/lib/_udivsi3.S | 8 +++++--- arch/arm/lib/_umodsi3.S | 18 +++++++++++++++++- arch/arm/lib/memcpy.S | 10 ++++++++++ arch/arm/lib/memset.S | 3 ++- 10 files changed, 69 insertions(+), 13 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 5e4789b..25ece01 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -13,6 +13,7 @@ * Do not include any C declarations in this file - it is included by * assembler source. */ +#include <config.h>
/* * Endian independent macros for shifting bytes within registers. @@ -58,3 +59,12 @@ * Cache alligned */ #define CALGN(code...) code + +/* + * return instruction + */ +#ifdef CONFIG_SYS_THUMB_BUILD +#define RET bx lr +#else +#define RET mov pc, lr +#endif diff --git a/arch/arm/lib/_ashldi3.S b/arch/arm/lib/_ashldi3.S index 834ddc2..e280f26 100644 --- a/arch/arm/lib/_ashldi3.S +++ b/arch/arm/lib/_ashldi3.S @@ -25,6 +25,7 @@ along with this program; see the file COPYING. If not, write to the Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+#include <asm/assembler.h>
#ifdef __ARMEB__ #define al r1 @@ -45,4 +46,4 @@ __aeabi_llsl: movpl ah, al, lsl r3 orrmi ah, ah, al, lsr ip mov al, al, lsl r2 - mov pc, lr + RET diff --git a/arch/arm/lib/_ashrdi3.S b/arch/arm/lib/_ashrdi3.S index 671ac87..3edda9f 100644 --- a/arch/arm/lib/_ashrdi3.S +++ b/arch/arm/lib/_ashrdi3.S @@ -25,6 +25,7 @@ along with this program; see the file COPYING. If not, write to the Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+#include <asm/assembler.h>
#ifdef __ARMEB__ #define al r1 @@ -45,4 +46,4 @@ __aeabi_lasr: movpl al, ah, asr r3 orrmi al, al, ah, lsl ip mov ah, ah, asr r2 - mov pc, lr + RET diff --git a/arch/arm/lib/_divsi3.S b/arch/arm/lib/_divsi3.S index cfbadb2..ab59d78 100644 --- a/arch/arm/lib/_divsi3.S +++ b/arch/arm/lib/_divsi3.S @@ -1,3 +1,5 @@ +#include <config.h> +#include <asm/assembler.h>
.macro ARM_DIV_BODY dividend, divisor, result, curbit
@@ -116,27 +118,32 @@ __aeabi_idiv:
cmp ip, #0 rsbmi r0, r0, #0 - mov pc, lr + RET
10: teq ip, r0 @ same sign ? rsbmi r0, r0, #0 - mov pc, lr + RET
11: movlo r0, #0 moveq r0, ip, asr #31 orreq r0, r0, #1 - mov pc, lr + RET
12: ARM_DIV2_ORDER r1, r2
cmp ip, #0 mov r0, r3, lsr r2 rsbmi r0, r0, #0 - mov pc, lr + RET
Ldiv0:
str lr, [sp, #-4]! bl __div0 mov r0, #0 @ About as wrong as it could be. +#ifdef CONFIG_SYS_THUMB_BUILD + ldr lr, [sp], #4 + bx lr +#else ldr pc, [sp], #4 +#endif diff --git a/arch/arm/lib/_lshrdi3.S b/arch/arm/lib/_lshrdi3.S index e7fa799..4d7784d 100644 --- a/arch/arm/lib/_lshrdi3.S +++ b/arch/arm/lib/_lshrdi3.S @@ -25,6 +25,7 @@ along with this program; see the file COPYING. If not, write to the Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+#include <asm/assembler.h>
#ifdef __ARMEB__ #define al r1 @@ -45,4 +46,4 @@ __aeabi_llsr: movpl al, ah, lsr r3 orrmi al, al, ah, lsl ip mov ah, ah, lsr r2 - mov pc, lr + RET diff --git a/arch/arm/lib/_modsi3.S b/arch/arm/lib/_modsi3.S index 539c584..9c7ce8c 100644 --- a/arch/arm/lib/_modsi3.S +++ b/arch/arm/lib/_modsi3.S @@ -1,3 +1,5 @@ +#include <config.h> +#include <asm/assembler.h>
.macro ARM_MOD_BODY dividend, divisor, order, spare
@@ -88,7 +90,7 @@ __modsi3:
10: cmp ip, #0 rsbmi r0, r0, #0 - mov pc, lr + RET
Ldiv0: @@ -96,4 +98,9 @@ Ldiv0: str lr, [sp, #-4]! bl __div0 mov r0, #0 @ About as wrong as it could be. +#ifdef CONFIG_SYS_THUMB_BUILD + ldr lr, [sp], #4 + bx lr +#else ldr pc, [sp], #4 +#endif diff --git a/arch/arm/lib/_udivsi3.S b/arch/arm/lib/_udivsi3.S index 1309802..09a1664 100644 --- a/arch/arm/lib/_udivsi3.S +++ b/arch/arm/lib/_udivsi3.S @@ -1,3 +1,5 @@ +#include <asm/assembler.h> + /* # 1 "libgcc1.S" */ @ libgcc1 routines for ARM cpu. @ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) @@ -64,7 +66,7 @@ Loop3: bne Loop3 Lgot_result: mov r0, result - mov pc, lr + RET Ldiv0: str lr, [sp, #-4]! bl __div0 (PLT) @@ -80,7 +82,7 @@ __aeabi_uidivmod: ldmfd sp!, {r1, r2, ip, lr} mul r3, r0, r2 sub r1, r1, r3 - mov pc, lr + RET
.globl __aeabi_idivmod __aeabi_idivmod: @@ -90,4 +92,4 @@ __aeabi_idivmod: ldmfd sp!, {r1, r2, ip, lr} mul r3, r0, r2 sub r1, r1, r3 - mov pc, lr + RET diff --git a/arch/arm/lib/_umodsi3.S b/arch/arm/lib/_umodsi3.S index 8465ef0..93eadf9 100644 --- a/arch/arm/lib/_umodsi3.S +++ b/arch/arm/lib/_umodsi3.S @@ -1,3 +1,6 @@ +#include <config.h> +#include <asm/assembler.h> + /* # 1 "libgcc1.S" */ @ libgcc1 routines for ARM cpu. @ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) @@ -19,7 +22,11 @@ curbit .req r3 beq Ldiv0 mov curbit, #1 cmp dividend, divisor +#ifdef CONFIG_SYS_THUMB_BUILD + bxcc lr +#else movcc pc, lr +#endif Loop1: @ Unless the divisor is very big, shift it up in multiples of @ four bits, since this is the amount of unwinding in the main @@ -66,19 +73,28 @@ Loop3: @ then none of the below will match, since the bit in ip will not be @ in the bottom nibble. ands overdone, overdone, #0xe0000000 +#ifdef CONFIG_SYS_THUMB_BUILD + bxeq lr +#else moveq pc, lr @ No fixups needed +#endif tst overdone, ip, ror #3 addne dividend, dividend, divisor, lsr #3 tst overdone, ip, ror #2 addne dividend, dividend, divisor, lsr #2 tst overdone, ip, ror #1 addne dividend, dividend, divisor, lsr #1 - mov pc, lr + RET Ldiv0: str lr, [sp, #-4]! bl __div0 (PLT) mov r0, #0 @ about as wrong as it could be +#ifdef CONFIG_SYS_THUMB_BUILD + ldmia sp!, {lr} + bx lr +#else ldmia sp!, {pc} +#endif .size __umodsi3 , . - __umodsi3 /* # 320 "libgcc1.S" */ /* # 421 "libgcc1.S" */ diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index f655256..1e1dbd4 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -10,6 +10,7 @@ * published by the Free Software Foundation. */
+#include <config.h> #include <asm/assembler.h>
#define W(instr) instr @@ -61,7 +62,11 @@ memcpy:
cmp r0, r1 +#ifdef CONFIG_SYS_THUMB_BUILD + bxeq lr +#else moveq pc, lr +#endif
enter r4, lr
@@ -149,7 +154,12 @@ memcpy: str1b r0, r4, cs, abort=21f str1b r0, ip, cs, abort=21f
+#ifdef CONFIG_SYS_THUMB_BUILD + exit r4, lr + bx lr +#else exit r4, pc +#endif
9: rsb ip, ip, #4 cmp ip, #2 diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index 0cdf895..e149c46 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -9,6 +9,7 @@ * * ASM optimised string functions */ +#include <config.h> #include <asm/assembler.h>
.text @@ -123,4 +124,4 @@ memset: strneb r1, [r0], #1 tst r2, #1 strneb r1, [r0], #1 - mov pc, lr + RET

Hi Allen,
On Wed, Aug 1, 2012 at 1:32 PM, Allen Martin amartin@nvidia.com wrote:
If CONFIG_SYS_THUMB_BUILD is enabled, use thumb interworking return instructions from libgcc routines, otherwise use ARM returns.
Signed-off-by: Allen Martin amartin@nvidia.com
arch/arm/include/asm/assembler.h | 10 ++++++++++ arch/arm/lib/_ashldi3.S | 3 ++- arch/arm/lib/_ashrdi3.S | 3 ++- arch/arm/lib/_divsi3.S | 15 +++++++++++---- arch/arm/lib/_lshrdi3.S | 3 ++- arch/arm/lib/_modsi3.S | 9 ++++++++- arch/arm/lib/_udivsi3.S | 8 +++++--- arch/arm/lib/_umodsi3.S | 18 +++++++++++++++++- arch/arm/lib/memcpy.S | 10 ++++++++++ arch/arm/lib/memset.S | 3 ++- 10 files changed, 69 insertions(+), 13 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 5e4789b..25ece01 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -13,6 +13,7 @@
- Do not include any C declarations in this file - it is included by
- assembler source.
*/ +#include <config.h>
/*
- Endian independent macros for shifting bytes within registers.
@@ -58,3 +59,12 @@
- Cache alligned
*/ #define CALGN(code...) code
+/*
- return instruction
- */
+#ifdef CONFIG_SYS_THUMB_BUILD +#define RET bx lr +#else +#define RET mov pc, lr
Why not "bx lr" in all cases? In that case you can just replace all the instances of "mov pc, lr" directly by "bx lr" instead of this macro. That looks cleaner to me.
BTW, as far as I remember when I did this originally my compiler was compiling all assembly functions in ARM and it was automatically converting "mov pc, lr" to "bx lr" ,where necessary. Maybe that was just my compiler and I don't remember the details now. Did you really face any issue with "mov pc, lr" making wrong jumps?
best regards, Aneesh

On Wed, Aug 01, 2012 at 02:11:48PM -0700, V, Aneesh wrote:
Hi Allen,
On Wed, Aug 1, 2012 at 1:32 PM, Allen Martin <amartin@nvidia.commailto:amartin@nvidia.com> wrote: If CONFIG_SYS_THUMB_BUILD is enabled, use thumb interworking return instructions from libgcc routines, otherwise use ARM returns.
Signed-off-by: Allen Martin <amartin@nvidia.commailto:amartin@nvidia.com>
arch/arm/include/asm/assembler.h | 10 ++++++++++ arch/arm/lib/_ashldi3.S | 3 ++- arch/arm/lib/_ashrdi3.S | 3 ++- arch/arm/lib/_divsi3.S | 15 +++++++++++---- arch/arm/lib/_lshrdi3.S | 3 ++- arch/arm/lib/_modsi3.S | 9 ++++++++- arch/arm/lib/_udivsi3.S | 8 +++++--- arch/arm/lib/_umodsi3.S | 18 +++++++++++++++++- arch/arm/lib/memcpy.S | 10 ++++++++++ arch/arm/lib/memset.S | 3 ++- 10 files changed, 69 insertions(+), 13 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 5e4789b..25ece01 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -13,6 +13,7 @@
- Do not include any C declarations in this file - it is included by
- assembler source.
*/ +#include <config.h>
/*
- Endian independent macros for shifting bytes within registers.
@@ -58,3 +59,12 @@
- Cache alligned
*/ #define CALGN(code...) code
+/*
- return instruction
- */
+#ifdef CONFIG_SYS_THUMB_BUILD +#define RET bx lr +#else +#define RET mov pc, lr
Why not "bx lr" in all cases? In that case you can just replace all the instances of "mov pc, lr" directly by "bx lr" instead of this macro. That looks cleaner to me.
I didn't want to break any older ARM architectures that don't support the bx instruction but use this code.
BTW, as far as I remember when I did this originally my compiler was compiling all assembly functions in ARM and it was automatically converting "mov pc, lr" to "bx lr" ,where necessary. Maybe that was just my compiler and I don't remember the details now. Did you really face any issue with "mov pc, lr" making wrong jumps?
Yes, without this change if I disassemble the code I'm seeing it's still a mov instruction and it hangs.
-Allen

Hi Allen,
On Wed, Aug 1, 2012 at 2:55 PM, Allen Martin amartin@nvidia.com wrote:
On Wed, Aug 01, 2012 at 02:11:48PM -0700, V, Aneesh wrote:
Hi Allen,
On Wed, Aug 1, 2012 at 1:32 PM, Allen Martin <amartin@nvidia.com<mailto:
amartin@nvidia.com>> wrote:
If CONFIG_SYS_THUMB_BUILD is enabled, use thumb interworking return instructions from libgcc routines, otherwise use ARM returns.
Signed-off-by: Allen Martin <amartin@nvidia.com<mailto:
amartin@nvidia.com>>
arch/arm/include/asm/assembler.h | 10 ++++++++++ arch/arm/lib/_ashldi3.S | 3 ++- arch/arm/lib/_ashrdi3.S | 3 ++- arch/arm/lib/_divsi3.S | 15 +++++++++++---- arch/arm/lib/_lshrdi3.S | 3 ++- arch/arm/lib/_modsi3.S | 9 ++++++++- arch/arm/lib/_udivsi3.S | 8 +++++--- arch/arm/lib/_umodsi3.S | 18 +++++++++++++++++- arch/arm/lib/memcpy.S | 10 ++++++++++ arch/arm/lib/memset.S | 3 ++- 10 files changed, 69 insertions(+), 13 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h
b/arch/arm/include/asm/assembler.h
index 5e4789b..25ece01 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -13,6 +13,7 @@
- Do not include any C declarations in this file - it is included by
- assembler source.
*/ +#include <config.h>
/*
- Endian independent macros for shifting bytes within registers.
@@ -58,3 +59,12 @@
- Cache alligned
*/ #define CALGN(code...) code
+/*
- return instruction
- */
+#ifdef CONFIG_SYS_THUMB_BUILD +#define RET bx lr +#else +#define RET mov pc, lr
Why not "bx lr" in all cases? In that case you can just replace all the
instances of
"mov pc, lr" directly by "bx lr" instead of this macro. That looks
cleaner to me.
I didn't want to break any older ARM architectures that don't support the bx instruction but use this code.
Which is earlier than armv4t, right? On quick look it didn't seem there is anything older than that in u-boot. But yes, it's perhaps better to be safe.
BTW, as far as I remember when I did this originally my compiler was
compiling
all assembly functions in ARM and it was automatically converting "mov
pc, lr" to
"bx lr" ,where necessary. Maybe that was just my compiler and I don't
remember
the details now. Did you really face any issue with "mov pc, lr" making
wrong jumps?
Yes, without this change if I disassemble the code I'm seeing it's still a mov instruction and it hangs.
Ok.
br, Aneesh

On Wed, Aug 01, 2012 at 03:15:45PM -0700, V, Aneesh wrote:
I didn't want to break any older ARM architectures that don't support the bx instruction but use this code.
Which is earlier than armv4t, right? On quick look it didn't seem there is anything older than that in u-boot. But yes, it's perhaps better to be safe.
Yes, in particular bx is available in armv4t but not armv4, and there are architectures being compiled -march=armv4 in u-boot:
$ grep march arch/arm/cpu/*/config.mk arch/arm/cpu/arm1136/config.mk:PLATFORM_CPPFLAGS += -march=armv5 arch/arm/cpu/arm1176/config.mk:PLATFORM_CPPFLAGS += -march=armv5t arch/arm/cpu/arm720t/config.mk:PLATFORM_CPPFLAGS += -march=armv4 -mtune=arm7tdmi arch/arm/cpu/arm920t/config.mk:PLATFORM_CPPFLAGS += -march=armv4 arch/arm/cpu/arm925t/config.mk:PLATFORM_CPPFLAGS += -march=armv4 arch/arm/cpu/arm926ejs/config.mk:PLATFORM_CPPFLAGS += -march=armv5te arch/arm/cpu/arm946es/config.mk:PLATFORM_CPPFLAGS += -march=armv4 arch/arm/cpu/arm_intcm/config.mk:PLATFORM_CPPFLAGS += -march=armv4 arch/arm/cpu/armv7/config.mk:PF_CPPFLAGS_ARMV7 := $(call cc-option, -march=armv7-a, -march=armv5) arch/arm/cpu/ixp/config.mk:PLATFORM_CPPFLAGS += -mbig-endian -march=armv5te -mtune=strongarm1100 arch/arm/cpu/lh7a40x/config.mk:PLATFORM_CPPFLAGS += -march=armv4 arch/arm/cpu/pxa/config.mk:PLATFORM_CPPFLAGS += -march=armv5te -mtune=xscale arch/arm/cpu/s3c44b0/config.mk:PLATFORM_CPPFLAGS += -march=armv4 -mtune=arm7tdmi -msoft-float arch/arm/cpu/sa1100/config.mk:PLATFORM_CPPFLAGS += -march=armv4 -mtune=strongarm1100
Probably some of these are actually armv4t, but I don't want to touch them :^)
-Allen

Signed-off-by: Allen Martin amartin@nvidia.com --- include/configs/tegra20-common.h | 1 + 1 file changed, 1 insertion(+)
diff --git a/include/configs/tegra20-common.h b/include/configs/tegra20-common.h index c9e8b6b..3ce0d6f 100644 --- a/include/configs/tegra20-common.h +++ b/include/configs/tegra20-common.h @@ -191,6 +191,7 @@ #define CONFIG_CMD_GPIO #define CONFIG_CMD_ENTERRCM #define CONFIG_CMD_BOOTZ +#define CONFIG_SYS_THUMB_BUILD
/* Defines for SPL */ #define CONFIG_SPL

On 08/01/2012 02:32 PM, Allen Martin wrote:
Add changes necessary to enable thumb compilation for tegra20. This series includes two patches to work around a bug in gas that incorrectly optimizes short jumps in thumb mode that were posted separately. They can be dropped from this series when they are applied.
Sorry for the slow response. I tested this on a few boards and found that while some work fine with this patch, others don't.
OK: Springbank, Ventana, Harmony
Hang with no output at all: Whistler, Paz00, TrimSlice.
I haven't yet attempted to investigate why I'm seeing this problem. I did validate that simply removing your patch series yielded a working U-Boot in all cases (working and non-working above) without any other changes (flashing process, toolchain, ...)
participants (3)
-
Allen Martin
-
Stephen Warren
-
V, Aneesh