[PATCH] arm: Use builtins for ffs/fls

Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call.
Signed-off-by: Sean Anderson sean.anderson@seco.com --- I only tested this on ARMv8. If someone has an ARMv5 or ARMv6 board, please test this.
arch/arm/include/asm/bitops.h | 27 ++++++++++++- arch/arm/lib/Makefile | 5 +++ arch/arm/lib/bitops.S | 45 ++++++++++++++++++++++ include/asm-generic/bitops/builtin-__ffs.h | 16 ++++++++ include/asm-generic/bitops/builtin-__fls.h | 16 ++++++++ include/asm-generic/bitops/builtin-ffs.h | 15 ++++++++ include/asm-generic/bitops/builtin-fls.h | 17 ++++++++ 7 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 arch/arm/lib/bitops.S create mode 100644 include/asm-generic/bitops/builtin-__ffs.h create mode 100644 include/asm-generic/bitops/builtin-__fls.h create mode 100644 include/asm-generic/bitops/builtin-ffs.h create mode 100644 include/asm-generic/bitops/builtin-fls.h
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h index fa8548624a..8e897833bb 100644 --- a/arch/arm/include/asm/bitops.h +++ b/arch/arm/include/asm/bitops.h @@ -15,9 +15,34 @@ #ifndef __ASM_ARM_BITOPS_H #define __ASM_ARM_BITOPS_H
+#if __LINUX_ARM_ARCH__ < 5 + #include <asm-generic/bitops/__ffs.h> #include <asm-generic/bitops/__fls.h> #include <asm-generic/bitops/fls.h> + +#else + +#define PLATFORM_FFS +#define PLATFORM_FLS + +#if !IS_ENABLED(CONFIG_HAS_THUMB2) && CONFIG_IS_ENABLED(SYS_THUMB_BUILD) + +unsigned long __fls(unsigned long word); +unsigned long __ffs(unsigned long word); +int fls(unsigned int x); +int ffs(int x); + +#else + +#include <asm-generic/bitops/builtin-__fls.h> +#include <asm-generic/bitops/builtin-__ffs.h> +#include <asm-generic/bitops/builtin-fls.h> +#include <asm-generic/bitops/builtin-ffs.h> + +#endif +#endif + #include <asm-generic/bitops/fls64.h>
#ifdef __KERNEL__ @@ -113,7 +138,7 @@ static inline int test_bit(int nr, const void * addr)
static inline int __ilog2(unsigned int x) { - return generic_fls(x) - 1; + return fls(x) - 1; }
#define ffz(x) __ffs(~(x)) diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 62cf80f373..b1bcd37466 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -113,6 +113,11 @@ AFLAGS_REMOVE_memset.o := -mthumb -mthumb-interwork AFLAGS_REMOVE_memcpy.o := -mthumb -mthumb-interwork AFLAGS_memset.o := -DMEMSET_NO_THUMB_BUILD AFLAGS_memcpy.o := -DMEMCPY_NO_THUMB_BUILD + +# This is only necessary to force ARM mode on THUMB1 targets. +ifneq ($(CONFIG_SYS_ARM_ARCH),4) +obj-y += bitops.o +endif endif endif
diff --git a/arch/arm/lib/bitops.S b/arch/arm/lib/bitops.S new file mode 100644 index 0000000000..29d1524634 --- /dev/null +++ b/arch/arm/lib/bitops.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Sean Anderson sean.anderson@seco.com + * + * ARM bitops to call when using THUMB1, which doesn't have these instructions. + */ +#include <linux/linkage.h> +#include <asm/assembler.h> + +.pushsection .text.__fls +ENTRY(__fls) + clz r0, r0 + rsb r0, r0, #31 + ret lr +ENDPROC(__fls) +.popsection + +.pushsection .text.__ffs +ENTRY(__ffs) + rsb r3, r0, #0 + and r0, r0, r3 + clz r0, r0 + rsb r0, r0, #31 + ret lr +ENDPROC(__ffs) +.popsection + +.pushsection .text.fls +ENTRY(fls) + cmp r0, #0 + clzne r0, r0 + rsbne r0, r0, #32 + ret lr +ENDPROC(fls) +.popsection + +.pushsection .text.ffs +ENTRY(ffs) + rsb r3, r0, #0 + and r0, r0, r3 + clz r0, r0 + rsb r0, r0, #32 + ret lr +ENDPROC(ffs) +.popsection diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h new file mode 100644 index 0000000000..87024da44d --- /dev/null +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ + +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __always_inline unsigned long __ffs(unsigned long word) +{ + return __builtin_ctzl(word); +} + +#endif diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h new file mode 100644 index 0000000000..43a5aa9afb --- /dev/null +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FLS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN___FLS_H_ + +/** + * __fls - find last (most-significant) set bit in a long word + * @word: the word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +static __always_inline unsigned long __fls(unsigned long word) +{ + return (sizeof(word) * 8) - 1 - __builtin_clzl(word); +} + +#endif diff --git a/include/asm-generic/bitops/builtin-ffs.h b/include/asm-generic/bitops/builtin-ffs.h new file mode 100644 index 0000000000..7b12932904 --- /dev/null +++ b/include/asm-generic/bitops/builtin-ffs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN_FFS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN_FFS_H_ + +/** + * ffs - find first bit set + * @x: the word to search + * + * This is defined the same way as + * the libc and compiler builtin ffs routines, therefore + * differs in spirit from ffz (man ffs). + */ +#define ffs(x) __builtin_ffs(x) + +#endif diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h new file mode 100644 index 0000000000..c8455cc288 --- /dev/null +++ b/include/asm-generic/bitops/builtin-fls.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_ + +/** + * fls - find last (most-significant) bit set + * @x: the word to search + * + * This is defined the same way as ffs. + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. + */ +static __always_inline int fls(unsigned int x) +{ + return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; +} + +#endif

On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote:
Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call.
Signed-off-by: Sean Anderson sean.anderson@seco.com
This looks like it's synced from the kernel, what tag?

On 7/31/23 17:36, Tom Rini wrote:
On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote:
Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call.
Signed-off-by: Sean Anderson sean.anderson@seco.com
This looks like it's synced from the kernel, what tag?
The builtins are synced from v5.15 and they haven't changed since.
--Sean

On 7/31/23 17:27, Sean Anderson wrote:
Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call.
Signed-off-by: Sean Anderson sean.anderson@seco.com
I only tested this on ARMv8. If someone has an ARMv5 or ARMv6 board, please test this.
+CC Linus since he may care about integrator.
--Sean

On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote:
Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call.
Signed-off-by: Sean Anderson sean.anderson@seco.com
I'll add some wording about when this was synced from the kernel when applying.
Reviewed-by: Tom Rini trini@konsulko.com

On Mon, Jul 31, 2023 at 05:27:33PM -0400, Sean Anderson wrote:
Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call.
This code is synced with v5.15 of the Linux kernel.
Signed-off-by: Sean Anderson sean.anderson@seco.com Reviewed-by: Tom Rini trini@konsulko.com
Applied to u-boot/next, thanks!

Hi Sean,
On Mon, Jul 31, 2023 at 2:28 PM Sean Anderson sean.anderson@seco.com wrote:
Since ARMv5, the clz instruction allows for efficient implementation of ffs/fls with builtins. Until ARMv7 (with Thumb-2), this instruction is only available in ARM mode. LTO makes it difficult to force specific functions to be in ARM mode, as it is effectively a form of very aggressive inlining. To work around this, fls/ffs are implemented in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode. Overall, this saves around 75 bytes per call.
Signed-off-by: Sean Anderson sean.anderson@seco.com
I only tested this on ARMv8. If someone has an ARMv5 or ARMv6 board, please test this.
Thanks for the patch! I've tested it with the Pogo V4 board (Kirkwood 88F6192, ARMv5) with LTO and SYS_THUMB_BUILD.
Also nice size reduction: Before: 502K After: 499K
So for ARMv5, Tested-by: Tony Dinh mibodhi@gmail.com
All the best, Tony
arch/arm/include/asm/bitops.h | 27 ++++++++++++- arch/arm/lib/Makefile | 5 +++ arch/arm/lib/bitops.S | 45 ++++++++++++++++++++++ include/asm-generic/bitops/builtin-__ffs.h | 16 ++++++++ include/asm-generic/bitops/builtin-__fls.h | 16 ++++++++ include/asm-generic/bitops/builtin-ffs.h | 15 ++++++++ include/asm-generic/bitops/builtin-fls.h | 17 ++++++++ 7 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 arch/arm/lib/bitops.S create mode 100644 include/asm-generic/bitops/builtin-__ffs.h create mode 100644 include/asm-generic/bitops/builtin-__fls.h create mode 100644 include/asm-generic/bitops/builtin-ffs.h create mode 100644 include/asm-generic/bitops/builtin-fls.h
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h index fa8548624a..8e897833bb 100644 --- a/arch/arm/include/asm/bitops.h +++ b/arch/arm/include/asm/bitops.h @@ -15,9 +15,34 @@ #ifndef __ASM_ARM_BITOPS_H #define __ASM_ARM_BITOPS_H
+#if __LINUX_ARM_ARCH__ < 5
#include <asm-generic/bitops/__ffs.h> #include <asm-generic/bitops/__fls.h> #include <asm-generic/bitops/fls.h>
+#else
+#define PLATFORM_FFS +#define PLATFORM_FLS
+#if !IS_ENABLED(CONFIG_HAS_THUMB2) && CONFIG_IS_ENABLED(SYS_THUMB_BUILD)
+unsigned long __fls(unsigned long word); +unsigned long __ffs(unsigned long word); +int fls(unsigned int x); +int ffs(int x);
+#else
+#include <asm-generic/bitops/builtin-__fls.h> +#include <asm-generic/bitops/builtin-__ffs.h> +#include <asm-generic/bitops/builtin-fls.h> +#include <asm-generic/bitops/builtin-ffs.h>
+#endif +#endif
#include <asm-generic/bitops/fls64.h>
#ifdef __KERNEL__ @@ -113,7 +138,7 @@ static inline int test_bit(int nr, const void * addr)
static inline int __ilog2(unsigned int x) {
return generic_fls(x) - 1;
return fls(x) - 1;
}
#define ffz(x) __ffs(~(x)) diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 62cf80f373..b1bcd37466 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -113,6 +113,11 @@ AFLAGS_REMOVE_memset.o := -mthumb -mthumb-interwork AFLAGS_REMOVE_memcpy.o := -mthumb -mthumb-interwork AFLAGS_memset.o := -DMEMSET_NO_THUMB_BUILD AFLAGS_memcpy.o := -DMEMCPY_NO_THUMB_BUILD
+# This is only necessary to force ARM mode on THUMB1 targets. +ifneq ($(CONFIG_SYS_ARM_ARCH),4) +obj-y += bitops.o +endif endif endif
diff --git a/arch/arm/lib/bitops.S b/arch/arm/lib/bitops.S new file mode 100644 index 0000000000..29d1524634 --- /dev/null +++ b/arch/arm/lib/bitops.S @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/*
- Copyright (C) 2023 Sean Anderson sean.anderson@seco.com
- ARM bitops to call when using THUMB1, which doesn't have these instructions.
- */
+#include <linux/linkage.h> +#include <asm/assembler.h>
+.pushsection .text.__fls +ENTRY(__fls)
clz r0, r0
rsb r0, r0, #31
ret lr
+ENDPROC(__fls) +.popsection
+.pushsection .text.__ffs +ENTRY(__ffs)
rsb r3, r0, #0
and r0, r0, r3
clz r0, r0
rsb r0, r0, #31
ret lr
+ENDPROC(__ffs) +.popsection
+.pushsection .text.fls +ENTRY(fls)
cmp r0, #0
clzne r0, r0
rsbne r0, r0, #32
ret lr
+ENDPROC(fls) +.popsection
+.pushsection .text.ffs +ENTRY(ffs)
rsb r3, r0, #0
and r0, r0, r3
clz r0, r0
rsb r0, r0, #32
ret lr
+ENDPROC(ffs) +.popsection diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h new file mode 100644 index 0000000000..87024da44d --- /dev/null +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_
+/**
- __ffs - find first bit in word.
- @word: The word to search
- Undefined if no bit exists, so code should check against 0 first.
- */
+static __always_inline unsigned long __ffs(unsigned long word) +{
return __builtin_ctzl(word);
+}
+#endif diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h new file mode 100644 index 0000000000..43a5aa9afb --- /dev/null +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FLS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN___FLS_H_
+/**
- __fls - find last (most-significant) set bit in a long word
- @word: the word to search
- Undefined if no set bit exists, so code should check against 0 first.
- */
+static __always_inline unsigned long __fls(unsigned long word) +{
return (sizeof(word) * 8) - 1 - __builtin_clzl(word);
+}
+#endif diff --git a/include/asm-generic/bitops/builtin-ffs.h b/include/asm-generic/bitops/builtin-ffs.h new file mode 100644 index 0000000000..7b12932904 --- /dev/null +++ b/include/asm-generic/bitops/builtin-ffs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN_FFS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN_FFS_H_
+/**
- ffs - find first bit set
- @x: the word to search
- This is defined the same way as
- the libc and compiler builtin ffs routines, therefore
- differs in spirit from ffz (man ffs).
- */
+#define ffs(x) __builtin_ffs(x)
+#endif diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h new file mode 100644 index 0000000000..c8455cc288 --- /dev/null +++ b/include/asm-generic/bitops/builtin-fls.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_ +#define _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_
+/**
- fls - find last (most-significant) bit set
- @x: the word to search
- This is defined the same way as ffs.
- Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
- */
+static __always_inline int fls(unsigned int x) +{
return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
+}
+#endif
2.40.1
participants (3)
-
Sean Anderson
-
Tom Rini
-
Tony Dinh