[U-Boot-Users] [PATCH] mips: Bring over optimized memset() routine from Linux.

This commit pulls over the memset() MIPS routine from Linux 2.6.26, which provides a 10x to 20x speedup over the generic byte-at-a-time routine. This is especially useful on platforms with manual ECC scrubbing that require all of memory to be written at least once after a power cycle.
---
 include/asm-mips/string.h |    2 +-
 lib_mips/Makefile         |    2 +-
 lib_mips/memset.S         |  174 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 lib_mips/memset.S
diff --git a/include/asm-mips/string.h b/include/asm-mips/string.h
index 579a591..0df1463 100644
--- a/include/asm-mips/string.h
+++ b/include/asm-mips/string.h
@@ -27,7 +27,7 @@ extern int strcmp(__const__ char *__cs, __const__ char *__ct);
 #undef __HAVE_ARCH_STRNCMP
 extern int strncmp(__const__ char *__cs, __const__ char *__ct, __kernel_size_t __count);
 
-#undef __HAVE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMSET
 extern void *memset(void *__s, int __c, __kernel_size_t __count);
 
 #undef __HAVE_ARCH_MEMCPY
diff --git a/lib_mips/Makefile b/lib_mips/Makefile
index 8176437..9149039 100644
--- a/lib_mips/Makefile
+++ b/lib_mips/Makefile
@@ -25,7 +25,7 @@ include $(TOPDIR)/config.mk
 
 LIB     = $(obj)lib$(ARCH).a
 
-SOBJS-y +=
+SOBJS-y += memset.o
 
 COBJS-y += board.o
 COBJS-y += bootm.o
diff --git a/lib_mips/memset.S b/lib_mips/memset.S
new file mode 100644
index 0000000..f1c07d7
--- /dev/null
+++ b/lib_mips/memset.S
@@ -0,0 +1,174 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998, 1999, 2000 by Ralf Baechle
+ * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
+ * Copyright (C) 2007 Maciej W. Rozycki
+ */
+#include <asm/asm.h>
+//#include <asm/asm-offsets.h>
+#include <asm/regdef.h>
+
+#if LONGSIZE == 4
+#define LONG_S_L swl
+#define LONG_S_R swr
+#else
+#define LONG_S_L sdl
+#define LONG_S_R sdr
+#endif
+
+#define EX(insn,reg,addr,handler)                       \
+9:      insn    reg, addr;                              \
+        .section __ex_table,"a";                        \
+        PTR     9b, handler;                            \
+        .previous
+
+        .macro  f_fill64 dst, offset, val, fixup
+        EX(LONG_S, \val, (\offset +  0 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  1 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  2 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  3 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  4 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  5 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  6 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  7 * LONGSIZE)(\dst), \fixup)
+#if LONGSIZE == 4
+        EX(LONG_S, \val, (\offset +  8 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  9 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 10 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 11 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 12 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 13 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 14 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 15 * LONGSIZE)(\dst), \fixup)
+#endif
+        .endm
+
+/*
+ * memset(void *s, int c, size_t n)
+ *
+ * a0: start of area to clear
+ * a1: char to fill with
+ * a2: size of area to clear
+ */
+        .set    noreorder
+        .align  5
+LEAF(memset)
+        beqz            a1, 1f
+         move           v0, a0                  /* result */
+
+        andi            a1, 0xff                /* spread fillword */
+        LONG_SLL        t1, a1, 8
+        or              a1, t1
+        LONG_SLL        t1, a1, 16
+#if LONGSIZE == 8
+        or              a1, t1
+        LONG_SLL        t1, a1, 32
+#endif
+        or              a1, t1
+1:
+
+FEXPORT(__bzero)
+        sltiu           t0, a2, LONGSIZE        /* very small region? */
+        bnez            t0, .Lsmall_memset
+         andi           t0, a0, LONGMASK        /* aligned? */
+
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
+        beqz            t0, 1f
+         PTR_SUBU       t0, LONGSIZE            /* alignment in bytes */
+#else
+        .set            noat
+        li              AT, LONGSIZE
+        beqz            t0, 1f
+         PTR_SUBU       t0, AT                  /* alignment in bytes */
+        .set            at
+#endif
+
+        R10KCBARRIER(0(ra))
+#ifdef __MIPSEB__
+        EX(LONG_S_L, a1, (a0), .Lfirst_fixup)   /* make word/dword aligned */
+#endif
+#ifdef __MIPSEL__
+        EX(LONG_S_R, a1, (a0), .Lfirst_fixup)   /* make word/dword aligned */
+#endif
+        PTR_SUBU        a0, t0                  /* long align ptr */
+        PTR_ADDU        a2, t0                  /* correct size */
+
+1:      ori             t1, a2, 0x3f            /* # of full blocks */
+        xori            t1, 0x3f
+        beqz            t1, .Lmemset_partial    /* no block to fill */
+         andi           t0, a2, 0x40-LONGSIZE
+
+        PTR_ADDU        t1, a0                  /* end address */
+        .set            reorder
+1:      PTR_ADDIU       a0, 64
+        R10KCBARRIER(0(ra))
+        f_fill64 a0, -64, a1, .Lfwd_fixup
+        bne             t1, a0, 1b
+        .set            noreorder
+
+.Lmemset_partial:
+        R10KCBARRIER(0(ra))
+        PTR_LA          t1, 2f                  /* where to start */
+#if LONGSIZE == 4
+        PTR_SUBU        t1, t0
+#else
+        .set            noat
+        LONG_SRL        AT, t0, 1
+        PTR_SUBU        t1, AT
+        .set            at
+#endif
+        jr              t1
+         PTR_ADDU       a0, t0                  /* dest ptr */
+
+        .set            push
+        .set            noreorder
+        .set            nomacro
+        f_fill64 a0, -64, a1, .Lpartial_fixup   /* ... but first do longs ... */
+2:      .set            pop
+        andi            a2, LONGMASK            /* At most one long to go */
+
+        beqz            a2, 1f
+         PTR_ADDU       a0, a2                  /* What's left */
+        R10KCBARRIER(0(ra))
+#ifdef __MIPSEB__
+        EX(LONG_S_R, a1, -1(a0), .Llast_fixup)
+#endif
+#ifdef __MIPSEL__
+        EX(LONG_S_L, a1, -1(a0), .Llast_fixup)
+#endif
+1:      jr              ra
+         move           a2, zero
+
+.Lsmall_memset:
+        beqz            a2, 2f
+         PTR_ADDU       t1, a0, a2
+
+1:      PTR_ADDIU       a0, 1                   /* fill bytewise */
+        R10KCBARRIER(0(ra))
+        bne             t1, a0, 1b
+         sb             a1, -1(a0)
+
+2:      jr              ra                      /* done */
+         move           a2, zero
+        END(memset)
+
+.Lfirst_fixup:
+        jr      ra
+         nop
+
+.Lfwd_fixup:
+        andi    a2, 0x3f
+        jr      ra
+         LONG_ADDU      a2, t1
+
+.Lpartial_fixup:
+        andi    a2, LONGMASK
+        jr      ra
+         LONG_ADDU      a2, t1
+
+.Llast_fixup:
+        jr      ra
+         andi   v1, a2, LONGMASK
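For reference, the generic routine this patch replaces is a plain byte-at-a-time loop. The following is a sketch from memory of U-Boot's lib_generic/string.c version, not a verbatim copy:

    #include <linux/types.h>	/* size_t */

    /* Generic byte-wise memset: each iteration issues a one-byte store,
     * so filling N bytes costs N stores plus loop overhead.  The assembly
     * routine above instead issues aligned word/dword stores in unrolled
     * 64-byte blocks, which is where the 10x-20x speedup comes from. */
    void *memset(void *s, int c, size_t count)
    {
    	char *xs = (char *)s;

    	while (count--)
    		*xs++ = c;

    	return s;
    }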

Hi Jason,
Jason McMullan wrote:
This commit pulls over the memset() MIPS routine from Linux 2.6.26, which provides a 10x to 20x speedup over the generic byte-at-a-time routine. This is especially useful on platforms with manual ECC scrubbing that require all of memory to be written at least once after a power cycle.
 include/asm-mips/string.h |    2 +-
 lib_mips/Makefile         |    2 +-
 lib_mips/memset.S         |  174 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 lib_mips/memset.S
IIRC, Linux's memset relies on AdEL/AdES exceptions. We have Status.EXL enabled, but we don't have proper exception handlers yet. So my question is: does this code always work as expected, or does it only work under some alignment restriction?
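To illustrate the concern: the EX() macro in the patch records each potentially-faulting store in the __ex_table section, and Linux's address-error handler looks the faulting PC up in that table and resumes at the matching .L*_fixup stub. A minimal C sketch of that lookup follows, with illustrative symbol names (U-Boot provides neither this handler nor these symbols):

    /* One record per EX() store: where it may fault, where to resume. */
    struct exception_table_entry {
    	unsigned long insn;	/* address of the '9:' store instruction */
    	unsigned long fixup;	/* address of the .L*_fixup stub */
    };

    /* Hypothetical linker-provided bounds of the __ex_table section. */
    extern struct exception_table_entry __start___ex_table[];
    extern struct exception_table_entry __stop___ex_table[];

    /* Called from the AdES/AdEL exception path with the faulting EPC;
     * returns the fixup address, or 0 for a genuine address error. */
    static unsigned long search_exception_table(unsigned long epc)
    {
    	struct exception_table_entry *e;

    	for (e = __start___ex_table; e < __stop___ex_table; e++)
    		if (e->insn == epc)
    			return e->fixup;
    	return 0;
    }

If one of those stores actually faulted in U-Boot, there would be no handler to recover it.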
And some nitpicks; see below.
diff --git a/lib_mips/memset.S b/lib_mips/memset.S
new file mode 100644
index 0000000..f1c07d7
--- /dev/null
+++ b/lib_mips/memset.S
@@ -0,0 +1,174 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998, 1999, 2000 by Ralf Baechle
+ * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
+ * Copyright (C) 2007 Maciej W. Rozycki
+ */
+#include <asm/asm.h>
+//#include <asm/asm-offsets.h>
Please remove the unused #include. Even '#if 0'-ing code out is not allowed by U-Boot policy.
+#include <asm/regdef.h>
+#if LONGSIZE == 4
+#define LONG_S_L swl
+#define LONG_S_R swr
+#else
+#define LONG_S_L sdl
+#define LONG_S_R sdr
+#endif
+#define EX(insn,reg,addr,handler)                      \
+9:      insn    reg, addr;                              \
+        .section __ex_table,"a";                        \
+        PTR     9b, handler;                            \
+        .previous
+        .macro  f_fill64 dst, offset, val, fixup
+        EX(LONG_S, \val, (\offset +  0 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  1 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  2 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  3 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  4 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  5 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  6 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  7 * LONGSIZE)(\dst), \fixup)
+#if LONGSIZE == 4
+        EX(LONG_S, \val, (\offset +  8 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset +  9 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 10 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 11 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 12 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 13 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 14 * LONGSIZE)(\dst), \fixup)
+        EX(LONG_S, \val, (\offset + 15 * LONGSIZE)(\dst), \fixup)
+#endif
+        .endm
+/*
+ * memset(void *s, int c, size_t n)
+ *
+ * a0: start of area to clear
+ * a1: char to fill with
+ * a2: size of area to clear
+ */
+        .set    noreorder
+        .align  5
+LEAF(memset)
+        beqz            a1, 1f
+         move           v0, a0                  /* result */
         ^
+        andi            a1, 0xff                /* spread fillword */
+        LONG_SLL        t1, a1, 8
+        or              a1, t1
+        LONG_SLL        t1, a1, 16
+#if LONGSIZE == 8
+        or              a1, t1
+        LONG_SLL        t1, a1, 32
+#endif
+        or              a1, t1
+1:
+
+FEXPORT(__bzero)
+        sltiu           t0, a2, LONGSIZE        /* very small region? */
+        bnez            t0, .Lsmall_memset
+         andi           t0, a0, LONGMASK        /* aligned? */
         ^
[further part snipped]
Please fix the wrong indentation with proper tabs. I know this is exactly the same as Linux's memset, but we prefer to fix it correctly in U-Boot.
[ I used to do as you did, but changed my mind; now I think this is the better practice. Indentation differences from Linux are not a big deal IMO, since the diff -w option blows them away. ]
Thanks in advance,

Shinya Kuribayashi wrote:
+        andi            a1, 0xff                /* spread fillword */
+        LONG_SLL        t1, a1, 8
+        or              a1, t1
+        LONG_SLL        t1, a1, 16
+#if LONGSIZE == 8
+        or              a1, t1
+        LONG_SLL        t1, a1, 32
+#endif
+        or              a1, t1
+1:
+
+FEXPORT(__bzero)
+        sltiu           t0, a2, LONGSIZE        /* very small region? */
+        bnez            t0, .Lsmall_memset
+         andi           t0, a0, LONGMASK        /* aligned? */
         ^
[further part snipped]
Please fix the wrong indentation with proper tabs. I know this is exactly the same as Linux's memset, but we prefer to fix it correctly in U-Boot.
I found that the above is an intentional space, meant to indicate that the instruction is in the delay slot. I think it's probably a good old convention in MIPS assembly programming, and I would like to leave it as it is, IMHO.
Anyway, sorry for my ignorance and please ignore my comments on this.

In message <48521C14.4010504@necel.com> you wrote:
I found that the above is an intentional space, meant to indicate that the instruction is in the delay slot. I think it's probably a good old convention in MIPS assembly programming, and I would like to leave it as it is, IMHO.
Indeed. If it has a deeper meaning, this should be left as is.
Anyway, sorry for my ignorance and please ignore my comments on this.
Thanks for the explanation - I think most of us were not aware of any such conventions. Speaking for me - I definitely was not.
Best regards,
Wolfgang Denk

In message <20080604194815.A02FD6E7BD@mcmullan-linux.hq.netapp.com> you wrote:
This commit pulls over the memset() MIPS routine from Linux 2.6.26, which provides a 10x to 20x speedup over the generic byte-at-a-time routine. This is especially useful on platforms with manual ECC scrubbing that require all of memory to be written at least once after a power cycle.
 include/asm-mips/string.h |    2 +-
 lib_mips/Makefile         |    2 +-
 lib_mips/memset.S         |  174 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 lib_mips/memset.S
Shinya Kuribayashi asked some questions about your patch, which you did not answer (as far as I can tell).
Do you intend to comment on the questions and/or submit a cleaned up version of the patch?
Best regards,
Wolfgang Denk

On Sun, 2008-07-06 at 00:32 +0200, Wolfgang Denk wrote:
In message <20080604194815.A02FD6E7BD@mcmullan-linux.hq.netapp.com> you wrote:
This commit pulls over the memset() MIPS routine from Linux 2.6.26, which provides a 10x to 20x speedup over the generic byte-at-a-time routine. This is especially useful on platforms with manual ECC scrubbing that require all of memory to be written at least once after a power cycle.
Do you intend to comment on the questions and/or submit a cleaned up version of the patch?
Unfortunately, no follow-up patch is forthcoming.
I was able to use a spare DMA engine on our SOC to perform the memory zeroing, which eliminated the need for the enhanced memset() routine.
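For readers wondering what that looks like: a memory-to-memory DMA engine with a constant-fill mode can seed all of DRAM (and its ECC check bits) without any CPU stores. The sketch below is purely hypothetical; the register names, addresses, and bit layout are invented for illustration, since the real SOC's programming model is not described in this thread:

    /* Hypothetical MMIO interface of a DMA engine with a fill mode. */
    #define DMA_BASE       0xbf000000UL	/* invented base address */
    #define DMA_FILLVAL    (*(volatile unsigned long *)(DMA_BASE + 0x0))
    #define DMA_DST        (*(volatile unsigned long *)(DMA_BASE + 0x4))
    #define DMA_LEN        (*(volatile unsigned long *)(DMA_BASE + 0x8))
    #define DMA_CTRL       (*(volatile unsigned long *)(DMA_BASE + 0xc))
    #define DMA_CTRL_FILL  (1UL << 0)	/* start a constant-fill transfer */
    #define DMA_CTRL_BUSY  (1UL << 31)	/* engine still running */

    /* Zero 'len' bytes of DRAM at physical address 'dst' to scrub ECC. */
    static void dma_zero(unsigned long dst, unsigned long len)
    {
    	DMA_FILLVAL = 0;		/* value to replicate */
    	DMA_DST = dst;
    	DMA_LEN = len;
    	DMA_CTRL = DMA_CTRL_FILL;	/* kick off the engine */

    	while (DMA_CTRL & DMA_CTRL_BUSY)
    		;			/* poll until the fill completes */
    }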
Also, I am not familiar with the intricacies of MIPS exception handling for alignment issues, so I was not able to come up with a good answer to Shinya Kuribayashi's questions about the alignment trap issue.
Please retract the patch.
Jason McMullan
MTS SW, System Firmware
NetApp
724.741.5011 Fax
724.741.5166 Direct
412.656.3519 Mobile
jason.mcmullan@netapp.com
www.netapp.com