[U-Boot] [PATCH] nios2: convert cache flush to use dm cpu data

Convert cache flush to use dm cpu data. The cacheflush.c of Linux nios2 arch is copied to arch/nios2/lib/cache.c to replace the cache.S. The cache related functions in cpu.c is moved to cache.c. Both flush_dcache() and flush_icache() are replaced and removed. The flush_dcache_all() now flush icache too, which is the same as what is done in Linux nios2 arch.
Signed-off-by: Thomas Chou thomas@wytron.com.tw --- arch/nios2/cpu/cpu.c | 15 -------- arch/nios2/include/asm/cache.h | 13 ++----- arch/nios2/lib/bootm.c | 6 +-- arch/nios2/lib/cache.S | 68 ---------------------------------- arch/nios2/lib/cache.c | 84 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 88 insertions(+), 98 deletions(-) delete mode 100644 arch/nios2/lib/cache.S create mode 100644 arch/nios2/lib/cache.c
diff --git a/arch/nios2/cpu/cpu.c b/arch/nios2/cpu/cpu.c index 5403c0d..8607c95 100644 --- a/arch/nios2/cpu/cpu.c +++ b/arch/nios2/cpu/cpu.c @@ -29,21 +29,6 @@ int do_reset(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]) return 0; }
-int dcache_status(void) -{ - return 1; -} - -void dcache_enable(void) -{ - flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE); -} - -void dcache_disable(void) -{ - flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE); -} - /* * COPY EXCEPTION TRAMPOLINE -- copy the tramp to the * exception address. Define CONFIG_ROM_STUBS to prevent diff --git a/arch/nios2/include/asm/cache.h b/arch/nios2/include/asm/cache.h index 9b87c9f..dde43cd 100644 --- a/arch/nios2/include/asm/cache.h +++ b/arch/nios2/include/asm/cache.h @@ -8,18 +8,11 @@ #ifndef __ASM_NIOS2_CACHE_H_ #define __ASM_NIOS2_CACHE_H_
-extern void flush_dcache (unsigned long start, unsigned long size); -extern void flush_icache (unsigned long start, unsigned long size); - /* - * Valid L1 data cache line sizes for the NIOS2 architecture are 4, 16, and 32 - * bytes. If the board configuration has not specified one we default to the - * largest of these values for alignment of DMA buffers. + * Valid L1 data cache line sizes for the NIOS2 architecture are 4, + * 16, and 32 bytes. We default to the largest of these values for + * alignment of DMA buffers. */ -#ifdef CONFIG_SYS_CACHELINE_SIZE -#define ARCH_DMA_MINALIGN CONFIG_SYS_CACHELINE_SIZE -#else #define ARCH_DMA_MINALIGN 32 -#endif
#endif /* __ASM_NIOS2_CACHE_H_ */ diff --git a/arch/nios2/lib/bootm.c b/arch/nios2/lib/bootm.c index c730a3f..4e5c269 100644 --- a/arch/nios2/lib/bootm.c +++ b/arch/nios2/lib/bootm.c @@ -6,9 +6,6 @@ */
#include <common.h> -#include <command.h> -#include <asm/byteorder.h> -#include <asm/cache.h>
#define NIOS_MAGIC 0x534f494e /* enable command line and initrd passing */
@@ -40,8 +37,7 @@ int do_bootm_linux(int flag, int argc, char * const argv[], bootm_headers_t *ima
/* flushes data and instruction caches before calling the kernel */ disable_interrupts(); - flush_dcache((ulong)kernel, CONFIG_SYS_DCACHE_SIZE); - flush_icache((ulong)kernel, CONFIG_SYS_ICACHE_SIZE); + flush_dcache_all();
debug("bootargs=%s @ 0x%lx\n", commandline, (ulong)&commandline); debug("initrd=0x%lx-0x%lx\n", (ulong)initrd_start, (ulong)initrd_end); diff --git a/arch/nios2/lib/cache.S b/arch/nios2/lib/cache.S deleted file mode 100644 index 683f005..0000000 --- a/arch/nios2/lib/cache.S +++ /dev/null @@ -1,68 +0,0 @@ -/* - * (C) Copyright 2004, Psyent Corporation <www.psyent.com> - * Scott McNutt smcnutt@psyent.com - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -#include <config.h> - - .text - - .global flush_dcache - -flush_dcache: - add r5, r5, r4 - movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE) -0: flushd 0(r4) - add r4, r4, r8 - bltu r4, r5, 0b - ret - - - .global flush_icache - -flush_icache: - add r5, r5, r4 - movhi r8, %hi(CONFIG_SYS_ICACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE) -1: flushi r4 - add r4, r4, r8 - bltu r4, r5, 1b - ret - - .global flush_dcache_range - -flush_dcache_range: - movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE) -0: flushd 0(r4) - add r4, r4, r8 - bltu r4, r5, 0b - ret - - .global flush_cache - -flush_cache: - add r5, r5, r4 - mov r9, r4 - mov r10, r5 - - movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE) -0: flushd 0(r4) - add r4, r4, r8 - bltu r4, r5, 0b - - mov r4, r9 - mov r5, r10 - movhi r8, %hi(CONFIG_SYS_ICACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE) -1: flushi r4 - add r4, r4, r8 - bltu r4, r5, 1b - - sync - flushp - ret diff --git a/arch/nios2/lib/cache.c b/arch/nios2/lib/cache.c new file mode 100644 index 0000000..6f26d8d --- /dev/null +++ b/arch/nios2/lib/cache.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2015 Thomas Chou thomas@wytron.com.tw + * Copyright (C) 2009, Wind River Systems Inc + * Implemented by fredrik.markstrom@gmail.com and ivarholmqvist@gmail.com + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#include <common.h> +#include <asm/cache.h> + +DECLARE_GLOBAL_DATA_PTR; + +static void __flush_dcache_all(unsigned long start, unsigned long end) +{ + unsigned long addr; + + start &= ~(gd->arch.dcache_line_size - 1); + end += (gd->arch.dcache_line_size - 1); + end &= ~(gd->arch.dcache_line_size - 1); + + if (end > start + gd->arch.dcache_size) + end = start + gd->arch.dcache_size; + + for (addr = start; addr < end; addr += gd->arch.dcache_line_size) { + __asm__ __volatile__ (" flushd 0(%0)\n" + : /* Outputs */ + : /* Inputs */ "r"(addr) + /* : No clobber */); + } +} + +static void __flush_icache(unsigned long start, unsigned long end) +{ + unsigned long addr; + + start &= ~(gd->arch.icache_line_size - 1); + end += (gd->arch.icache_line_size - 1); + end &= ~(gd->arch.icache_line_size - 1); + + if (end > start + gd->arch.icache_size) + end = start + gd->arch.icache_size; + + for (addr = start; addr < end; addr += gd->arch.icache_line_size) { + __asm__ __volatile__ (" flushi %0\n" + : /* Outputs */ + : /* Inputs */ "r"(addr) + /* : No clobber */); + } + __asm__ __volatile(" flushp\n"); +} + +void flush_dcache_all(void) +{ + __flush_dcache_all(0, gd->arch.dcache_size); + __flush_icache(0, gd->arch.icache_size); +} + +void flush_dcache_range(unsigned long start, unsigned long end) +{ + __flush_dcache_all(start, end); +} + +void flush_cache(unsigned long start, unsigned long size) +{ + __flush_dcache_all(start, start + size); + __flush_icache(start, start + size); +} + +/* nios2 data cache is always enabled */ +int dcache_status(void) +{ + return 1; +} + +void dcache_enable(void) +{ + flush_dcache_all(); +} + +void dcache_disable(void) +{ + flush_dcache_all(); +}

On Tuesday, October 06, 2015 at 10:20:00 AM, Thomas Chou wrote:
Convert cache flush to use dm cpu data. The cacheflush.c of Linux nios2 arch is copied to arch/nios2/lib/cache.c to replace the cache.S. The cache related functions in cpu.c is moved to cache.c. Both flush_dcache() and flush_icache() are replaced and removed. The flush_dcache_all() now flush icache too, which is
... confusing as hell :-(
the same as what is done in Linux nios2 arch.
Signed-off-by: Thomas Chou thomas@wytron.com.tw
arch/nios2/cpu/cpu.c | 15 -------- arch/nios2/include/asm/cache.h | 13 ++----- arch/nios2/lib/bootm.c | 6 +-- arch/nios2/lib/cache.S | 68 ---------------------------------- arch/nios2/lib/cache.c | 84 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 88 insertions(+), 98 deletions(-) delete mode 100644 arch/nios2/lib/cache.S create mode 100644 arch/nios2/lib/cache.c
diff --git a/arch/nios2/cpu/cpu.c b/arch/nios2/cpu/cpu.c index 5403c0d..8607c95 100644 --- a/arch/nios2/cpu/cpu.c +++ b/arch/nios2/cpu/cpu.c @@ -29,21 +29,6 @@ int do_reset(cmd_tbl_t *cmdtp, int flag, int argc, char
- const argv[]) return 0;
}
-int dcache_status(void) -{
- return 1;
-}
-void dcache_enable(void) -{
- flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE);
-}
-void dcache_disable(void) -{
- flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE);
-}
/*
- COPY EXCEPTION TRAMPOLINE -- copy the tramp to the
- exception address. Define CONFIG_ROM_STUBS to prevent
diff --git a/arch/nios2/include/asm/cache.h b/arch/nios2/include/asm/cache.h index 9b87c9f..dde43cd 100644 --- a/arch/nios2/include/asm/cache.h +++ b/arch/nios2/include/asm/cache.h @@ -8,18 +8,11 @@ #ifndef __ASM_NIOS2_CACHE_H_ #define __ASM_NIOS2_CACHE_H_
-extern void flush_dcache (unsigned long start, unsigned long size); -extern void flush_icache (unsigned long start, unsigned long size);
/*
- Valid L1 data cache line sizes for the NIOS2 architecture are 4, 16,
and 32 - * bytes. If the board configuration has not specified one we default to the - * largest of these values for alignment of DMA buffers.
- Valid L1 data cache line sizes for the NIOS2 architecture are 4,
- 16, and 32 bytes. We default to the largest of these values for
*/
- alignment of DMA buffers.
-#ifdef CONFIG_SYS_CACHELINE_SIZE -#define ARCH_DMA_MINALIGN CONFIG_SYS_CACHELINE_SIZE -#else #define ARCH_DMA_MINALIGN 32 -#endif
#endif /* __ASM_NIOS2_CACHE_H_ */ diff --git a/arch/nios2/lib/bootm.c b/arch/nios2/lib/bootm.c index c730a3f..4e5c269 100644 --- a/arch/nios2/lib/bootm.c +++ b/arch/nios2/lib/bootm.c @@ -6,9 +6,6 @@ */
#include <common.h> -#include <command.h> -#include <asm/byteorder.h> -#include <asm/cache.h>
#define NIOS_MAGIC 0x534f494e /* enable command line and initrd passing */
@@ -40,8 +37,7 @@ int do_bootm_linux(int flag, int argc, char * const argv[], bootm_headers_t *ima
/* flushes data and instruction caches before calling the kernel */ disable_interrupts();
- flush_dcache((ulong)kernel, CONFIG_SYS_DCACHE_SIZE);
- flush_icache((ulong)kernel, CONFIG_SYS_ICACHE_SIZE);
flush_dcache_all();
debug("bootargs=%s @ 0x%lx\n", commandline, (ulong)&commandline); debug("initrd=0x%lx-0x%lx\n", (ulong)initrd_start, (ulong)initrd_end);
diff --git a/arch/nios2/lib/cache.S b/arch/nios2/lib/cache.S deleted file mode 100644 index 683f005..0000000 --- a/arch/nios2/lib/cache.S +++ /dev/null @@ -1,68 +0,0 @@ -/*
- (C) Copyright 2004, Psyent Corporation <www.psyent.com>
- Scott McNutt smcnutt@psyent.com
- SPDX-License-Identifier: GPL-2.0+
- */
-#include <config.h>
- .text
- .global flush_dcache
-flush_dcache:
- add r5, r5, r4
- movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
- ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
-0: flushd 0(r4)
- add r4, r4, r8
- bltu r4, r5, 0b
- ret
- .global flush_icache
-flush_icache:
- add r5, r5, r4
- movhi r8, %hi(CONFIG_SYS_ICACHELINE_SIZE)
- ori r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE)
-1: flushi r4
- add r4, r4, r8
- bltu r4, r5, 1b
- ret
- .global flush_dcache_range
-flush_dcache_range:
- movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
- ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
-0: flushd 0(r4)
- add r4, r4, r8
- bltu r4, r5, 0b
- ret
- .global flush_cache
-flush_cache:
- add r5, r5, r4
- mov r9, r4
- mov r10, r5
- movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
- ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
-0: flushd 0(r4)
- add r4, r4, r8
- bltu r4, r5, 0b
- mov r4, r9
- mov r5, r10
- movhi r8, %hi(CONFIG_SYS_ICACHELINE_SIZE)
- ori r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE)
-1: flushi r4
- add r4, r4, r8
- bltu r4, r5, 1b
- sync
- flushp
- ret
diff --git a/arch/nios2/lib/cache.c b/arch/nios2/lib/cache.c new file mode 100644 index 0000000..6f26d8d --- /dev/null +++ b/arch/nios2/lib/cache.c @@ -0,0 +1,84 @@ +/*
- Copyright (C) 2015 Thomas Chou thomas@wytron.com.tw
- Copyright (C) 2009, Wind River Systems Inc
- Implemented by fredrik.markstrom@gmail.com and ivarholmqvist@gmail.com
- SPDX-License-Identifier: GPL-2.0+
- */
+#include <common.h> +#include <asm/cache.h>
+DECLARE_GLOBAL_DATA_PTR;
+static void __flush_dcache_all(unsigned long start, unsigned long end) +{
- unsigned long addr;
- start &= ~(gd->arch.dcache_line_size - 1);
I'd suggest to use GENMASK() here, but I don't think we picked this from Linux just yet.
- end += (gd->arch.dcache_line_size - 1);
- end &= ~(gd->arch.dcache_line_size - 1);
Is this an attempt at poor-mans' rounding ? I think you want to implment something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT do any rounding here. The reason for that is that if you do rounding, you might accidentally corrupt a piece of memory which was just delivered via DMA before you did the flush.
- if (end > start + gd->arch.dcache_size)
end = start + gd->arch.dcache_size;
- for (addr = start; addr < end; addr += gd->arch.dcache_line_size) {
__asm__ __volatile__ (" flushd 0(%0)\n"
: /* Outputs */
: /* Inputs */ "r"(addr)
/* : No clobber */);
- }
+}
+static void __flush_icache(unsigned long start, unsigned long end) +{
- unsigned long addr;
- start &= ~(gd->arch.icache_line_size - 1);
- end += (gd->arch.icache_line_size - 1);
- end &= ~(gd->arch.icache_line_size - 1);
- if (end > start + gd->arch.icache_size)
end = start + gd->arch.icache_size;
- for (addr = start; addr < end; addr += gd->arch.icache_line_size) {
__asm__ __volatile__ (" flushi %0\n"
: /* Outputs */
: /* Inputs */ "r"(addr)
/* : No clobber */);
- }
- __asm__ __volatile(" flushp\n");
+}
+void flush_dcache_all(void) +{
- __flush_dcache_all(0, gd->arch.dcache_size);
- __flush_icache(0, gd->arch.icache_size);
+}
+void flush_dcache_range(unsigned long start, unsigned long end) +{
- __flush_dcache_all(start, end);
+}
+void flush_cache(unsigned long start, unsigned long size) +{
- __flush_dcache_all(start, start + size);
- __flush_icache(start, start + size);
+}
+/* nios2 data cache is always enabled */ +int dcache_status(void) +{
- return 1;
+}
+void dcache_enable(void) +{
- flush_dcache_all();
+}
+void dcache_disable(void) +{
- flush_dcache_all();
+}

On Fri, Oct 9, 2015 at 5:39 AM, Marek Vasut marex@denx.de wrote:
+DECLARE_GLOBAL_DATA_PTR;
+static void __flush_dcache_all(unsigned long start, unsigned long end) +{
unsigned long addr;
start &= ~(gd->arch.dcache_line_size - 1);
I'd suggest to use GENMASK() here, but I don't think we picked this from Linux just yet.
end += (gd->arch.dcache_line_size - 1);
end &= ~(gd->arch.dcache_line_size - 1);
Is this an attempt at poor-mans' rounding ? I think you want to implment something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT do any rounding here. The reason for that is that if you do rounding, you might accidentally corrupt a piece of memory which was just delivered via DMA before you did the flush.
The code above is to convert the address to dcache line size. arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache flushing if it is unaligned to cache line size. I'm not sure how frequent U-boot access to non-aligned cache line size.
Regards Ley Foon

Hi Marek,
On 10/09/2015 10:49 AM, Ley Foon Tan wrote:
Is this an attempt at poor-mans' rounding ? I think you want to implment something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT do any rounding here. The reason for that is that if you do rounding, you might accidentally corrupt a piece of memory which was just delivered via DMA before you did the flush.
The code above is to convert the address to dcache line size. arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache flushing if it is unaligned to cache line size. I'm not sure how frequent U-boot access to non-aligned cache line size.
Thanks a lot for your looking into this, Ley Foon.
I think we take cache flushing in a different way to arm926ejs.
In nios2 driver programming, we would request all the DMA buffers be aligned to cache line. This is necessary to avoid the cache racing issue as you mention above.
In nios2, we don't skip the flushing when the inputs are not aligned like that of arm926ejs. We always flush all cache lines in the range, even if a single byte to flush is in request. So the inputs are rounded to get the lower and upper cache lines range inside the cache flush functions. The caller need not be aware of the detail.
In the copy_exception_trampoline() patch, both dcache and icache must be flushed at the exception target address. Though the flush range is only 12 bytes, which won't be aligned.
Thank you for your review.
Best regards, Thomas

On Friday, October 09, 2015 at 10:00:26 AM, Thomas Chou wrote:
Hi Marek,
On 10/09/2015 10:49 AM, Ley Foon Tan wrote:
Is this an attempt at poor-mans' rounding ? I think you want to implment something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT do any rounding here. The reason for that is that if you do rounding, you might accidentally corrupt a piece of memory which was just delivered via DMA before you did the flush.
The code above is to convert the address to dcache line size. arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache flushing if it is unaligned to cache line size. I'm not sure how frequent U-boot access to non-aligned cache line size.
Thanks a lot for your looking into this, Ley Foon.
I think we take cache flushing in a different way to arm926ejs.
In nios2 driver programming, we would request all the DMA buffers be aligned to cache line. This is necessary to avoid the cache racing issue as you mention above.
This is what ARM does as well.
In nios2, we don't skip the flushing when the inputs are not aligned like that of arm926ejs. We always flush all cache lines in the range, even if a single byte to flush is in request. So the inputs are rounded to get the lower and upper cache lines range inside the cache flush functions. The caller need not be aware of the detail.
This is incorrect and all the places which produce these unaligned cache operations must be fixed.
In the copy_exception_trampoline() patch, both dcache and icache must be flushed at the exception target address. Though the flush range is only 12 bytes, which won't be aligned.
Thank you for your review.
Best regards, Thomas
Best regards, Marek Vasut

Hi Marek,
On 10/09/2015 10:42 PM, Marek Vasut wrote:
In nios2, we don't skip the flushing when the inputs are not aligned like that of arm926ejs. We always flush all cache lines in the range, even if a single byte to flush is in request. So the inputs are rounded to get the lower and upper cache lines range inside the cache flush functions. The caller need not be aware of the detail.
This is incorrect and all the places which produce these unaligned cache operations must be fixed.
I take a look into the cache flush operations in every arch of u-boot. It turns out that the arm926ejs is the only platform that does such cache line range check and skip. All other ARM and all other arch don't. And the cache flush in Linux don't.
Best regards, Thomas

Hi Marek,
On 10/10/2015 01:55 PM, Thomas Chou wrote:
Hi Marek,
On 10/09/2015 10:42 PM, Marek Vasut wrote:
In nios2, we don't skip the flushing when the inputs are not aligned like that of arm926ejs. We always flush all cache lines in the range, even if a single byte to flush is in request. So the inputs are rounded to get the lower and upper cache lines range inside the cache flush functions. The caller need not be aware of the detail.
This is incorrect and all the places which produce these unaligned cache operations must be fixed.
I take a look into the cache flush operations in every arch of u-boot. It turns out that the arm926ejs is the only platform that does such cache line range check and skip. All other ARM and all other arch don't. And the cache flush in Linux don't.
+arm11, which is based on the same code.
I see your patch on this range check. I would prefer that the details of cache line configuration be kept inside the cache flush operators, and need not be exposed to drivers. As drivers might be used by different arch with different cache implementation, L1,L2..etc. It is not good for drivers to adjust the flush range before passing to cache flush operators.
Best regards, Thomas

On Saturday, October 10, 2015 at 08:32:09 AM, Thomas Chou wrote:
Hi Marek,
Hi,
On 10/10/2015 01:55 PM, Thomas Chou wrote:
Hi Marek,
On 10/09/2015 10:42 PM, Marek Vasut wrote:
In nios2, we don't skip the flushing when the inputs are not aligned like that of arm926ejs. We always flush all cache lines in the range, even if a single byte to flush is in request. So the inputs are rounded to get the lower and upper cache lines range inside the cache flush functions. The caller need not be aware of the detail.
This is incorrect and all the places which produce these unaligned cache operations must be fixed.
I take a look into the cache flush operations in every arch of u-boot. It turns out that the arm926ejs is the only platform that does such cache line range check and skip. All other ARM and all other arch don't. And the cache flush in Linux don't.
+arm11, which is based on the same code.
I see your patch on this range check. I would prefer that the details of cache line configuration be kept inside the cache flush operators, and need not be exposed to drivers.
Then you'd also need means to allocate variables to aligned memory location to prevent invalid cache flush. (Linux does this with it's DMA API). We are much simpler and thus this abstraction is still not available. I wonder if the overhead of DMA API would be high or not for U-Boot.
As drivers might be used by different arch with different cache implementation, L1,L2..etc. It is not good for drivers to adjust the flush range before passing to cache flush operators.
It is even worse if the cache flush operators permit incorrect cache flushes or invalidations. Like I mentioned before, this can lead to hard to debug problems when using DMA (at least on ARM).
Best regards, Marek Vasut

Hi Marek,
On 10/11/2015 02:18 AM, Marek Vasut wrote:
Then you'd also need means to allocate variables to aligned memory location to prevent invalid cache flush. (Linux does this with it's DMA API). We are much simpler and thus this abstraction is still not available. I wonder if the overhead of DMA API would be high or not for U-Boot.
I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to allocate DMA buffers so that they are cache aligned.
It is even worse if the cache flush operators permit incorrect cache flushes or invalidations. Like I mentioned before, this can lead to hard to debug problems when using DMA (at least on ARM).
I would suggest debug check should be left as for debug only. The definition of common functions should be kept as it is more important than coding style.
I debugged DMA issues a lot in the past until I realized the importance of aligned buffers. So there should be a developer's guideline.
But it is even much more difficult when something you believed does not work as expected, what is taken as common sense. It will trap a lot of developers when they called your flush cache functions but was skipped just because, eg, the end of packets are not aligned which is usually the case.
I would suggest that, with the best of my knowledge, please change the range check to a debug probe, and restore the cache flush functions to the common definition.
Best regards, Thomas

On Sunday, October 11, 2015 at 02:38:35 AM, Thomas Chou wrote:
Hi Marek,
Hi,
On 10/11/2015 02:18 AM, Marek Vasut wrote:
Then you'd also need means to allocate variables to aligned memory location to prevent invalid cache flush. (Linux does this with it's DMA API). We are much simpler and thus this abstraction is still not available. I wonder if the overhead of DMA API would be high or not for U-Boot.
I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to allocate DMA buffers so that they are cache aligned.
That and include/memalign.h , which contains macros that are used to align variables.
It is even worse if the cache flush operators permit incorrect cache flushes or invalidations. Like I mentioned before, this can lead to hard to debug problems when using DMA (at least on ARM).
I would suggest debug check should be left as for debug only. The definition of common functions should be kept as it is more important than coding style.
Uh yes, that's what arm926 cache functions do, they're debug only.
I debugged DMA issues a lot in the past until I realized the importance of aligned buffers. So there should be a developer's guideline.
For what exactly?
But it is even much more difficult when something you believed does not work as expected, what is taken as common sense. It will trap a lot of developers when they called your flush cache functions but was skipped just because, eg, the end of packets are not aligned which is usually the case.
This is good, it should bite them, because this is a bug. If, on the other hand, you will paper over such bugs by adding crap to the cache ops, there will be even worse bugs coming for you, like variables which are sitting in the same cacheline as your unaligned buffer that you want to invalidate or flush will possibly get trashed by such cache operation.
Consider this:
cacheline 0: [ variable A ; buffer B ......... ] cacheline 1: [ buffer B ......... ; Empty .... ]
Now you do the following:
1) Variable A = 0; 2) Flush buffer B (which is unaligned, so flush cacheline 0 and 1) 3) Start DMA to buffer B 4) Variable A = 1; 5) Check if DMA finished, it did 6) Invalidate buffer B ... oh, but it's unaligned, let's invalidate everything around it, which is cacheline 0 and 1. 7) What is the value of variable A ? Oh, it's fetched from memory and it's 0 there, even though we did set it to 1 ...
I would suggest that, with the best of my knowledge, please change the range check to a debug probe, and restore the cache flush functions to the common definition.
See above, does my example make it clear why we should never ever hide bugs in the cache ops code ?
Best regards, Marek Vasut

Hi Marek,
On 10/11/2015 08:15 PM, Marek Vasut wrote:
On Sunday, October 11, 2015 at 02:38:35 AM, Thomas Chou wrote:
Hi Marek,
Hi,
On 10/11/2015 02:18 AM, Marek Vasut wrote:
Then you'd also need means to allocate variables to aligned memory location to prevent invalid cache flush. (Linux does this with it's DMA API). We are much simpler and thus this abstraction is still not available. I wonder if the overhead of DMA API would be high or not for U-Boot.
I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to allocate DMA buffers so that they are cache aligned.
That and include/memalign.h , which contains macros that are used to align variables.
Yes. It is malloc_cache_aligned(), which should be used to allocate DMA buffer. Thank you for the pointer.
It is even worse if the cache flush operators permit incorrect cache flushes or invalidations. Like I mentioned before, this can lead to hard to debug problems when using DMA (at least on ARM).
I would suggest debug check should be left as for debug only. The definition of common functions should be kept as it is more important than coding style.
Uh yes, that's what arm926 cache functions do, they're debug only.
I debugged DMA issues a lot in the past until I realized the importance of aligned buffers. So there should be a developer's guideline.
For what exactly?
For u-boot, every DMA buffer must be allocated with malloc_cache_aligned(). Then there will be not variables and DMA buffers cache racing issues as you describe below.
But it is even much more difficult when something you believed does not work as expected, what is taken as common sense. It will trap a lot of developers when they called your flush cache functions but was skipped just because, eg, the end of packets are not aligned which is usually the case.
This is good, it should bite them, because this is a bug. If, on the other hand, you will paper over such bugs by adding crap to the cache ops, there will be even worse bugs coming for you, like variables which are sitting in the same cacheline as your unaligned buffer that you want to invalidate or flush will possibly get trashed by such cache operation.
Consider this:
cacheline 0: [ variable A ; buffer B ......... ] cacheline 1: [ buffer B ......... ; Empty .... ]
Now you do the following:
- Variable A = 0;
- Flush buffer B (which is unaligned, so flush cacheline 0 and 1)
- Start DMA to buffer B
- Variable A = 1;
- Check if DMA finished, it did
- Invalidate buffer B ... oh, but it's unaligned, let's invalidate everything around it, which is cacheline 0 and 1.
- What is the value of variable A ? Oh, it's fetched from memory and it's 0 there, even though we did set it to 1 ...
I would suggest that, with the best of my knowledge, please change the range check to a debug probe, and restore the cache flush functions to the common definition.
See above, does my example make it clear why we should never ever hide bugs in the cache ops code ?
It is the drivers' responsibility to follow the guide line above. If there is such a bug, it is not the cache flush ops bug. It is a driver's bug. You may add a probe to show the bug from caller, but you may not call it a bug of cache ops and skip the flush. Given that it is quite common that the return of such cache ops is not checked, few (if not none) will ever know that the flush was skipped.
Best regards, Thomas

On Monday, October 12, 2015 at 02:34:16 AM, Thomas Chou wrote:
Hi Marek,
Hi!
On 10/11/2015 08:15 PM, Marek Vasut wrote:
On Sunday, October 11, 2015 at 02:38:35 AM, Thomas Chou wrote:
Hi Marek,
Hi,
On 10/11/2015 02:18 AM, Marek Vasut wrote:
Then you'd also need means to allocate variables to aligned memory location to prevent invalid cache flush. (Linux does this with it's DMA API). We are much simpler and thus this abstraction is still not available. I wonder if the overhead of DMA API would be high or not for U-Boot.
I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to allocate DMA buffers so that they are cache aligned.
That and include/memalign.h , which contains macros that are used to align variables.
Yes. It is malloc_cache_aligned(), which should be used to allocate DMA buffer. Thank you for the pointer.
There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER() macros which can be used to allocate such stuff on stack. And you sometimes do want to allocate things on stack instead of using malloc().
It is even worse if the cache flush operators permit incorrect cache flushes or invalidations. Like I mentioned before, this can lead to hard to debug problems when using DMA (at least on ARM).
I would suggest debug check should be left as for debug only. The definition of common functions should be kept as it is more important than coding style.
Uh yes, that's what arm926 cache functions do, they're debug only.
I debugged DMA issues a lot in the past until I realized the importance of aligned buffers. So there should be a developer's guideline.
For what exactly?
For u-boot, every DMA buffer must be allocated with malloc_cache_aligned(). Then there will be not variables and DMA buffers cache racing issues as you describe below.
Sometimes you might want to allocate DMA buffers on stack, for example if you don't have mallocator running yet or if it's more convenient for some reason. So forcing everyone to allocate DMA buffers using malloc is not gonna slide I'm afraid.
But it is even much more difficult when something you believed does not work as expected, what is taken as common sense. It will trap a lot of developers when they called your flush cache functions but was skipped just because, eg, the end of packets are not aligned which is usually the case.
This is good, it should bite them, because this is a bug. If, on the other hand, you will paper over such bugs by adding crap to the cache ops, there will be even worse bugs coming for you, like variables which are sitting in the same cacheline as your unaligned buffer that you want to invalidate or flush will possibly get trashed by such cache operation.
Consider this:
cacheline 0: [ variable A ; buffer B ......... ] cacheline 1: [ buffer B ......... ; Empty .... ]
Now you do the following:
Variable A = 0;
Flush buffer B (which is unaligned, so flush cacheline 0 and 1)
Start DMA to buffer B
Variable A = 1;
Check if DMA finished, it did
Invalidate buffer B ... oh, but it's unaligned, let's invalidate
everything around it, which is cacheline 0 and 1.
What is the value of variable A ? Oh, it's fetched from memory and
it's 0 there, even though we did set it to 1 ...
I would suggest that, with the best of my knowledge, please change the range check to a debug probe, and restore the cache flush functions to the common definition.
See above, does my example make it clear why we should never ever hide bugs in the cache ops code ?
It is the drivers' responsibility to follow the guide line above. If there is such a bug, it is not the cache flush ops bug. It is a driver's bug. You may add a probe to show the bug from caller, but you may not call it a bug of cache ops and skip the flush. Given that it is quite common that the return of such cache ops is not checked, few (if not none) will ever know that the flush was skipped.
The cache flush ops is the best place to scream death and murder if someone tries such unaligned cache operation, so maybe you should even do a printf() there to weed such crappy drivers out for the 2016.01 release.
I agree it's the responsibility of the driver, so if the driver doesn't do things right, it's a bug and the behavior of cache ops is undefined, which might as well be that we do the safer thing here and flush nothing.
Best regards, Marek Vasut

Hi Marek,
On 10/12/2015 06:30 PM, Marek Vasut wrote:
There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER() macros which can be used to allocate such stuff on stack. And you sometimes do want to allocate things on stack instead of using malloc().
Thanks for sharing this.
Sometimes you might want to allocate DMA buffers on stack, for example if you don't have mallocator running yet or if it's more convenient for some reason. So forcing everyone to allocate DMA buffers using malloc is not gonna slide I'm afraid.
The same rule can be applied to buffer allocated on stack, with the macro you mentioned above. In all, cache line aware allocation on heap or on stack must be used for DMA buffer.
The cache flush ops is the best place to scream death and murder if someone tries such unaligned cache operation, so maybe you should even do a printf() there to weed such crappy drivers out for the 2016.01 release.
I agree it's the responsibility of the driver, so if the driver doesn't do things right, it's a bug and the behavior of cache ops is undefined, which might as well be that we do the safer thing here and flush nothing.
It won't be safer to flush nothing. Sooner or later the cache will be flushed due to data access, even if the cache flush ops is skip.
To solve problem like this, the only solution is to enforce the rule to allocate DMA buffer. It is wrong to skip the flush.
Best regards, Thomas

On Monday, October 12, 2015 at 03:12:18 PM, Thomas Chou wrote:
Hi Marek,
On 10/12/2015 06:30 PM, Marek Vasut wrote:
There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER() macros which can be used to allocate such stuff on stack. And you sometimes do want to allocate things on stack instead of using malloc().
Thanks for sharing this.
Sometimes you might want to allocate DMA buffers on stack, for example if you don't have mallocator running yet or if it's more convenient for some reason. So forcing everyone to allocate DMA buffers using malloc is not gonna slide I'm afraid.
The same rule can be applied to buffer allocated on stack, with the macro you mentioned above. In all, cache line aware allocation on heap or on stack must be used for DMA buffer.
That's correct, they must be used. But sadly, this is not yet the case in all the drivers, which we need to rectify. And how best to rectify this than to scream when someone does such a thing, right ?
The cache flush ops is the best place to scream death and murder if someone tries such unaligned cache operation, so maybe you should even do a printf() there to weed such crappy drivers out for the 2016.01 release.
I agree it's the responsibility of the driver, so if the driver doesn't do things right, it's a bug and the behavior of cache ops is undefined, which might as well be that we do the safer thing here and flush nothing.
It won't be safer to flush nothing. Sooner or later the cache will be flushed due to data access, even if the cache flush ops is skip.
That is bad bad bad, that's even nastier. We really need to fix the drivers, not paper over it in the cache ops.
To solve problem like this, the only solution is to enforce the rule to allocate DMA buffer. It is wrong to skip the flush.
I absolutelly agree we need aligned allocations for DMA memory areas. But, we also shouldn't hide bugs. And I believe aligning the incorrect arguments to cache functions is not the way to go. We should check the arguments and if someone tries an unaligned cache op, we should scream. What do you think?
btw. I think you won't get way too many cache warnings nowadays and we can fix those few remaining way before the 2016.01 is out.
Best regards, Marek Vasut

Dear Marek,
In message 201510121529.45730.marex@denx.de you wrote:
That is bad bad bad, that's even nastier. We really need to fix the drivers, not paper over it in the cache ops.
Full ACK here.
To solve problem like this, the only solution is to enforce the rule to allocate DMA buffer. It is wrong to skip the flush.
I absolutelly agree we need aligned allocations for DMA memory areas. But, we also shouldn't hide bugs. And I believe aligning the incorrect arguments to cache functions is not the way to go. We should check the arguments and if someone tries an unaligned cache op, we should scream. What do you think?
Again, full ACK.
We should make sure we get clear, unmistakable error messages for such bugs, and not silent non-deterministic behaviour.
Best regards,
Wolfgang Denk

Hi Marek,
On 10/12/2015 09:29 PM, Marek Vasut wrote:
On Monday, October 12, 2015 at 03:12:18 PM, Thomas Chou wrote:
Hi Marek,
On 10/12/2015 06:30 PM, Marek Vasut wrote:
There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER() macros which can be used to allocate such stuff on stack. And you sometimes do want to allocate things on stack instead of using malloc().
Thanks for sharing this.
Sometimes you might want to allocate DMA buffers on stack, for example if you don't have mallocator running yet or if it's more convenient for some reason. So forcing everyone to allocate DMA buffers using malloc is not gonna slide I'm afraid.
The same rule can be applied to buffer allocated on stack, with the macro you mentioned above. In all, cache line aware allocation on heap or on stack must be used for DMA buffer.
That's correct, they must be used. But sadly, this is not yet the case in all the drivers, which we need to rectify. And how best to rectify this than to scream when someone does such a thing, right ?
Given that there are not so many drivers using DMA in u-boot, as compared to Linux. I would suggest we can walk through and rectify the allocation of DMA buffers.
The cache flush ops is the best place to scream death and murder if someone tries such unaligned cache operation, so maybe you should even do a printf() there to weed such crappy drivers out for the 2016.01 release.
I agree it's the responsibility of the driver, so if the driver doesn't do things right, it's a bug and the behavior of cache ops is undefined, which might as well be that we do the safer thing here and flush nothing.
It won't be safer to flush nothing. Sooner or later the cache will be flushed due to data access, even if the cache flush ops is skip.
That is bad bad bad, that's even nastier. We really need to fix the drivers, not paper over it in the cache ops.
I know how bad it is, with over 35 years work with DMA. grin..
To solve problem like this, the only solution is to enforce the rule to allocate DMA buffer. It is wrong to skip the flush.
I absolutelly agree we need aligned allocations for DMA memory areas. But, we also shouldn't hide bugs. And I believe aligning the incorrect arguments to cache functions is not the way to go. We should check the arguments and if someone tries an unaligned cache op, we should scream. What do you think?
btw. I think you won't get way too many cache warnings nowadays and we can fix those few remaining way before the 2016.01 is out.
I would suggest the "cache alignment check and skip" be removed from cache flush ops, and say out the DMA buffer allocation rule loudly in README, and enforce it by guardianship. Please allow me to restate the reasons,
1. The cache flush ops are commonly used. Please refer to the "Cache and TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt. Violating the defined interface is much worse than violating coding style. It will certainly impact the portability of u-boot. And might introduce more bug than resolve.
2. We all agree that enforcing DMA buffer allocation to cache aligned is the only real solution. Adding such "check and skip" to cache flush ops cannot prevent the flush or solve the problem.
3. Though the flush size of block device are usually aligned, the size of packet are not. Asking the packet drivers to adjust the flush size does not make sense. It is the job of cache flush ops. The debug probe should not override the original purpose. It should be spelled for common understanding.
It is free to your consideration. As it is free and open software. :)
Best regards, Thomas

On Tuesday, October 13, 2015 at 03:04:44 AM, Thomas Chou wrote:
Hi Marek,
Hi!
On 10/12/2015 09:29 PM, Marek Vasut wrote:
On Monday, October 12, 2015 at 03:12:18 PM, Thomas Chou wrote:
Hi Marek,
On 10/12/2015 06:30 PM, Marek Vasut wrote:
There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER() macros which can be used to allocate such stuff on stack. And you sometimes do want to allocate things on stack instead of using malloc().
Thanks for sharing this.
Sometimes you might want to allocate DMA buffers on stack, for example if you don't have mallocator running yet or if it's more convenient for some reason. So forcing everyone to allocate DMA buffers using malloc is not gonna slide I'm afraid.
The same rule can be applied to buffer allocated on stack, with the macro you mentioned above. In all, cache line aware allocation on heap or on stack must be used for DMA buffer.
That's correct, they must be used. But sadly, this is not yet the case in all the drivers, which we need to rectify. And how best to rectify this than to scream when someone does such a thing, right ?
Given that there are not so many drivers using DMA in u-boot, as compared to Linux. I would suggest we can walk through and rectify the allocation of DMA buffers.
That's what we're pretty much trying to do -- fix all the DMA-using drivers to behave correctly.
The cache flush ops is the best place to scream death and murder if someone tries such unaligned cache operation, so maybe you should even do a printf() there to weed such crappy drivers out for the 2016.01 release.
I agree it's the responsibility of the driver, so if the driver doesn't do things right, it's a bug and the behavior of cache ops is undefined, which might as well be that we do the safer thing here and flush nothing.
It won't be safer to flush nothing. Sooner or later the cache will be flushed due to data access, even if the cache flush ops is skip.
That is bad bad bad, that's even nastier. We really need to fix the drivers, not paper over it in the cache ops.
I know how bad it is, with over 35 years work with DMA. grin..
I can feel your pain here ;-/
To solve problem like this, the only solution is to enforce the rule to allocate DMA buffer. It is wrong to skip the flush.
I absolutelly agree we need aligned allocations for DMA memory areas. But, we also shouldn't hide bugs. And I believe aligning the incorrect arguments to cache functions is not the way to go. We should check the arguments and if someone tries an unaligned cache op, we should scream. What do you think?
btw. I think you won't get way too many cache warnings nowadays and we can fix those few remaining way before the 2016.01 is out.
I would suggest the "cache alignment check and skip" be removed from cache flush ops, and say out the DMA buffer allocation rule loudly in README, and enforce it by guardianship.
What exactly do you envision by this "guardianship" ?
Please allow me to restate the reasons,
- The cache flush ops are commonly used. Please refer to the "Cache and
TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt. Violating the defined interface is much worse than violating coding style. It will certainly impact the portability of u-boot. And might introduce more bug than resolve.
I agree with this one.
- We all agree that enforcing DMA buffer allocation to cache aligned is
the only real solution. Adding such "check and skip" to cache flush ops cannot prevent the flush or solve the problem.
We should probably check-scream-skip here.
- Though the flush size of block device are usually aligned, the size
of packet are not. Asking the packet drivers to adjust the flush size does not make sense. It is the job of cache flush ops. The debug probe should not override the original purpose. It should be spelled for common understanding.
The socket buffer(s) should be aligned, so network packets should be fine.
It is free to your consideration. As it is free and open software. :)
Best regards, Thomas

Hi Marek,
On 10/17/2015 07:03 AM, Marek Vasut wrote:
I would suggest the "cache alignment check and skip" be removed from cache flush ops, and say out the DMA buffer allocation rule loudly in README, and enforce it by guardianship.
What exactly do you envision by this "guardianship" ?
I mean the reviews of custodians.
Please allow me to restate the reasons,
- The cache flush ops are commonly used. Please refer to the "Cache and
TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt. Violating the defined interface is much worse than violating coding style. It will certainly impact the portability of u-boot. And might introduce more bug than resolve.
I agree with this one.
- We all agree that enforcing DMA buffer allocation to cache aligned is
the only real solution. Adding such "check and skip" to cache flush ops cannot prevent the flush or solve the problem.
We should probably check-scream-skip here.
- Though the flush size of block device are usually aligned, the size
of packet are not. Asking the packet drivers to adjust the flush size does not make sense. It is the job of cache flush ops. The debug probe should not override the original purpose. It should be spelled for common understanding.
The socket buffer(s) should be aligned, so network packets should be fine.
While the start of socket buffer might be aligned, the size of the transfer might not for the send ops. It is depended on the net/tcp/ip packets size.
For example, with tftp, there is a lot of unaligned end of packets.
tftp d1000000 u-boot-dtb.bin
flush unaligned d7ff7020-d7ff704e [repeat ..]
So, such an alarm may be false. And such a skip can be bug.
In fact, for my own projects, I have changed the memory allocation to always cache aligned. And I rarely worry about it ever after.
I look at the net.c of u-boot. There are packets buffer allocated on BSS and stack. I would suggest avoid such programming, and use aligned memory allocation stead.
Best regards, Thomas

On Saturday, October 17, 2015 at 05:22:41 AM, Thomas Chou wrote:
Hi Marek,
Hi!
On 10/17/2015 07:03 AM, Marek Vasut wrote:
I would suggest the "cache alignment check and skip" be removed from cache flush ops, and say out the DMA buffer allocation rule loudly in README, and enforce it by guardianship.
What exactly do you envision by this "guardianship" ?
I mean the reviews of custodians.
Wouldn't an automated check be better ? It's easier and costs almost nothing. Besides, custodians are not perfect and cannot detect all the issues.
Please allow me to restate the reasons,
- The cache flush ops are commonly used. Please refer to the "Cache and
TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt. Violating the defined interface is much worse than violating coding style. It will certainly impact the portability of u-boot. And might introduce more bug than resolve.
I agree with this one.
- We all agree that enforcing DMA buffer allocation to cache aligned is
the only real solution. Adding such "check and skip" to cache flush ops cannot prevent the flush or solve the problem.
We should probably check-scream-skip here.
- Though the flush size of block device are usually aligned, the size
of packet are not. Asking the packet drivers to adjust the flush size does not make sense. It is the job of cache flush ops. The debug probe should not override the original purpose. It should be spelled for common understanding.
The socket buffer(s) should be aligned, so network packets should be fine.
While the start of socket buffer might be aligned, the size of the transfer might not for the send ops. It is depended on the net/tcp/ip packets size.
Aurgh :-( Now I see what you mean. This is purely bad, very bad. Here is a real possibility for corruption of variables close to the allocated DMA buffer, right ?
For example, with tftp, there is a lot of unaligned end of packets.
tftp d1000000 u-boot-dtb.bin
flush unaligned d7ff7020-d7ff704e [repeat ..]
So, such an alarm may be false. And such a skip can be bug.
In fact, for my own projects, I have changed the memory allocation to always cache aligned. And I rarely worry about it ever after.
I look at the net.c of u-boot. There are packets buffer allocated on BSS and stack. I would suggest avoid such programming, and use aligned memory allocation stead.
The stack allocation there is used because it's slightly faster and you don't need mallocator for that. I guess this is a topic for a broader discussion and we should include Tom and others into it. Would you mind starting another thread on the ML and CCing me, Tom Rini, Simon Glass etc please ?
Best regards, Marek Vasut

On Saturday, October 10, 2015 at 07:55:45 AM, Thomas Chou wrote:
Hi Marek,
Hi,
On 10/09/2015 10:42 PM, Marek Vasut wrote:
In nios2, we don't skip the flushing when the inputs are not aligned like that of arm926ejs. We always flush all cache lines in the range, even if a single byte to flush is in request. So the inputs are rounded to get the lower and upper cache lines range inside the cache flush functions. The caller need not be aware of the detail.
This is incorrect and all the places which produce these unaligned cache operations must be fixed.
I take a look into the cache flush operations in every arch of u-boot. It turns out that the arm926ejs is the only platform that does such cache line range check and skip. All other ARM and all other arch don't.
Yes, everyone else doesn't do the checks and if there is unaligned cache flush/invalidation, that platform also suffers from various obscure and hard to debug errors. I submitted patch to add the same check for ARMv7, but it didn't receive attention :-( I should repost it I guess.
And the cache flush in Linux don't.
Because the buffers there are always correctly aligned during allocation.
Best regards, Marek Vasut

On Friday, October 09, 2015 at 04:49:03 AM, Ley Foon Tan wrote:
On Fri, Oct 9, 2015 at 5:39 AM, Marek Vasut marex@denx.de wrote:
+DECLARE_GLOBAL_DATA_PTR;
+static void __flush_dcache_all(unsigned long start, unsigned long end) +{
unsigned long addr;
start &= ~(gd->arch.dcache_line_size - 1);
I'd suggest to use GENMASK() here, but I don't think we picked this from Linux just yet.
end += (gd->arch.dcache_line_size - 1);
end &= ~(gd->arch.dcache_line_size - 1);
Is this an attempt at poor-mans' rounding ? I think you want to implment something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT do any rounding here. The reason for that is that if you do rounding, you might accidentally corrupt a piece of memory which was just delivered via DMA before you did the flush.
The code above is to convert the address to dcache line size.
So it's aligning the unaligned accesses, correct?
arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache flushing if it is unaligned to cache line size.
That's right, unaligned cache flushes/invalidations should not happen, ever.
I'm not sure how frequent U-boot access to non-aligned cache line size.
They must not ever happen, if some driver does them, the driver needs to be fixed.
Best regards, Marek Vasut

Convert cache flush to use dm cpu data.
The original cache flush functions are written in assembly and use CONFIG_SYS_{I,D}CACHE_SIZE... macros. It is difficult to convert to use cache configuration in dm cpu data which is extracted from device tree.
The cacheflush.c of Linux nios2 arch uses cpuinfo structure, which is very close to our dm cpu data. So we copy and modify it to arch/nios2/lib/cache.c to replace the old cache.S.
Signed-off-by: Thomas Chou thomas@wytron.com.tw --- v2 change commit message.
arch/nios2/include/asm/cache.h | 13 ++------ arch/nios2/lib/bootm.c | 6 +--- arch/nios2/lib/cache.S | 68 ------------------------------------------ arch/nios2/lib/cache.c | 68 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 83 deletions(-) delete mode 100644 arch/nios2/lib/cache.S create mode 100644 arch/nios2/lib/cache.c
diff --git a/arch/nios2/include/asm/cache.h b/arch/nios2/include/asm/cache.h index 9b87c9f..dde43cd 100644 --- a/arch/nios2/include/asm/cache.h +++ b/arch/nios2/include/asm/cache.h @@ -8,18 +8,11 @@ #ifndef __ASM_NIOS2_CACHE_H_ #define __ASM_NIOS2_CACHE_H_
-extern void flush_dcache (unsigned long start, unsigned long size); -extern void flush_icache (unsigned long start, unsigned long size); - /* - * Valid L1 data cache line sizes for the NIOS2 architecture are 4, 16, and 32 - * bytes. If the board configuration has not specified one we default to the - * largest of these values for alignment of DMA buffers. + * Valid L1 data cache line sizes for the NIOS2 architecture are 4, + * 16, and 32 bytes. We default to the largest of these values for + * alignment of DMA buffers. */ -#ifdef CONFIG_SYS_CACHELINE_SIZE -#define ARCH_DMA_MINALIGN CONFIG_SYS_CACHELINE_SIZE -#else #define ARCH_DMA_MINALIGN 32 -#endif
#endif /* __ASM_NIOS2_CACHE_H_ */ diff --git a/arch/nios2/lib/bootm.c b/arch/nios2/lib/bootm.c index c730a3f..4e5c269 100644 --- a/arch/nios2/lib/bootm.c +++ b/arch/nios2/lib/bootm.c @@ -6,9 +6,6 @@ */
#include <common.h> -#include <command.h> -#include <asm/byteorder.h> -#include <asm/cache.h>
#define NIOS_MAGIC 0x534f494e /* enable command line and initrd passing */
@@ -40,8 +37,7 @@ int do_bootm_linux(int flag, int argc, char * const argv[], bootm_headers_t *ima
/* flushes data and instruction caches before calling the kernel */ disable_interrupts(); - flush_dcache((ulong)kernel, CONFIG_SYS_DCACHE_SIZE); - flush_icache((ulong)kernel, CONFIG_SYS_ICACHE_SIZE); + flush_dcache_all();
debug("bootargs=%s @ 0x%lx\n", commandline, (ulong)&commandline); debug("initrd=0x%lx-0x%lx\n", (ulong)initrd_start, (ulong)initrd_end); diff --git a/arch/nios2/lib/cache.S b/arch/nios2/lib/cache.S deleted file mode 100644 index 683f005..0000000 --- a/arch/nios2/lib/cache.S +++ /dev/null @@ -1,68 +0,0 @@ -/* - * (C) Copyright 2004, Psyent Corporation <www.psyent.com> - * Scott McNutt smcnutt@psyent.com - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -#include <config.h> - - .text - - .global flush_dcache - -flush_dcache: - add r5, r5, r4 - movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE) -0: flushd 0(r4) - add r4, r4, r8 - bltu r4, r5, 0b - ret - - - .global flush_icache - -flush_icache: - add r5, r5, r4 - movhi r8, %hi(CONFIG_SYS_ICACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE) -1: flushi r4 - add r4, r4, r8 - bltu r4, r5, 1b - ret - - .global flush_dcache_range - -flush_dcache_range: - movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE) -0: flushd 0(r4) - add r4, r4, r8 - bltu r4, r5, 0b - ret - - .global flush_cache - -flush_cache: - add r5, r5, r4 - mov r9, r4 - mov r10, r5 - - movhi r8, %hi(CONFIG_SYS_DCACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE) -0: flushd 0(r4) - add r4, r4, r8 - bltu r4, r5, 0b - - mov r4, r9 - mov r5, r10 - movhi r8, %hi(CONFIG_SYS_ICACHELINE_SIZE) - ori r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE) -1: flushi r4 - add r4, r4, r8 - bltu r4, r5, 1b - - sync - flushp - ret diff --git a/arch/nios2/lib/cache.c b/arch/nios2/lib/cache.c new file mode 100644 index 0000000..3e154e8 --- /dev/null +++ b/arch/nios2/lib/cache.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2015 Thomas Chou thomas@wytron.com.tw + * Copyright (C) 2009, Wind River Systems Inc + * Implemented by fredrik.markstrom@gmail.com and ivarholmqvist@gmail.com + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#include <common.h> +#include <asm/cache.h> + +DECLARE_GLOBAL_DATA_PTR; + +static void __flush_dcache_all(unsigned long start, unsigned long end) +{ + unsigned long addr; + + start &= ~(gd->arch.dcache_line_size - 1); + end += (gd->arch.dcache_line_size - 1); + end &= ~(gd->arch.dcache_line_size - 1); + + if (end > start + gd->arch.dcache_size) + end = start + gd->arch.dcache_size; + + for (addr = start; addr < end; addr += gd->arch.dcache_line_size) { + __asm__ __volatile__ (" flushd 0(%0)\n" + : /* Outputs */ + : /* Inputs */ "r"(addr) + /* : No clobber */); + } +} + +static void __flush_icache(unsigned long start, unsigned long end) +{ + unsigned long addr; + + start &= ~(gd->arch.icache_line_size - 1); + end += (gd->arch.icache_line_size - 1); + end &= ~(gd->arch.icache_line_size - 1); + + if (end > start + gd->arch.icache_size) + end = start + gd->arch.icache_size; + + for (addr = start; addr < end; addr += gd->arch.icache_line_size) { + __asm__ __volatile__ (" flushi %0\n" + : /* Outputs */ + : /* Inputs */ "r"(addr) + /* : No clobber */); + } + __asm__ __volatile(" flushp\n"); +} + +void flush_dcache_all(void) +{ + __flush_dcache_all(0, gd->arch.dcache_size); + __flush_icache(0, gd->arch.icache_size); +} + +void flush_dcache_range(unsigned long start, unsigned long end) +{ + __flush_dcache_all(start, end); +} + +void flush_cache(unsigned long start, unsigned long size) +{ + __flush_dcache_all(start, start + size); + __flush_icache(start, start + size); +}
participants (4)
-
Ley Foon Tan
-
Marek Vasut
-
Thomas Chou
-
Wolfgang Denk