[U-Boot] [PATCH v2 0/6] dcache support for Raspberry Pi 1

This patchset enables dcache support for Raspberry Pi 1. First, the cache support code for arm1136 and arm1176 is merged. CONFIG_SYS_CACHELINE_SIZE is defined as 32 bytes and is used as the alignment for mailbox buffer allocations. The rpi mailbox code now flushes the dcache before sending a mailbox request and invalidates it after receiving the mailbox response. Finally, the CONFIG_SYS_DCACHE_OFF switch is removed from the rpi1 config. It is still set for the rpi2 config.
dcache support increases the MMC read performance on Raspberry Pi 1 from 5.4 MiB/s to 12.3 MiB/s.
This was tested by the following command:
fatload mmc 0:1 ${kernel_addr_r} zImage
Changes in v2: * Merge arm1136/1176 cache code * Use cacheline size as mailbox buffer alignment * Flush/invalidate mailbox buffer up to cacheline size
Alexander Stein (6): arm1136: Remove dead code arm1136/arm1176: Merge cache handling code ARM: bcm283x: Define CONFIG_SYS_CACHELINE_SIZE ARM: bcm283x: Allocate all mailbox buffers cacheline aligned arm/mach-bcm283x/mbox: Flush and invalidate dcache when using fw mailbox arm/rpi: Enable dcache
arch/arm/cpu/arm11/Makefile | 8 ++ arch/arm/cpu/arm11/cpu.c | 150 ++++++++++++++++++++++++++++ arch/arm/cpu/arm1136/Makefile | 1 - arch/arm/cpu/arm1136/cpu.c | 160 ------------------------------ arch/arm/cpu/arm1176/Makefile | 4 +- arch/arm/cpu/arm1176/cpu.c | 51 ---------- arch/arm/mach-bcm283x/include/mach/mbox.h | 4 + arch/arm/mach-bcm283x/mbox.c | 9 ++ board/raspberrypi/rpi/rpi.c | 10 +- drivers/video/bcm2835.c | 4 +- include/configs/rpi-common.h | 2 +- include/configs/rpi_2.h | 1 + 12 files changed, 183 insertions(+), 221 deletions(-) create mode 100644 arch/arm/cpu/arm11/Makefile create mode 100644 arch/arm/cpu/arm11/cpu.c delete mode 100644 arch/arm/cpu/arm1136/cpu.c delete mode 100644 arch/arm/cpu/arm1176/cpu.c

Apparently lcd_panel_disable is not defined anywhere, so no config for an arm1136 board would have set CONFIG_LCD. Remove the unused code.
Signed-off-by: Alexander Stein alexanders83@web.de --- arch/arm/cpu/arm1136/cpu.c | 10 ---------- 1 file changed, 10 deletions(-)
diff --git a/arch/arm/cpu/arm1136/cpu.c b/arch/arm/cpu/arm1136/cpu.c index a7aed4b..5d4b3c2 100644 --- a/arch/arm/cpu/arm1136/cpu.c +++ b/arch/arm/cpu/arm1136/cpu.c @@ -32,16 +32,6 @@ int cleanup_before_linux (void)
disable_interrupts ();
-#ifdef CONFIG_LCD - { - extern void lcd_disable(void); - extern void lcd_panel_disable(void); - - lcd_disable(); /* proper disable of lcd & panel */ - lcd_panel_disable(); - } -#endif - /* turn off I/D-cache */ icache_disable(); dcache_disable();

As both cores are similar, merge the cache handling code for both CPUs into the arm11 directory.
Signed-off-by: Alexander Stein alexanders83@web.de --- arch/arm/cpu/arm11/Makefile | 8 +++ arch/arm/cpu/arm11/cpu.c | 150 ++++++++++++++++++++++++++++++++++++++++++ arch/arm/cpu/arm1136/Makefile | 1 - arch/arm/cpu/arm1136/cpu.c | 150 ------------------------------------------ arch/arm/cpu/arm1176/Makefile | 4 +- arch/arm/cpu/arm1176/cpu.c | 51 -------------- 6 files changed, 161 insertions(+), 203 deletions(-) create mode 100644 arch/arm/cpu/arm11/Makefile create mode 100644 arch/arm/cpu/arm11/cpu.c delete mode 100644 arch/arm/cpu/arm1136/cpu.c delete mode 100644 arch/arm/cpu/arm1176/cpu.c
diff --git a/arch/arm/cpu/arm11/Makefile b/arch/arm/cpu/arm11/Makefile new file mode 100644 index 0000000..2379b0f --- /dev/null +++ b/arch/arm/cpu/arm11/Makefile @@ -0,0 +1,8 @@ +# +# (C) Copyright 2000-2006 +# Wolfgang Denk, DENX Software Engineering, wd@denx.de. +# +# SPDX-License-Identifier: GPL-2.0+ +# + +obj-y = cpu.o diff --git a/arch/arm/cpu/arm11/cpu.c b/arch/arm/cpu/arm11/cpu.c new file mode 100644 index 0000000..5d4b3c2 --- /dev/null +++ b/arch/arm/cpu/arm11/cpu.c @@ -0,0 +1,150 @@ +/* + * (C) Copyright 2004 Texas Insturments + * + * (C) Copyright 2002 + * Sysgo Real-Time Solutions, GmbH <www.elinos.com> + * Marius Groeger mgroeger@sysgo.de + * + * (C) Copyright 2002 + * Gary Jennejohn, DENX Software Engineering, garyj@denx.de + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +/* + * CPU specific code + */ + +#include <common.h> +#include <command.h> +#include <asm/system.h> + +static void cache_flush(void); + +int cleanup_before_linux (void) +{ + /* + * this function is called just before we call linux + * it prepares the processor for linux + * + * we turn off caches etc ... 
+ */ + + disable_interrupts (); + + /* turn off I/D-cache */ + icache_disable(); + dcache_disable(); + /* flush I/D-cache */ + cache_flush(); + + return 0; +} + +static void cache_flush(void) +{ + unsigned long i = 0; + /* clean entire data cache */ + asm volatile("mcr p15, 0, %0, c7, c10, 0" : : "r" (i)); + /* invalidate both caches and flush btb */ + asm volatile("mcr p15, 0, %0, c7, c7, 0" : : "r" (i)); + /* mem barrier to sync things */ + asm volatile("mcr p15, 0, %0, c7, c10, 4" : : "r" (i)); +} + +#ifndef CONFIG_SYS_DCACHE_OFF + +#ifndef CONFIG_SYS_CACHELINE_SIZE +#define CONFIG_SYS_CACHELINE_SIZE 32 +#endif + +void invalidate_dcache_all(void) +{ + asm volatile("mcr p15, 0, %0, c7, c6, 0" : : "r" (0)); +} + +void flush_dcache_all(void) +{ + asm volatile("mcr p15, 0, %0, c7, c10, 0" : : "r" (0)); + asm volatile("mcr p15, 0, %0, c7, c10, 4" : : "r" (0)); +} + +static int check_cache_range(unsigned long start, unsigned long stop) +{ + int ok = 1; + + if (start & (CONFIG_SYS_CACHELINE_SIZE - 1)) + ok = 0; + + if (stop & (CONFIG_SYS_CACHELINE_SIZE - 1)) + ok = 0; + + if (!ok) + debug("CACHE: Misaligned operation at range [%08lx, %08lx]\n", + start, stop); + + return ok; +} + +void invalidate_dcache_range(unsigned long start, unsigned long stop) +{ + if (!check_cache_range(start, stop)) + return; + + while (start < stop) { + asm volatile("mcr p15, 0, %0, c7, c6, 1" : : "r" (start)); + start += CONFIG_SYS_CACHELINE_SIZE; + } +} + +void flush_dcache_range(unsigned long start, unsigned long stop) +{ + if (!check_cache_range(start, stop)) + return; + + while (start < stop) { + asm volatile("mcr p15, 0, %0, c7, c14, 1" : : "r" (start)); + start += CONFIG_SYS_CACHELINE_SIZE; + } + + asm volatile("mcr p15, 0, %0, c7, c10, 4" : : "r" (0)); +} + +void flush_cache(unsigned long start, unsigned long size) +{ + flush_dcache_range(start, start + size); +} + +#else /* #ifndef CONFIG_SYS_DCACHE_OFF */ +void invalidate_dcache_all(void) +{ +} + +void flush_dcache_all(void) +{ +} + 
+void invalidate_dcache_range(unsigned long start, unsigned long stop) +{ +} + +void flush_dcache_range(unsigned long start, unsigned long stop) +{ +} + +void flush_cache(unsigned long start, unsigned long size) +{ +} +#endif /* #ifndef CONFIG_SYS_DCACHE_OFF */ + +#if !defined(CONFIG_SYS_ICACHE_OFF) || !defined(CONFIG_SYS_DCACHE_OFF) +void enable_caches(void) +{ +#ifndef CONFIG_SYS_ICACHE_OFF + icache_enable(); +#endif +#ifndef CONFIG_SYS_DCACHE_OFF + dcache_enable(); +#endif +} +#endif diff --git a/arch/arm/cpu/arm1136/Makefile b/arch/arm/cpu/arm1136/Makefile index 56a9390..5d6f0aa 100644 --- a/arch/arm/cpu/arm1136/Makefile +++ b/arch/arm/cpu/arm1136/Makefile @@ -6,7 +6,6 @@ #
extra-y = start.o -obj-y = cpu.o
obj-$(CONFIG_MX31) += mx31/ obj-$(CONFIG_MX35) += mx35/ diff --git a/arch/arm/cpu/arm1136/cpu.c b/arch/arm/cpu/arm1136/cpu.c deleted file mode 100644 index 5d4b3c2..0000000 --- a/arch/arm/cpu/arm1136/cpu.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * (C) Copyright 2004 Texas Insturments - * - * (C) Copyright 2002 - * Sysgo Real-Time Solutions, GmbH <www.elinos.com> - * Marius Groeger mgroeger@sysgo.de - * - * (C) Copyright 2002 - * Gary Jennejohn, DENX Software Engineering, garyj@denx.de - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -/* - * CPU specific code - */ - -#include <common.h> -#include <command.h> -#include <asm/system.h> - -static void cache_flush(void); - -int cleanup_before_linux (void) -{ - /* - * this function is called just before we call linux - * it prepares the processor for linux - * - * we turn off caches etc ... - */ - - disable_interrupts (); - - /* turn off I/D-cache */ - icache_disable(); - dcache_disable(); - /* flush I/D-cache */ - cache_flush(); - - return 0; -} - -static void cache_flush(void) -{ - unsigned long i = 0; - /* clean entire data cache */ - asm volatile("mcr p15, 0, %0, c7, c10, 0" : : "r" (i)); - /* invalidate both caches and flush btb */ - asm volatile("mcr p15, 0, %0, c7, c7, 0" : : "r" (i)); - /* mem barrier to sync things */ - asm volatile("mcr p15, 0, %0, c7, c10, 4" : : "r" (i)); -} - -#ifndef CONFIG_SYS_DCACHE_OFF - -#ifndef CONFIG_SYS_CACHELINE_SIZE -#define CONFIG_SYS_CACHELINE_SIZE 32 -#endif - -void invalidate_dcache_all(void) -{ - asm volatile("mcr p15, 0, %0, c7, c6, 0" : : "r" (0)); -} - -void flush_dcache_all(void) -{ - asm volatile("mcr p15, 0, %0, c7, c10, 0" : : "r" (0)); - asm volatile("mcr p15, 0, %0, c7, c10, 4" : : "r" (0)); -} - -static int check_cache_range(unsigned long start, unsigned long stop) -{ - int ok = 1; - - if (start & (CONFIG_SYS_CACHELINE_SIZE - 1)) - ok = 0; - - if (stop & (CONFIG_SYS_CACHELINE_SIZE - 1)) - ok = 0; - - if (!ok) - debug("CACHE: Misaligned operation at range [%08lx, 
%08lx]\n", - start, stop); - - return ok; -} - -void invalidate_dcache_range(unsigned long start, unsigned long stop) -{ - if (!check_cache_range(start, stop)) - return; - - while (start < stop) { - asm volatile("mcr p15, 0, %0, c7, c6, 1" : : "r" (start)); - start += CONFIG_SYS_CACHELINE_SIZE; - } -} - -void flush_dcache_range(unsigned long start, unsigned long stop) -{ - if (!check_cache_range(start, stop)) - return; - - while (start < stop) { - asm volatile("mcr p15, 0, %0, c7, c14, 1" : : "r" (start)); - start += CONFIG_SYS_CACHELINE_SIZE; - } - - asm volatile("mcr p15, 0, %0, c7, c10, 4" : : "r" (0)); -} - -void flush_cache(unsigned long start, unsigned long size) -{ - flush_dcache_range(start, start + size); -} - -#else /* #ifndef CONFIG_SYS_DCACHE_OFF */ -void invalidate_dcache_all(void) -{ -} - -void flush_dcache_all(void) -{ -} - -void invalidate_dcache_range(unsigned long start, unsigned long stop) -{ -} - -void flush_dcache_range(unsigned long start, unsigned long stop) -{ -} - -void flush_cache(unsigned long start, unsigned long size) -{ -} -#endif /* #ifndef CONFIG_SYS_DCACHE_OFF */ - -#if !defined(CONFIG_SYS_ICACHE_OFF) || !defined(CONFIG_SYS_DCACHE_OFF) -void enable_caches(void) -{ -#ifndef CONFIG_SYS_ICACHE_OFF - icache_enable(); -#endif -#ifndef CONFIG_SYS_DCACHE_OFF - dcache_enable(); -#endif -} -#endif diff --git a/arch/arm/cpu/arm1176/Makefile b/arch/arm/cpu/arm1176/Makefile index deec427..cd6dc9c 100644 --- a/arch/arm/cpu/arm1176/Makefile +++ b/arch/arm/cpu/arm1176/Makefile @@ -8,5 +8,7 @@ # SPDX-License-Identifier: GPL-2.0+ #
+obj- += dummy.o extra-y = start.o -obj-y = cpu.o + +obj-y += ../arm11/ diff --git a/arch/arm/cpu/arm1176/cpu.c b/arch/arm/cpu/arm1176/cpu.c deleted file mode 100644 index 2d81651..0000000 --- a/arch/arm/cpu/arm1176/cpu.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * (C) Copyright 2004 Texas Insturments - * - * (C) Copyright 2002 - * Sysgo Real-Time Solutions, GmbH <www.elinos.com> - * Marius Groeger mgroeger@sysgo.de - * - * (C) Copyright 2002 - * Gary Jennejohn, DENX Software Engineering, garyj@denx.de - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -/* - * CPU specific code - */ - -#include <common.h> -#include <command.h> -#include <asm/system.h> - -static void cache_flush (void); - -int cleanup_before_linux (void) -{ - /* - * this function is called just before we call linux - * it prepares the processor for linux - * - * we turn off caches etc ... - */ - - disable_interrupts (); - - /* turn off I/D-cache */ - icache_disable(); - dcache_disable(); - /* flush I/D-cache */ - cache_flush(); - - return 0; -} - -/* flush I/D-cache */ -static void cache_flush (void) -{ - /* invalidate both caches and flush btb */ - asm ("mcr p15, 0, %0, c7, c7, 0": :"r" (0)); - /* mem barrier to sync things */ - asm ("mcr p15, 0, %0, c7, c10, 4": :"r" (0)); -}

The cacheline is always 32 bytes for arm1176 CPUs, so define it at board config level for cache handling code.
Signed-off-by: Alexander Stein alexanders83@web.de --- include/configs/rpi-common.h | 1 + 1 file changed, 1 insertion(+)
diff --git a/include/configs/rpi-common.h b/include/configs/rpi-common.h index 1012cdd..e75fb1e 100644 --- a/include/configs/rpi-common.h +++ b/include/configs/rpi-common.h @@ -15,6 +15,7 @@ #define CONFIG_BCM2835 #define CONFIG_ARCH_CPU_INIT #define CONFIG_SYS_DCACHE_OFF +#define CONFIG_SYS_CACHELINE_SIZE 32
#define CONFIG_SYS_TIMER_RATE 1000000 #define CONFIG_SYS_TIMER_COUNTER \

The mailbox buffer is required to be at least 16 bytes aligned, but for cache invalidation and/or flush it needs to be cacheline aligned. Use CONFIG_SYS_CACHELINE_SIZE alignment for all mailbox buffer allocations.
Signed-off-by: Alexander Stein alexanders83@web.de --- board/raspberrypi/rpi/rpi.c | 10 +++++----- drivers/video/bcm2835.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/board/raspberrypi/rpi/rpi.c b/board/raspberrypi/rpi/rpi.c index 96fe870..d5d3fec 100644 --- a/board/raspberrypi/rpi/rpi.c +++ b/board/raspberrypi/rpi/rpi.c @@ -182,7 +182,7 @@ u32 rpi_board_rev = 0;
int dram_init(void) { - ALLOC_ALIGN_BUFFER(struct msg_get_arm_mem, msg, 1, 16); + ALLOC_ALIGN_BUFFER(struct msg_get_arm_mem, msg, 1, CONFIG_SYS_CACHELINE_SIZE); int ret;
BCM2835_MBOX_INIT_HDR(msg); @@ -212,7 +212,7 @@ static void set_fdtfile(void)
static void set_usbethaddr(void) { - ALLOC_ALIGN_BUFFER(struct msg_get_mac_address, msg, 1, 16); + ALLOC_ALIGN_BUFFER(struct msg_get_mac_address, msg, 1, CONFIG_SYS_CACHELINE_SIZE); int ret;
if (!models[rpi_board_rev].has_onboard_eth) @@ -245,7 +245,7 @@ int misc_init_r(void)
static int power_on_module(u32 module) { - ALLOC_ALIGN_BUFFER(struct msg_set_power_state, msg_pwr, 1, 16); + ALLOC_ALIGN_BUFFER(struct msg_set_power_state, msg_pwr, 1, CONFIG_SYS_CACHELINE_SIZE); int ret;
BCM2835_MBOX_INIT_HDR(msg_pwr); @@ -269,7 +269,7 @@ static int power_on_module(u32 module)
static void get_board_rev(void) { - ALLOC_ALIGN_BUFFER(struct msg_get_board_rev, msg, 1, 16); + ALLOC_ALIGN_BUFFER(struct msg_get_board_rev, msg, 1, CONFIG_SYS_CACHELINE_SIZE); int ret; const char *name;
@@ -324,7 +324,7 @@ int board_init(void)
int board_mmc_init(bd_t *bis) { - ALLOC_ALIGN_BUFFER(struct msg_get_clock_rate, msg_clk, 1, 16); + ALLOC_ALIGN_BUFFER(struct msg_get_clock_rate, msg_clk, 1, CONFIG_SYS_CACHELINE_SIZE); int ret;
power_on_module(BCM2835_MBOX_POWER_DEVID_SDHCI); diff --git a/drivers/video/bcm2835.c b/drivers/video/bcm2835.c index 1f18231..30e22cc 100644 --- a/drivers/video/bcm2835.c +++ b/drivers/video/bcm2835.c @@ -38,8 +38,8 @@ struct msg_setup {
void lcd_ctrl_init(void *lcdbase) { - ALLOC_ALIGN_BUFFER(struct msg_query, msg_query, 1, 16); - ALLOC_ALIGN_BUFFER(struct msg_setup, msg_setup, 1, 16); + ALLOC_ALIGN_BUFFER(struct msg_query, msg_query, 1, CONFIG_SYS_CACHELINE_SIZE); + ALLOC_ALIGN_BUFFER(struct msg_setup, msg_setup, 1, CONFIG_SYS_CACHELINE_SIZE); int ret; u32 w, h;

When using dcache the setup data for the mailbox must be actually written into memory before calling into firmware. Thus flush and invalidate the memory.
Signed-off-by: Alexander Stein alexanders83@web.de --- Changes in v2: * Add hint in header about alignment requirements * Invalidate cache after calling into mailbox * round size up to next cacheline size
arch/arm/mach-bcm283x/include/mach/mbox.h | 4 ++++ arch/arm/mach-bcm283x/mbox.c | 9 +++++++++ 2 files changed, 13 insertions(+)
diff --git a/arch/arm/mach-bcm283x/include/mach/mbox.h b/arch/arm/mach-bcm283x/include/mach/mbox.h index 54d369c..ae1b904 100644 --- a/arch/arm/mach-bcm283x/include/mach/mbox.h +++ b/arch/arm/mach-bcm283x/include/mach/mbox.h @@ -522,6 +522,10 @@ int bcm2835_mbox_call_raw(u32 chan, u32 send, u32 *recv); * a termination value are expected to immediately follow the header in * memory, as required by the property protocol. * + * Each struct bcm2835_mbox_hdr passed must be allocated with + * ALLOC_ALIGN_BUFFER(x, y, z, CONFIG_SYS_CACHELINE_SIZE) to ensure proper + * cache flush/invalidate. + * * Returns 0 for success, any other value for error. */ int bcm2835_mbox_call_prop(u32 chan, struct bcm2835_mbox_hdr *buffer); diff --git a/arch/arm/mach-bcm283x/mbox.c b/arch/arm/mach-bcm283x/mbox.c index 1af9be7..740db0c 100644 --- a/arch/arm/mach-bcm283x/mbox.c +++ b/arch/arm/mach-bcm283x/mbox.c @@ -111,9 +111,18 @@ int bcm2835_mbox_call_prop(u32 chan, struct bcm2835_mbox_hdr *buffer) dump_buf(buffer); #endif
+ flush_dcache_range((unsigned long)buffer, + (unsigned long)((void *)buffer + + roundup(buffer->buf_size, 32))); + ret = bcm2835_mbox_call_raw(chan, phys_to_bus((u32)buffer), &rbuffer); if (ret) return ret; + + invalidate_dcache_range((unsigned long)buffer, + (unsigned long)((void *)buffer + + roundup(buffer->buf_size, 32))); + if (rbuffer != phys_to_bus((u32)buffer)) { printf("mbox: Response buffer mismatch\n"); return -1;

Now that mailbox driver supports cache flush and invalidation, we can enable dcache.
Signed-off-by: Alexander Stein alexanders83@web.de --- Changes in v2: * Only enable dcache on rpi1, but not on rpi2
include/configs/rpi-common.h | 1 - include/configs/rpi_2.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/configs/rpi-common.h b/include/configs/rpi-common.h index e75fb1e..43b460b 100644 --- a/include/configs/rpi-common.h +++ b/include/configs/rpi-common.h @@ -14,7 +14,6 @@ #define CONFIG_SYS_GENERIC_BOARD #define CONFIG_BCM2835 #define CONFIG_ARCH_CPU_INIT -#define CONFIG_SYS_DCACHE_OFF #define CONFIG_SYS_CACHELINE_SIZE 32
#define CONFIG_SYS_TIMER_RATE 1000000 diff --git a/include/configs/rpi_2.h b/include/configs/rpi_2.h index 2e7e74f..0ecd399 100644 --- a/include/configs/rpi_2.h +++ b/include/configs/rpi_2.h @@ -9,6 +9,7 @@
#define CONFIG_SKIP_LOWLEVEL_INIT #define CONFIG_BCM2836 +#define CONFIG_SYS_DCACHE_OFF
#include "rpi-common.h"

On Monday 20 July 2015, 23:00:04 wrote Alexander Stein:
Now that mailbox driver supports cache flush and invalidation, we can enable dcache.
Signed-off-by: Alexander Stein alexanders83@web.de
Well, I just noticed that the dwc2 driver does not support dcache yet. I'm on the way to add that there too and will send v3.
Best regards, Alexander
participants (1)
-
Alexander Stein