[PATCH 0/2] Optimize udelay for aarch64 boards

Arm boards currently use the default implementation of __udelay which consists of a spin-loop. AArch64 however provides an architectural timer with an event stream which can be enabled and waited upon with a few lines of assembly. This is more efficient and also avoids an issue on Arm FVPs where polling the counter frequently takes much longer than real-time.
Therefore, overrride __udelay for all armv8 boards to use the event stream.
Macros for the relevant instructions already existed for mach-exynos, so pull these up to a common header file.
I have tested this patch chain on qemu-system-aarch64 (virt,secure=on), FVP_Base_RevC and a Raspberry Pi 4.
Peter Hoyes (2): arm: Move sev() and wfe() definitions to common Arm header file armv8: generic_timer: Use event stream for udelay
arch/arm/cpu/armv8/Kconfig | 8 +++++++ arch/arm/cpu/armv8/generic_timer.c | 27 ++++++++++++++++++++++ arch/arm/include/asm/system.h | 15 ++++++++++-- arch/arm/mach-exynos/include/mach/system.h | 19 --------------- 4 files changed, 48 insertions(+), 21 deletions(-)

From: Peter Hoyes Peter.Hoyes@arm.com
The sev() and wfe() asm macros are currently defined only for mach-exynos. As these are common Arm instructions, move them to the common asm/system.h header file, for both Armv7 and Armv8, so they can be used by other machines.
wfe may theoretically trigger a context switch if an interrupt occurs so add a memory barrier to this call.
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com --- arch/arm/include/asm/system.h | 9 +++++++++ arch/arm/mach-exynos/include/mach/system.h | 19 ------------------- 2 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 43f7503571..51123c2968 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -154,6 +154,13 @@ enum dcache_option { "wfi" : : : "memory"); \ })
+#define wfe() \ + ({asm volatile( \ + "wfe" : : : "memory"); \ + }) + +#define sev() asm volatile("sev") + static inline unsigned int current_el(void) { unsigned long el; @@ -369,6 +376,8 @@ void switch_to_hypervisor_ret(void);
#ifdef __ARM_ARCH_7A__ #define wfi() __asm__ __volatile__ ("wfi" : : : "memory") +#define wfe() __asm__ __volatile__ ("wfe" : : : "memory") +#define sev() __asm__ __volatile__ ("sev") #else #define wfi() #endif diff --git a/arch/arm/mach-exynos/include/mach/system.h b/arch/arm/mach-exynos/include/mach/system.h index 5d0bebac57..0aed4c3e2b 100644 --- a/arch/arm/mach-exynos/include/mach/system.h +++ b/arch/arm/mach-exynos/include/mach/system.h @@ -36,25 +36,6 @@ struct exynos5_sysreg {
#define USB20_PHY_CFG_HOST_LINK_EN (1 << 0)
-/* - * This instruction causes an event to be signaled to all cores - * within a multiprocessor system. If SEV is implemented, - * WFE must also be implemented. - */ -#define sev() __asm__ __volatile__ ("sev\n\t" : : ); -/* - * If the Event Register is not set, WFE suspends execution until - * one of the following events occurs: - * - an IRQ interrupt, unless masked by the CPSR I-bit - * - an FIQ interrupt, unless masked by the CPSR F-bit - * - an Imprecise Data abort, unless masked by the CPSR A-bit - * - a Debug Entry request, if Debug is enabled - * - an Event signaled by another processor using the SEV instruction. - * If the Event Register is set, WFE clears it and returns immediately. - * If WFE is implemented, SEV must also be implemented. - */ -#define wfe() __asm__ __volatile__ ("wfe\n\t" : : ); - /* Move 0xd3 value to CPSR register to enable SVC mode */ #define svc32_mode_en() __asm__ __volatile__ \ ("@ I&F disable, Mode: 0x13 - SVC\n\t" \

On Tue, 23 Apr 2024 09:10:04 +0100 Peter Hoyes peter.hoyes@arm.com wrote:
Hi,
From: Peter Hoyes Peter.Hoyes@arm.com
The sev() and wfe() asm macros are currently defined only for mach-exynos. As these are common Arm instructions, move them to the common asm/system.h header file, for both Armv7 and Armv8, so they can be used by other machines.
wfe may theoretically trigger a context switch if an interrupt occurs so add a memory barrier to this call.
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com
Looks alright to me, and the memory barrier for wfe looks justified.
For the records: we couldn't come up with a reason why "sev" would need one as well, so even while the kernel has this, this seems to be more a copy&paste incident rather than something intentional.
Reviewed-by: Andre Przywara andre.przywara@arm.com
Cheers, Andre
arch/arm/include/asm/system.h | 9 +++++++++ arch/arm/mach-exynos/include/mach/system.h | 19 ------------------- 2 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 43f7503571..51123c2968 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -154,6 +154,13 @@ enum dcache_option { "wfi" : : : "memory"); \ })
+#define wfe() \
- ({asm volatile( \
- "wfe" : : : "memory"); \
- })
+#define sev() asm volatile("sev")
static inline unsigned int current_el(void) { unsigned long el; @@ -369,6 +376,8 @@ void switch_to_hypervisor_ret(void);
#ifdef __ARM_ARCH_7A__ #define wfi() __asm__ __volatile__ ("wfi" : : : "memory") +#define wfe() __asm__ __volatile__ ("wfe" : : : "memory") +#define sev() __asm__ __volatile__ ("sev") #else #define wfi() #endif diff --git a/arch/arm/mach-exynos/include/mach/system.h b/arch/arm/mach-exynos/include/mach/system.h index 5d0bebac57..0aed4c3e2b 100644 --- a/arch/arm/mach-exynos/include/mach/system.h +++ b/arch/arm/mach-exynos/include/mach/system.h @@ -36,25 +36,6 @@ struct exynos5_sysreg {
#define USB20_PHY_CFG_HOST_LINK_EN (1 << 0)
-/*
- This instruction causes an event to be signaled to all cores
- within a multiprocessor system. If SEV is implemented,
- WFE must also be implemented.
- */
-#define sev() __asm__ __volatile__ ("sev\n\t" : : ); -/*
- If the Event Register is not set, WFE suspends execution until
- one of the following events occurs:
- an IRQ interrupt, unless masked by the CPSR I-bit
- an FIQ interrupt, unless masked by the CPSR F-bit
- an Imprecise Data abort, unless masked by the CPSR A-bit
- a Debug Entry request, if Debug is enabled
- an Event signaled by another processor using the SEV instruction.
- If the Event Register is set, WFE clears it and returns immediately.
- If WFE is implemented, SEV must also be implemented.
- */
-#define wfe() __asm__ __volatile__ ("wfe\n\t" : : );
/* Move 0xd3 value to CPSR register to enable SVC mode */ #define svc32_mode_en() __asm__ __volatile__ \ ("@ I&F disable, Mode: 0x13 - SVC\n\t" \

From: Peter Hoyes Peter.Hoyes@arm.com
Polling cntpct_el0 in a tight loop for delays is inefficient. This is particularly apparent on Arm FVPs, which do not simulate real time, meaning that a 1s sleep can take a couple of orders of magnitude longer to execute in wall time.
If running at EL2 or above (where CNTHCTL_EL2 is available), enable the cntpct_el0 event stream temporarily and use wfe to implement the delay more efficiently. The event period is chosen as a trade-off between efficiency and the fact that Arm FVPs do not typically simulate real time.
This is only implemented for Armv8 boards, where an architectural timer exists.
Some mach-socfpga AArch64 boards already override __udelay to make it always inline, so guard the functionality with a new ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com --- arch/arm/cpu/armv8/Kconfig | 8 ++++++++ arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++ arch/arm/include/asm/system.h | 6 ++++-- 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig index 9f0fb369f7..544c5e2d74 100644 --- a/arch/arm/cpu/armv8/Kconfig +++ b/arch/arm/cpu/armv8/Kconfig @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST Exception handling at all exception levels for External Abort and SError interrupt exception are taken in EL3.
+config ARMV8_UDELAY_EVENT_STREAM + bool "Use the event stream for udelay" + default y if !ARCH_SOCFPGA + help + Use the event stream provided by the AArch64 architectural timer for + delays. This is more efficient than the default polling + implementation. + menuconfig ARMV8_CRYPTO bool "ARM64 Accelerated Cryptographic Algorithms"
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644 --- a/arch/arm/cpu/armv8/generic_timer.c +++ b/arch/arm/cpu/armv8/generic_timer.c @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
return val / get_tbclk(); } + +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM) +void __udelay(unsigned long usec) +{ + u64 target = get_ticks() + usec_to_tick(usec); + + /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */ + if (current_el() >= 2) { + u32 cnthctl_val; + const u8 event_period = 0x7; + + asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val)); + asm volatile("msr cnthctl_el2, %0" : : "r" + (cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period))); + + while (get_ticks() + (1ULL << event_period) <= target) + wfe(); + + /* Reset the event stream */ + asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val)); + } + + /* Fall back to polling CNTPCT_EL0 */ + while (get_ticks() <= target) + ; +} +#endif diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 51123c2968..7e30cac32a 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -69,8 +69,10 @@ /* * CNTHCTL_EL2 bits definitions */ -#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible */ -#define CNTHCTL_EL2_EL1PCTEN_EN (1 << 0) /* Physical counter accessible */ +#define CNTHCTL_EL2_EVNT_EN BIT(2) /* Enable the event stream */ +#define CNTHCTL_EL2_EVNT_I(val) ((val) << 4) /* Event stream trigger bits */ +#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible */ +#define CNTHCTL_EL2_EL1PCTEN_EN (1 << 0) /* Physical counter accessible */
/* * HCR_EL2 bits definitions

Hi Peter,
On 4/23/24 10:10, Peter Hoyes wrote:
From: Peter Hoyes Peter.Hoyes@arm.com
Polling cntpct_el0 in a tight loop for delays is inefficient. This is particularly apparent on Arm FVPs, which do not simulate real time, meaning that a 1s sleep can take a couple of orders of magnitude longer to execute in wall time.
If running at EL2 or above (where CNTHCTL_EL2 is available), enable the cntpct_el0 event stream temporarily and use wfe to implement the delay more efficiently. The event period is chosen as a trade-off between efficiency and the fact that Arm FVPs do not typically simulate real time.
This is only implemented for Armv8 boards, where an architectural timer exists.
Some mach-socfpga AArch64 boards already override __udelay to make it always inline, so guard the functionality with a new ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com
arch/arm/cpu/armv8/Kconfig | 8 ++++++++ arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++ arch/arm/include/asm/system.h | 6 ++++-- 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig index 9f0fb369f7..544c5e2d74 100644 --- a/arch/arm/cpu/armv8/Kconfig +++ b/arch/arm/cpu/armv8/Kconfig @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST Exception handling at all exception levels for External Abort and SError interrupt exception are taken in EL3.
+config ARMV8_UDELAY_EVENT_STREAM
- bool "Use the event stream for udelay"
- default y if !ARCH_SOCFPGA
- help
Use the event stream provided by the AArch64 architectural timer for
delays. This is more efficient than the default polling
implementation.
- menuconfig ARMV8_CRYPTO bool "ARM64 Accelerated Cryptographic Algorithms"
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644 --- a/arch/arm/cpu/armv8/generic_timer.c +++ b/arch/arm/cpu/armv8/generic_timer.c @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
return val / get_tbclk(); }
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM) +void __udelay(unsigned long usec) +{
- u64 target = get_ticks() + usec_to_tick(usec);
This can theoretically overflow, do we have any guarantee this cannot happen in real life, like... we would need U-Boot to be running for 100 years without being powered-down/reset or something like that? Can we document this assumption? Does this make sense?
- /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
- if (current_el() >= 2) {
u32 cnthctl_val;
const u8 event_period = 0x7;
asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
asm volatile("msr cnthctl_el2, %0" : : "r"
(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
while (get_ticks() + (1ULL << event_period) <= target)
This could be an overflow as well.
wfe();
/* Reset the event stream */
asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
- }
- /* Fall back to polling CNTPCT_EL0 */
- while (get_ticks() <= target)
get_ticks() could wrap around here maybe?
Cheers, Quentin

Hi Quentin,
On 4/23/24 11:55, Quentin Schulz wrote:
Hi Peter,
On 4/23/24 10:10, Peter Hoyes wrote:
From: Peter Hoyes Peter.Hoyes@arm.com
Polling cntpct_el0 in a tight loop for delays is inefficient. This is particularly apparent on Arm FVPs, which do not simulate real time, meaning that a 1s sleep can take a couple of orders of magnitude longer to execute in wall time.
If running at EL2 or above (where CNTHCTL_EL2 is available), enable the cntpct_el0 event stream temporarily and use wfe to implement the delay more efficiently. The event period is chosen as a trade-off between efficiency and the fact that Arm FVPs do not typically simulate real time.
This is only implemented for Armv8 boards, where an architectural timer exists.
Some mach-socfpga AArch64 boards already override __udelay to make it always inline, so guard the functionality with a new ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com
arch/arm/cpu/armv8/Kconfig | 8 ++++++++ arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++ arch/arm/include/asm/system.h | 6 ++++-- 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig index 9f0fb369f7..544c5e2d74 100644 --- a/arch/arm/cpu/armv8/Kconfig +++ b/arch/arm/cpu/armv8/Kconfig @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST Exception handling at all exception levels for External Abort and SError interrupt exception are taken in EL3. +config ARMV8_UDELAY_EVENT_STREAM + bool "Use the event stream for udelay" + default y if !ARCH_SOCFPGA + help + Use the event stream provided by the AArch64 architectural timer for + delays. This is more efficient than the default polling + implementation.
menuconfig ARMV8_CRYPTO bool "ARM64 Accelerated Cryptographic Algorithms" diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644 --- a/arch/arm/cpu/armv8/generic_timer.c +++ b/arch/arm/cpu/armv8/generic_timer.c @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void) return val / get_tbclk(); }
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM) +void __udelay(unsigned long usec) +{ + u64 target = get_ticks() + usec_to_tick(usec);
This can theoretically overflow, do we have any guarantee this cannot happen in real life, like... we would need U-Boot to be running for 100 years without being powered-down/reset or something like that? Can we document this assumption? Does this make sense?
+ /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */ + if (current_el() >= 2) { + u32 cnthctl_val; + const u8 event_period = 0x7;
+ asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val)); + asm volatile("msr cnthctl_el2, %0" : : "r" + (cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
+ while (get_ticks() + (1ULL << event_period) <= target)
This could be an overflow as well.
+ wfe();
+ /* Reset the event stream */ + asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val)); + }
+ /* Fall back to polling CNTPCT_EL0 */ + while (get_ticks() <= target)
get_ticks() could wrap around here maybe?
I don't see a problem here. It's using u64s and the maximum COUNTER_FREQUENCY in the U-Boot source is 200MHz, which gives enough ticks for about 3 millennia.
Additionally, the underlying "while" condition is the same as the existing weak __udelay implementation (https://source.denx.de/u-boot/u-boot/-/blob/master/lib/time.c?ref_type=heads...), so this change doesn't affect the overflow limit.
Thanks, Peter

On Tue, 23 Apr 2024 12:55:55 +0200 Quentin Schulz quentin.schulz@theobroma-systems.com wrote:
Hi Peter,
On 4/23/24 10:10, Peter Hoyes wrote:
From: Peter Hoyes Peter.Hoyes@arm.com
Polling cntpct_el0 in a tight loop for delays is inefficient. This is particularly apparent on Arm FVPs, which do not simulate real time, meaning that a 1s sleep can take a couple of orders of magnitude longer to execute in wall time.
If running at EL2 or above (where CNTHCTL_EL2 is available), enable the cntpct_el0 event stream temporarily and use wfe to implement the delay more efficiently. The event period is chosen as a trade-off between efficiency and the fact that Arm FVPs do not typically simulate real time.
This is only implemented for Armv8 boards, where an architectural timer exists.
Some mach-socfpga AArch64 boards already override __udelay to make it always inline, so guard the functionality with a new ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com
arch/arm/cpu/armv8/Kconfig | 8 ++++++++ arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++ arch/arm/include/asm/system.h | 6 ++++-- 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig index 9f0fb369f7..544c5e2d74 100644 --- a/arch/arm/cpu/armv8/Kconfig +++ b/arch/arm/cpu/armv8/Kconfig @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST Exception handling at all exception levels for External Abort and SError interrupt exception are taken in EL3.
+config ARMV8_UDELAY_EVENT_STREAM
- bool "Use the event stream for udelay"
- default y if !ARCH_SOCFPGA
- help
Use the event stream provided by the AArch64 architectural timer for
delays. This is more efficient than the default polling
implementation.
- menuconfig ARMV8_CRYPTO bool "ARM64 Accelerated Cryptographic Algorithms"
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644 --- a/arch/arm/cpu/armv8/generic_timer.c +++ b/arch/arm/cpu/armv8/generic_timer.c @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
return val / get_tbclk(); }
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM) +void __udelay(unsigned long usec) +{
- u64 target = get_ticks() + usec_to_tick(usec);
This can theoretically overflow, do we have any guarantee this cannot happen in real life, like... we would need U-Boot to be running for 100 years without being powered-down/reset or something like that? Can we document this assumption? Does this make sense?
The Arm ARM guarantees a "Roll-over time of not less than 40 years." (Armv8 ARM 0487K.a D12.1.2 "The system counter"). So that's not the 100 years you are asking for, but I guess still good enough?
Cheers, Andre
- /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
- if (current_el() >= 2) {
u32 cnthctl_val;
const u8 event_period = 0x7;
asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
asm volatile("msr cnthctl_el2, %0" : : "r"
(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
while (get_ticks() + (1ULL << event_period) <= target)
This could be an overflow as well.
wfe();
/* Reset the event stream */
asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
- }
- /* Fall back to polling CNTPCT_EL0 */
- while (get_ticks() <= target)
get_ticks() could wrap around here maybe?
Cheers, Quentin

Hi Peter, Andre,
On 4/24/24 12:29, Andre Przywara wrote:
On Tue, 23 Apr 2024 12:55:55 +0200 Quentin Schulz quentin.schulz@theobroma-systems.com wrote:
Hi Peter,
On 4/23/24 10:10, Peter Hoyes wrote:
From: Peter Hoyes Peter.Hoyes@arm.com
Polling cntpct_el0 in a tight loop for delays is inefficient. This is particularly apparent on Arm FVPs, which do not simulate real time, meaning that a 1s sleep can take a couple of orders of magnitude longer to execute in wall time.
If running at EL2 or above (where CNTHCTL_EL2 is available), enable the cntpct_el0 event stream temporarily and use wfe to implement the delay more efficiently. The event period is chosen as a trade-off between efficiency and the fact that Arm FVPs do not typically simulate real time.
This is only implemented for Armv8 boards, where an architectural timer exists.
Some mach-socfpga AArch64 boards already override __udelay to make it always inline, so guard the functionality with a new ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com
arch/arm/cpu/armv8/Kconfig | 8 ++++++++ arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++ arch/arm/include/asm/system.h | 6 ++++-- 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig index 9f0fb369f7..544c5e2d74 100644 --- a/arch/arm/cpu/armv8/Kconfig +++ b/arch/arm/cpu/armv8/Kconfig @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST Exception handling at all exception levels for External Abort and SError interrupt exception are taken in EL3.
+config ARMV8_UDELAY_EVENT_STREAM
- bool "Use the event stream for udelay"
- default y if !ARCH_SOCFPGA
- help
Use the event stream provided by the AArch64 architectural timer for
delays. This is more efficient than the default polling
implementation.
- menuconfig ARMV8_CRYPTO bool "ARM64 Accelerated Cryptographic Algorithms"
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644 --- a/arch/arm/cpu/armv8/generic_timer.c +++ b/arch/arm/cpu/armv8/generic_timer.c @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
return val / get_tbclk();
}
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM) +void __udelay(unsigned long usec) +{
- u64 target = get_ticks() + usec_to_tick(usec);
This can theoretically overflow, do we have any guarantee this cannot happen in real life, like... we would need U-Boot to be running for 100 years without being powered-down/reset or something like that? Can we document this assumption? Does this make sense?
The Arm ARM guarantees a "Roll-over time of not less than 40 years." (Armv8 ARM 0487K.a D12.1.2 "The system counter"). So that's not the 100 years you are asking for, but I guess still good enough?
I guess it is, since it is also stored in a u64 and is reset to 0 upon start-up according to the ARM. I also assume nobody is going to add a udelay of years in their code :) (and if they do, they would probably figure out something's wrong before it reaches the final products :) ).
Thanks all for the pointers to reference manuals and current implementations.
Cheers, Quentin

On Tue, 23 Apr 2024 09:10:05 +0100 Peter Hoyes peter.hoyes@arm.com wrote:
Hi,
From: Peter Hoyes Peter.Hoyes@arm.com
Polling cntpct_el0 in a tight loop for delays is inefficient. This is particularly apparent on Arm FVPs, which do not simulate real time, meaning that a 1s sleep can take a couple of orders of magnitude longer to execute in wall time.
If running at EL2 or above (where CNTHCTL_EL2 is available), enable the cntpct_el0 event stream temporarily and use wfe to implement the delay more efficiently. The event period is chosen as a trade-off between efficiency and the fact that Arm FVPs do not typically simulate real time.
This is only implemented for Armv8 boards, where an architectural timer exists.
Some mach-socfpga AArch64 boards already override __udelay to make it always inline, so guard the functionality with a new ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
So the code looks alright, and it works for me, tested on some hardware I just had at hand. However I am a bit worried about the scope of this patch. While it's indeed purely architectural and should work on all systems, I think we need to be careful with messing with the arch timer *while the OS is running*. U-Boot code will run for UEFI runtime services and for serving PSCI on some platforms, and I guess messing with CNTHCTL_EL2 is not a good idea then. And udelay sounds like a basic function that this code might use. So I wonder if we should limit this to the Arm FVPs for now? To not create subtle and hard-to-diagnose problems for a lot of boards?
Or maybe there is some mechanism to limit this to U-Boot boot time/UEFI boot services?
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com
arch/arm/cpu/armv8/Kconfig | 8 ++++++++ arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++ arch/arm/include/asm/system.h | 6 ++++-- 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig index 9f0fb369f7..544c5e2d74 100644 --- a/arch/arm/cpu/armv8/Kconfig +++ b/arch/arm/cpu/armv8/Kconfig @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST Exception handling at all exception levels for External Abort and SError interrupt exception are taken in EL3.
+config ARMV8_UDELAY_EVENT_STREAM
- bool "Use the event stream for udelay"
- default y if !ARCH_SOCFPGA
As described above, change this to something like "... if ARCH_VEXPRESS64".
Cheers, Andre
- help
Use the event stream provided by the AArch64 architectural timer for
delays. This is more efficient than the default polling
implementation.
menuconfig ARMV8_CRYPTO bool "ARM64 Accelerated Cryptographic Algorithms"
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644 --- a/arch/arm/cpu/armv8/generic_timer.c +++ b/arch/arm/cpu/armv8/generic_timer.c @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
return val / get_tbclk(); }
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM) +void __udelay(unsigned long usec) +{
- u64 target = get_ticks() + usec_to_tick(usec);
- /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
- if (current_el() >= 2) {
u32 cnthctl_val;
const u8 event_period = 0x7;
asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
asm volatile("msr cnthctl_el2, %0" : : "r"
(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
while (get_ticks() + (1ULL << event_period) <= target)
wfe();
/* Reset the event stream */
asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
- }
- /* Fall back to polling CNTPCT_EL0 */
- while (get_ticks() <= target)
;
+} +#endif diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 51123c2968..7e30cac32a 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -69,8 +69,10 @@ /*
- CNTHCTL_EL2 bits definitions
*/ -#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible */ -#define CNTHCTL_EL2_EL1PCTEN_EN (1 << 0) /* Physical counter accessible */ +#define CNTHCTL_EL2_EVNT_EN BIT(2) /* Enable the event stream */ +#define CNTHCTL_EL2_EVNT_I(val) ((val) << 4) /* Event stream trigger bits */ +#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible */ +#define CNTHCTL_EL2_EL1PCTEN_EN (1 << 0) /* Physical counter accessible */
/*
- HCR_EL2 bits definitions

On 4/29/24 16:16, Andre Przywara wrote:
On Tue, 23 Apr 2024 09:10:05 +0100 Peter Hoyes peter.hoyes@arm.com wrote:
Hi,
From: Peter Hoyes Peter.Hoyes@arm.com
Polling cntpct_el0 in a tight loop for delays is inefficient. This is particularly apparent on Arm FVPs, which do not simulate real time, meaning that a 1s sleep can take a couple of orders of magnitude longer to execute in wall time.
If running at EL2 or above (where CNTHCTL_EL2 is available), enable the cntpct_el0 event stream temporarily and use wfe to implement the delay more efficiently. The event period is chosen as a trade-off between efficiency and the fact that Arm FVPs do not typically simulate real time.
This is only implemented for Armv8 boards, where an architectural timer exists.
Some mach-socfpga AArch64 boards already override __udelay to make it always inline, so guard the functionality with a new ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
So the code looks alright, and it works for me, tested on some hardware I just had at hand. However I am a bit worried about the scope of this patch. While it's indeed purely architectural and should work on all systems, I think we need to be careful with messing with the arch timer *while the OS is running*. U-Boot code will run for UEFI runtime services and for serving PSCI on some platforms, and I guess messing with CNTHCTL_EL2 is not a good idea then. And udelay sounds like a basic function that this code might use. So I wonder if we should limit this to the Arm FVPs for now? To not create subtle and hard-to-diagnose problems for a lot of boards?
Or maybe there is some mechanism to limit this to U-Boot boot time/UEFI boot services?
Signed-off-by: Peter Hoyes Peter.Hoyes@arm.com
arch/arm/cpu/armv8/Kconfig | 8 ++++++++ arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++ arch/arm/include/asm/system.h | 6 ++++-- 3 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig index 9f0fb369f7..544c5e2d74 100644 --- a/arch/arm/cpu/armv8/Kconfig +++ b/arch/arm/cpu/armv8/Kconfig @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST Exception handling at all exception levels for External Abort and SError interrupt exception are taken in EL3.
+config ARMV8_UDELAY_EVENT_STREAM
- bool "Use the event stream for udelay"
- default y if !ARCH_SOCFPGA
As described above, change this to something like "... if ARCH_VEXPRESS64".
Cheers, Andre
Agreed, this is limited to ARCH_VEXPRESS64 in v2.
Peter
- help
Use the event stream provided by the AArch64 architectural timer for
delays. This is more efficient than the default polling
implementation.
- menuconfig ARMV8_CRYPTO bool "ARM64 Accelerated Cryptographic Algorithms"
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644 --- a/arch/arm/cpu/armv8/generic_timer.c +++ b/arch/arm/cpu/armv8/generic_timer.c @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
return val / get_tbclk(); }
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM) +void __udelay(unsigned long usec) +{
- u64 target = get_ticks() + usec_to_tick(usec);
- /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
- if (current_el() >= 2) {
u32 cnthctl_val;
const u8 event_period = 0x7;
asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
asm volatile("msr cnthctl_el2, %0" : : "r"
(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
while (get_ticks() + (1ULL << event_period) <= target)
wfe();
/* Reset the event stream */
asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
- }
- /* Fall back to polling CNTPCT_EL0 */
- while (get_ticks() <= target)
;
+} +#endif diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 51123c2968..7e30cac32a 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -69,8 +69,10 @@ /*
- CNTHCTL_EL2 bits definitions
*/ -#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible */ -#define CNTHCTL_EL2_EL1PCTEN_EN (1 << 0) /* Physical counter accessible */ +#define CNTHCTL_EL2_EVNT_EN BIT(2) /* Enable the event stream */ +#define CNTHCTL_EL2_EVNT_I(val) ((val) << 4) /* Event stream trigger bits */ +#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible */ +#define CNTHCTL_EL2_EL1PCTEN_EN (1 << 0) /* Physical counter accessible */
/*
- HCR_EL2 bits definitions
participants (3)
-
Andre Przywara
-
Peter Hoyes
-
Quentin Schulz