[U-Boot] [PATCH v4 00/14] Add PSCI support for Jetson TK1/Tegra124 + CNTFRQ fix

Changes in v4: - rebased over master - fixed a nasty bug in CNTFRQ setting that broke unrelated code - fixed a bug in Tegra's psci_cpu_off that prevented powergating - factored out armv7_get_cpu_id macro - removed duplicate call to psci_arch_init - addressed review comments regarding comments - wait on initial CPU power-down using Flow Controller instead of PMC
Jan
CC: Ian Campbell ijc@hellion.org.uk CC: Marc Zyngier marc.zyngier@arm.com
Ian Campbell (3): tegra124: Add more registers to struct mc_ctlr jetson-tk1: Add PSCI configuration options and reserve secure code tegra124: Reserve secure RAM using MC_SECURITY_CFG{0, 1}_0
Jan Kiszka (11): sun7i: Remove duplicate call to psci_arch_init ARM: Factor out armv7_get_cpu_id macro ARM: Factor out reusable psci_cpu_off_common ARM: Factor out reusable psci_cpu_entry ARM: Factor out reusable psci_get_cpu_stack_top ARM: Put target PC for PSCI CPU_ON on per-CPU stack virt-dt: Allow reservation of secure region when in a RAM carveout tegra: Make tegra_powergate_power_on public tegra: Add ap_pm_init hook tegra124: Add PSCI support for Tegra124 tegra: Set CNTFRQ for secondary CPUs
arch/arm/cpu/armv7/psci.S | 102 +++++++++++++++++++++++++ arch/arm/cpu/armv7/sunxi/psci.S | 112 +++++---------------------- arch/arm/cpu/armv7/virt-dt.c | 29 +++++++ arch/arm/cpu/armv7/virt-v7.c | 5 ++ arch/arm/include/asm/arch-tegra/ap.h | 5 ++ arch/arm/include/asm/arch-tegra/powergate.h | 1 + arch/arm/include/asm/arch-tegra124/flow.h | 6 ++ arch/arm/include/asm/arch-tegra124/mc.h | 35 ++++++++- arch/arm/include/asm/armv7.h | 1 + arch/arm/include/asm/macro.h | 7 ++ arch/arm/include/asm/system.h | 1 + arch/arm/lib/bootm-fdt.c | 5 ++ arch/arm/mach-tegra/Makefile | 4 + arch/arm/mach-tegra/ap.c | 15 ++++ arch/arm/mach-tegra/powergate.c | 2 +- arch/arm/mach-tegra/psci.S | 114 ++++++++++++++++++++++++++++ arch/arm/mach-tegra/tegra124/Kconfig | 2 + arch/arm/mach-tegra/tegra124/Makefile | 4 + arch/arm/mach-tegra/tegra124/ap.c | 55 ++++++++++++++ board/nvidia/common/board.c | 4 + include/configs/jetson-tk1.h | 5 ++ 21 files changed, 416 insertions(+), 98 deletions(-) create mode 100644 arch/arm/mach-tegra/psci.S create mode 100644 arch/arm/mach-tegra/tegra124/ap.c

This is already invoked a few cycles later in monitor mode by _secure_monitor. Drop it here, it serves no purpose.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/sunxi/psci.S | 1 - 1 file changed, 1 deletion(-)
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 5be497b..9e898f2 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -254,7 +254,6 @@ _sunxi_cpu_entry: isb
bl _nonsec_init - bl psci_arch_init
adr r0, _target_pc ldr r0, [r0]

Handy for obtaining the ID of the current CPU. We will have more use cases.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/sunxi/psci.S | 4 ++-- arch/arm/include/asm/macro.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 9e898f2..0523217 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -19,6 +19,7 @@
#include <config.h> #include <asm/gic.h> +#include <asm/macro.h> #include <asm/psci.h> #include <asm/arch/cpu.h>
@@ -315,8 +316,7 @@ psci_arch_init: mcr p15, 0, r5, c1, c1, 0 @ Write SCR isb
- mrc p15, 0, r4, c0, c0, 5 @ MPIDR - and r4, r4, #3 @ cpu number in cluster + armv7_get_cpu_id r4 mov r5, #0x400 @ 1kB of stack per CPU mul r4, r4, r5
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index 1c8c425..0bc925a 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -198,6 +198,13 @@ lr .req x30 .endm #endif
+#else /* !CONFIG_ARM64 */ + +.macro armv7_get_cpu_id rn + mrc p15, 0, \rn, c0, c0, 5 /* read MPIDR */ + and \rn, \rn, #0xff /* return CPU ID in cluster */ +.endm + #endif /* CONFIG_ARM64 */
#endif /* __ASSEMBLY__ */

On Fri, 27 Feb 2015 13:28:01 +0000 Jan Kiszka jan.kiszka@siemens.com wrote:
Handy for obtaining the ID of the current CPU. We will have more use cases.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com
arch/arm/cpu/armv7/sunxi/psci.S | 4 ++-- arch/arm/include/asm/macro.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 9e898f2..0523217 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -19,6 +19,7 @@
#include <config.h> #include <asm/gic.h> +#include <asm/macro.h> #include <asm/psci.h> #include <asm/arch/cpu.h>
@@ -315,8 +316,7 @@ psci_arch_init: mcr p15, 0, r5, c1, c1, 0 @ Write SCR isb
- mrc p15, 0, r4, c0, c0, 5 @ MPIDR
- and r4, r4, #3 @ cpu number in cluster
- armv7_get_cpu_id r4 mov r5, #0x400 @ 1kB of stack per CPU mul r4, r4, r5
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index 1c8c425..0bc925a 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -198,6 +198,13 @@ lr .req x30 .endm #endif
+#else /* !CONFIG_ARM64 */
+.macro armv7_get_cpu_id rn
- mrc p15, 0, \rn, c0, c0, 5 /* read MPIDR */
- and \rn, \rn, #0xff /* return CPU ID
in cluster */ +.endm
How does this work in a multi-cluster situation? Or when you have sparse MPIDRs?
Thanks,
M.
#endif /* CONFIG_ARM64 */
#endif /* __ASSEMBLY__ */

On 2015-02-28 14:56, Marc Zyngier wrote:
On Fri, 27 Feb 2015 13:28:01 +0000 Jan Kiszka jan.kiszka@siemens.com wrote:
Handy for obtaining the ID of the current CPU. We will have more use cases.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com
arch/arm/cpu/armv7/sunxi/psci.S | 4 ++-- arch/arm/include/asm/macro.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 9e898f2..0523217 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -19,6 +19,7 @@
#include <config.h> #include <asm/gic.h> +#include <asm/macro.h> #include <asm/psci.h> #include <asm/arch/cpu.h>
@@ -315,8 +316,7 @@ psci_arch_init: mcr p15, 0, r5, c1, c1, 0 @ Write SCR isb
- mrc p15, 0, r4, c0, c0, 5 @ MPIDR
- and r4, r4, #3 @ cpu number in cluster
- armv7_get_cpu_id r4 mov r5, #0x400 @ 1kB of stack per CPU mul r4, r4, r5
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index 1c8c425..0bc925a 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -198,6 +198,13 @@ lr .req x30 .endm #endif
+#else /* !CONFIG_ARM64 */
+.macro armv7_get_cpu_id rn
- mrc p15, 0, \rn, c0, c0, 5 /* read MPIDR */
- and \rn, \rn, #0xff /* return CPU ID
in cluster */ +.endm
How does this work in a multi-cluster situation? Or when you have sparse MPIDRs?
I have no idea. That masking was stolen from your code.
The model we assume for PSCI is that there is no cluster and that we have enough per-cpu space for up to the maximum cpu ID obtained that way.
If you are concerned about signaling a false general applicability of that macro, I can fold it back into the callers or add some comments about restrictions. My plan was just to make the caller site a bit more readable.
Jan

On 02/03/15 09:40, Jan Kiszka wrote:
On 2015-02-28 14:56, Marc Zyngier wrote:
On Fri, 27 Feb 2015 13:28:01 +0000 Jan Kiszka jan.kiszka@siemens.com wrote:
Handy for obtaining the ID of the current CPU. We will have more use cases.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com
arch/arm/cpu/armv7/sunxi/psci.S | 4 ++-- arch/arm/include/asm/macro.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 9e898f2..0523217 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -19,6 +19,7 @@
#include <config.h> #include <asm/gic.h> +#include <asm/macro.h> #include <asm/psci.h> #include <asm/arch/cpu.h>
@@ -315,8 +316,7 @@ psci_arch_init: mcr p15, 0, r5, c1, c1, 0 @ Write SCR isb
- mrc p15, 0, r4, c0, c0, 5 @ MPIDR
- and r4, r4, #3 @ cpu number in cluster
- armv7_get_cpu_id r4 mov r5, #0x400 @ 1kB of stack per CPU mul r4, r4, r5
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index 1c8c425..0bc925a 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -198,6 +198,13 @@ lr .req x30 .endm #endif
+#else /* !CONFIG_ARM64 */
+.macro armv7_get_cpu_id rn
- mrc p15, 0, \rn, c0, c0, 5 /* read MPIDR */
- and \rn, \rn, #0xff /* return CPU ID
in cluster */ +.endm
How does this work in a multi-cluster situation? Or when you have sparse MPIDRs?
I have no idea. That masking was stolen from your code.
Well, it is perfectly correct in the context of the original code (single cluster, dense ID space), but is utterly wrong as a general implementation.
The model we assume for PSCI is that there is no cluster and that we have enough per-cpu space for up to the maximum cpu ID obtained that way.
I don't think you can rely on this assumption.
If you are concerned about signaling a false general applicability of that macro, I can fold it back into the callers or add some comments about restrictions. My plan was just to make the caller site a bit more readable.
I understand the concern, but making this a general macro is very misleading. Most systems with more than 4 cores will have clusters and a very sparse ID space, and these systems are quite common these days.
Computing a CPU number is not a simple task, especially in the absence of a discovery protocol (the Linux kernel relies on DT for that), so I'm afraid you have to either keep this on a per-platform basis, or provide ways to override this macro.
Thanks,
M.

On 2015-03-02 11:19, Marc Zyngier wrote:
On 02/03/15 09:40, Jan Kiszka wrote:
On 2015-02-28 14:56, Marc Zyngier wrote:
On Fri, 27 Feb 2015 13:28:01 +0000 Jan Kiszka jan.kiszka@siemens.com wrote:
Handy for obtaining the ID of the current CPU. We will have more use cases.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com
arch/arm/cpu/armv7/sunxi/psci.S | 4 ++-- arch/arm/include/asm/macro.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 9e898f2..0523217 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -19,6 +19,7 @@
#include <config.h> #include <asm/gic.h> +#include <asm/macro.h> #include <asm/psci.h> #include <asm/arch/cpu.h>
@@ -315,8 +316,7 @@ psci_arch_init: mcr p15, 0, r5, c1, c1, 0 @ Write SCR isb
- mrc p15, 0, r4, c0, c0, 5 @ MPIDR
- and r4, r4, #3 @ cpu number in cluster
- armv7_get_cpu_id r4 mov r5, #0x400 @ 1kB of stack per CPU mul r4, r4, r5
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index 1c8c425..0bc925a 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -198,6 +198,13 @@ lr .req x30 .endm #endif
+#else /* !CONFIG_ARM64 */
+.macro armv7_get_cpu_id rn
- mrc p15, 0, \rn, c0, c0, 5 /* read MPIDR */
- and \rn, \rn, #0xff /* return CPU ID
in cluster */ +.endm
How does this work in a multi-cluster situation? Or when you have sparse MPIDRs?
I have no idea. That masking was stolen from your code.
Well, it is perfectly correct in the context of the original code (single cluster, dense ID space), but is utterly wrong as a general implementation.
The model we assume for PSCI is that there is no cluster and that we have enough per-cpu space for up to the maximum cpu ID obtained that way.
I don't think you can rely on this assumption.
We'll have to in order to share code, so this service has to help out.
If you are concerned about signaling a false general applicability of that macro, I can fold it back into the callers or add some comments about restrictions. My plan was just to make the caller site a bit more readable.
I understand the concern, but making this a general macro is very misleading. Most systems with more than 4 cores will have clusters and a very sparse ID space, and these systems are quite common these days.
Computing a CPU number is not a simple task, especially in the absence of a discovery protocol (the Linux kernel relies on DT for that), so I'm afraid you have to either keep this on a per-platform basis, or provide ways to override this macro.
I'll make this another weak function. Unfolding won't work well because it should be used in common psci.S.
Jan

Move parts of sunxi's psci_cpu_off into psci_cpu_off_common, namely cache disabling and flushing, clrex and the disabling of SMP for the dying CPU. These steps are apparently generic for ARMv7 and will be reused for Tegra124 support.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/psci.S | 71 +++++++++++++++++++++++++++++++++++++++++ arch/arm/cpu/armv7/sunxi/psci.S | 63 +----------------------------------- 2 files changed, 72 insertions(+), 62 deletions(-)
diff --git a/arch/arm/cpu/armv7/psci.S b/arch/arm/cpu/armv7/psci.S index bf11a34..d688607 100644 --- a/arch/arm/cpu/armv7/psci.S +++ b/arch/arm/cpu/armv7/psci.S @@ -99,4 +99,75 @@ _smc_psci: pop {r4-r7, lr} movs pc, lr @ Return to the kernel
+/* Imported from Linux kernel */ +LENTRY(v7_flush_dcache_all) + dmb @ ensure ordering with previous memory accesses + mrc p15, 1, r0, c0, c0, 1 @ read clidr + ands r3, r0, #0x7000000 @ extract loc from clidr + mov r3, r3, lsr #23 @ left align loc bit field + beq finished @ if loc is 0, then no need to clean + mov r10, #0 @ start clean at cache level 0 +flush_levels: + add r2, r10, r10, lsr #1 @ work out 3x current cache level + mov r1, r0, lsr r2 @ extract cache type bits from clidr + and r1, r1, #7 @ mask of the bits for current cache only + cmp r1, #2 @ see what cache we have at this level + blt skip @ skip if no cache, or just i-cache + mrs r9, cpsr @ make cssr&csidr read atomic + mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr + isb @ isb to sych the new cssr&csidr + mrc p15, 1, r1, c0, c0, 0 @ read the new csidr + msr cpsr_c, r9 + and r2, r1, #7 @ extract the length of the cache lines + add r2, r2, #4 @ add 4 (line length offset) + ldr r4, =0x3ff + ands r4, r4, r1, lsr #3 @ find maximum number on the way size + clz r5, r4 @ find bit position of way size increment + ldr r7, =0x7fff + ands r7, r7, r1, lsr #13 @ extract max number of the index size +loop1: + mov r9, r7 @ create working copy of max index +loop2: + orr r11, r10, r4, lsl r5 @ factor way and cache number into r11 + orr r11, r11, r9, lsl r2 @ factor index number into r11 + mcr p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way + subs r9, r9, #1 @ decrement the index + bge loop2 + subs r4, r4, #1 @ decrement the way + bge loop1 +skip: + add r10, r10, #2 @ increment cache number + cmp r3, r10 + bgt flush_levels +finished: + mov r10, #0 @ swith back to cache level 0 + mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr + dsb st + isb + bx lr +ENDPROC(v7_flush_dcache_all) + +ENTRY(psci_cpu_off_common) + push {lr} + + mrc p15, 0, r0, c1, c0, 0 @ SCTLR + bic r0, r0, #(1 << 2) @ Clear C bit + mcr p15, 0, r0, c1, c0, 0 @ SCTLR + isb + dsb + + bl v7_flush_dcache_all + + clrex 
@ Why??? + + mrc p15, 0, r0, c1, c0, 1 @ ACTLR + bic r0, r0, #(1 << 6) @ Clear SMP bit + mcr p15, 0, r0, c1, c0, 1 @ ACTLR + isb + dsb + + pop {lr} + bx lr +ENDPROC(psci_cpu_off_common) + .popsection diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 0523217..bb3d4ef 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -200,53 +200,6 @@ psci_cpu_on: _target_pc: .word 0
-/* Imported from Linux kernel */ -v7_flush_dcache_all: - dmb @ ensure ordering with previous memory accesses - mrc p15, 1, r0, c0, c0, 1 @ read clidr - ands r3, r0, #0x7000000 @ extract loc from clidr - mov r3, r3, lsr #23 @ left align loc bit field - beq finished @ if loc is 0, then no need to clean - mov r10, #0 @ start clean at cache level 0 -flush_levels: - add r2, r10, r10, lsr #1 @ work out 3x current cache level - mov r1, r0, lsr r2 @ extract cache type bits from clidr - and r1, r1, #7 @ mask of the bits for current cache only - cmp r1, #2 @ see what cache we have at this level - blt skip @ skip if no cache, or just i-cache - mrs r9, cpsr @ make cssr&csidr read atomic - mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr - isb @ isb to sych the new cssr&csidr - mrc p15, 1, r1, c0, c0, 0 @ read the new csidr - msr cpsr_c, r9 - and r2, r1, #7 @ extract the length of the cache lines - add r2, r2, #4 @ add 4 (line length offset) - ldr r4, =0x3ff - ands r4, r4, r1, lsr #3 @ find maximum number on the way size - clz r5, r4 @ find bit position of way size increment - ldr r7, =0x7fff - ands r7, r7, r1, lsr #13 @ extract max number of the index size -loop1: - mov r9, r7 @ create working copy of max index -loop2: - orr r11, r10, r4, lsl r5 @ factor way and cache number into r11 - orr r11, r11, r9, lsl r2 @ factor index number into r11 - mcr p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way - subs r9, r9, #1 @ decrement the index - bge loop2 - subs r4, r4, #1 @ decrement the way - bge loop1 -skip: - add r10, r10, #2 @ increment cache number - cmp r3, r10 - bgt flush_levels -finished: - mov r10, #0 @ swith back to cache level 0 - mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr - dsb st - isb - bx lr - _sunxi_cpu_entry: @ Set SMP bit mrc p15, 0, r0, c1, c0, 1 @@ -262,21 +215,7 @@ _sunxi_cpu_entry:
.globl psci_cpu_off psci_cpu_off: - mrc p15, 0, r0, c1, c0, 0 @ SCTLR - bic r0, r0, #(1 << 2) @ Clear C bit - mcr p15, 0, r0, c1, c0, 0 @ SCTLR - isb - dsb - - bl v7_flush_dcache_all - - clrex @ Why??? - - mrc p15, 0, r0, c1, c0, 1 @ ACTLR - bic r0, r0, #(1 << 6) @ Clear SMP bit - mcr p15, 0, r0, c1, c0, 1 @ ACTLR - isb - dsb + bl psci_cpu_off_common
@ Ask CPU0 to pull the rug... movw r0, #(GICD_BASE & 0xffff)

On Fri, 27 Feb 2015 13:28:02 +0000 Jan Kiszka jan.kiszka@siemens.com wrote:
Move parts of sunxi's psci_cpu_off into psci_cpu_off_common, namely cache disabling and flushing, clrex and the disabling of SMP for the dying CPU. These steps are apparently generic for ARMv7 and will be reused for Tegra124 support.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com
arch/arm/cpu/armv7/psci.S | 71 +++++++++++++++++++++++++++++++++++++++++ arch/arm/cpu/armv7/sunxi/psci.S | 63 +----------------------------------- 2 files changed, 72 insertions(+), 62 deletions(-)
diff --git a/arch/arm/cpu/armv7/psci.S b/arch/arm/cpu/armv7/psci.S index bf11a34..d688607 100644 --- a/arch/arm/cpu/armv7/psci.S +++ b/arch/arm/cpu/armv7/psci.S @@ -99,4 +99,75 @@ _smc_psci: pop {r4-r7, lr} movs pc, lr @ Return to the kernel +/* Imported from Linux kernel */ +LENTRY(v7_flush_dcache_all)
- dmb @ ensure ordering
with previous memory accesses
- mrc p15, 1, r0, c0, c0, 1 @ read clidr
- ands r3, r0, #0x7000000 @ extract loc
from clidr
- mov r3, r3, lsr #23 @ left
align loc bit field
- beq finished @ if loc is 0,
then no need to clean
- mov r10, #0 @ start
clean at cache level 0 +flush_levels:
- add r2, r10, r10, lsr #1 @ work out 3x
current cache level
- mov r1, r0, lsr r2 @ extract
cache type bits from clidr
- and r1, r1, #7 @ mask of the
bits for current cache only
- cmp r1, #2 @ see what
cache we have at this level
- blt skip @ skip if no
cache, or just i-cache
- mrs r9, cpsr @ make cssr&csidr
read atomic
- mcr p15, 2, r10, c0, c0, 0 @ select
current cache level in cssr
- isb @ isb to sych the
new cssr&csidr
- mrc p15, 1, r1, c0, c0, 0 @ read the
new csidr
- msr cpsr_c, r9
- and r2, r1, #7 @ extract the
length of the cache lines
- add r2, r2, #4 @ add 4 (line
length offset)
- ldr r4, =0x3ff
- ands r4, r4, r1, lsr #3 @ find maximum
number on the way size
- clz r5, r4 @ find bit
position of way size increment
- ldr r7, =0x7fff
- ands r7, r7, r1, lsr #13 @ extract max
number of the index size +loop1:
- mov r9, r7 @ create
working copy of max index +loop2:
- orr r11, r10, r4, lsl r5 @ factor way
and cache number into r11
- orr r11, r11, r9, lsl r2 @ factor
index number into r11
- mcr p15, 0, r11, c7, c14, 2 @ clean &
invalidate by set/way
- subs r9, r9, #1 @ decrement
the index
- bge loop2
- subs r4, r4, #1 @ decrement
the way
- bge loop1
+skip:
- add r10, r10, #2 @ increment
cache number
- cmp r3, r10
- bgt flush_levels
+finished:
- mov r10, #0 @ swith
back to cache level 0
- mcr p15, 2, r10, c0, c0, 0 @ select
current cache level in cssr
- dsb st
- isb
- bx lr
+ENDPROC(v7_flush_dcache_all)
+ENTRY(psci_cpu_off_common)
- push {lr}
- mrc p15, 0, r0, c1, c0, 0 @ SCTLR
- bic r0, r0, #(1 << 2) @ Clear C bit
- mcr p15, 0, r0, c1, c0, 0 @ SCTLR
- isb
- dsb
- bl v7_flush_dcache_all
- clrex @ Why???
- mrc p15, 0, r0, c1, c0, 1 @ ACTLR
- bic r0, r0, #(1 << 6) @ Clear SMP bit
- mcr p15, 0, r0, c1, c0, 1 @ ACTLR
- isb
- dsb
Same thing. Please turn this into something an implementation can override.
- pop {lr}
- bx lr
+ENDPROC(psci_cpu_off_common)
- .popsection
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 0523217..bb3d4ef 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -200,53 +200,6 @@ psci_cpu_on: _target_pc: .word 0
-/* Imported from Linux kernel */ -v7_flush_dcache_all:
- dmb @ ensure ordering
with previous memory accesses
- mrc p15, 1, r0, c0, c0, 1 @ read clidr
- ands r3, r0, #0x7000000 @ extract loc
from clidr
- mov r3, r3, lsr #23 @ left
align loc bit field
- beq finished @ if loc is 0,
then no need to clean
- mov r10, #0 @ start
clean at cache level 0 -flush_levels:
- add r2, r10, r10, lsr #1 @ work out 3x
current cache level
- mov r1, r0, lsr r2 @ extract
cache type bits from clidr
- and r1, r1, #7 @ mask of the
bits for current cache only
- cmp r1, #2 @ see what
cache we have at this level
- blt skip @ skip if no
cache, or just i-cache
- mrs r9, cpsr @ make cssr&csidr
read atomic
- mcr p15, 2, r10, c0, c0, 0 @ select
current cache level in cssr
- isb @ isb to sych the
new cssr&csidr
- mrc p15, 1, r1, c0, c0, 0 @ read the
new csidr
- msr cpsr_c, r9
- and r2, r1, #7 @ extract the
length of the cache lines
- add r2, r2, #4 @ add 4 (line
length offset)
- ldr r4, =0x3ff
- ands r4, r4, r1, lsr #3 @ find maximum
number on the way size
- clz r5, r4 @ find bit
position of way size increment
- ldr r7, =0x7fff
- ands r7, r7, r1, lsr #13 @ extract max
number of the index size -loop1:
- mov r9, r7 @ create
working copy of max index -loop2:
- orr r11, r10, r4, lsl r5 @ factor way
and cache number into r11
- orr r11, r11, r9, lsl r2 @ factor
index number into r11
- mcr p15, 0, r11, c7, c14, 2 @ clean &
invalidate by set/way
- subs r9, r9, #1 @ decrement
the index
- bge loop2
- subs r4, r4, #1 @ decrement
the way
- bge loop1
-skip:
- add r10, r10, #2 @ increment
cache number
- cmp r3, r10
- bgt flush_levels
-finished:
- mov r10, #0 @ swith
back to cache level 0
- mcr p15, 2, r10, c0, c0, 0 @ select
current cache level in cssr
- dsb st
- isb
- bx lr
_sunxi_cpu_entry: @ Set SMP bit mrc p15, 0, r0, c1, c0, 1 @@ -262,21 +215,7 @@ _sunxi_cpu_entry:
.globl psci_cpu_off psci_cpu_off:
- mrc p15, 0, r0, c1, c0, 0 @ SCTLR
- bic r0, r0, #(1 << 2) @ Clear C bit
- mcr p15, 0, r0, c1, c0, 0 @ SCTLR
- isb
- dsb
- bl v7_flush_dcache_all
- clrex @ Why???
- mrc p15, 0, r0, c1, c0, 1 @ ACTLR
- bic r0, r0, #(1 << 6) @ Clear SMP bit
- mcr p15, 0, r0, c1, c0, 1 @ ACTLR
- isb
- dsb
bl psci_cpu_off_common
@ Ask CPU0 to pull the rug... movw r0, #(GICD_BASE & 0xffff)

_sunxi_cpu_entry can be converted completely into a reusable psci_cpu_entry. Tegra124 will use it as well.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/psci.S | 18 ++++++++++++++++++ arch/arm/cpu/armv7/sunxi/psci.S | 20 ++------------------ 2 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/arch/arm/cpu/armv7/psci.S b/arch/arm/cpu/armv7/psci.S index d688607..315c20b 100644 --- a/arch/arm/cpu/armv7/psci.S +++ b/arch/arm/cpu/armv7/psci.S @@ -170,4 +170,22 @@ ENTRY(psci_cpu_off_common) bx lr ENDPROC(psci_cpu_off_common)
+ENTRY(psci_cpu_entry) + @ Set SMP bit + mrc p15, 0, r0, c1, c0, 1 @ ACTLR + orr r0, r0, #(1 << 6) @ Set SMP bit + mcr p15, 0, r0, c1, c0, 1 @ ACTLR + isb + + bl _nonsec_init + + adr r0, _psci_target_pc + ldr r0, [r0] + b _do_nonsec_entry +ENDPROC(psci_cpu_entry) + +.globl _psci_target_pc +_psci_target_pc: + .word 0 + .popsection diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index bb3d4ef..9ea3ce8 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -139,7 +139,7 @@ out: mcr p15, 0, r7, c1, c1, 0 @ r2 = target PC .globl psci_cpu_on psci_cpu_on: - adr r0, _target_pc + ldr r0, =_psci_target_pc str r2, [r0] dsb
@@ -151,7 +151,7 @@ psci_cpu_on: mov r4, #1 lsl r4, r4, r1
- adr r6, _sunxi_cpu_entry + ldr r6, =psci_cpu_entry str r6, [r0, #0x1a4] @ PRIVATE_REG (boot vector)
@ Assert reset on target CPU @@ -197,22 +197,6 @@ psci_cpu_on: mov r0, #ARM_PSCI_RET_SUCCESS @ Return PSCI_RET_SUCCESS mov pc, lr
-_target_pc: - .word 0 - -_sunxi_cpu_entry: - @ Set SMP bit - mrc p15, 0, r0, c1, c0, 1 - orr r0, r0, #0x40 - mcr p15, 0, r0, c1, c0, 1 - isb - - bl _nonsec_init - - adr r0, _target_pc - ldr r0, [r0] - b _do_nonsec_entry - .globl psci_cpu_off psci_cpu_off: bl psci_cpu_off_common

On Fri, 27 Feb 2015 13:28:03 +0000 Jan Kiszka jan.kiszka@siemens.com wrote:
_sunxi_cpu_entry can be converted completely into a reusable psci_cpu_entry. Tegra124 will use it as well.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com
arch/arm/cpu/armv7/psci.S | 18 ++++++++++++++++++ arch/arm/cpu/armv7/sunxi/psci.S | 20 ++------------------ 2 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/arch/arm/cpu/armv7/psci.S b/arch/arm/cpu/armv7/psci.S index d688607..315c20b 100644 --- a/arch/arm/cpu/armv7/psci.S +++ b/arch/arm/cpu/armv7/psci.S @@ -170,4 +170,22 @@ ENTRY(psci_cpu_off_common) bx lr ENDPROC(psci_cpu_off_common)
+ENTRY(psci_cpu_entry)
- @ Set SMP bit
- mrc p15, 0, r0, c1, c0, 1 @ ACTLR
- orr r0, r0, #(1 << 6) @ Set SMP bit
- mcr p15, 0, r0, c1, c0, 1 @ ACTLR
- isb
- bl _nonsec_init
- adr r0, _psci_target_pc
- ldr r0, [r0]
- b _do_nonsec_entry
+ENDPROC(psci_cpu_entry)
I'd add a *big* comment at the top of this. ACTLR is implementation-dependent, and while sticking the SMP bit at this location is fairly common among ARM cores, it is by no means a strong guarantee (this is not an architectural feature).
I'd recommend making it override-able.
+.globl _psci_target_pc +_psci_target_pc:
- .word 0
- .popsection
diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index bb3d4ef..9ea3ce8 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -139,7 +139,7 @@ out: mcr p15, 0, r7, c1, c1, 0 @ r2 = target PC .globl psci_cpu_on psci_cpu_on:
- adr r0, _target_pc
- ldr r0, =_psci_target_pc str r2, [r0] dsb
@@ -151,7 +151,7 @@ psci_cpu_on: mov r4, #1 lsl r4, r4, r1
- adr r6, _sunxi_cpu_entry
ldr r6, =psci_cpu_entry str r6, [r0, #0x1a4] @ PRIVATE_REG (boot vector)
@ Assert reset on target CPU
@@ -197,22 +197,6 @@ psci_cpu_on: mov r0, #ARM_PSCI_RET_SUCCESS @ Return PSCI_RET_SUCCESS mov pc, lr
-_target_pc:
- .word 0
-_sunxi_cpu_entry:
- @ Set SMP bit
- mrc p15, 0, r0, c1, c0, 1
- orr r0, r0, #0x40
- mcr p15, 0, r0, c1, c0, 1
- isb
- bl _nonsec_init
- adr r0, _target_pc
- ldr r0, [r0]
- b _do_nonsec_entry
.globl psci_cpu_off psci_cpu_off: bl psci_cpu_off_common

On Sat, 2015-02-28 at 13:53 +0000, Marc Zyngier wrote:
+ENTRY(psci_cpu_entry)
- @ Set SMP bit
- mrc p15, 0, r0, c1, c0, 1 @ ACTLR
- orr r0, r0, #(1 << 6) @ Set SMP bit
- mcr p15, 0, r0, c1, c0, 1 @ ACTLR
- isb
- bl _nonsec_init
- adr r0, _psci_target_pc
- ldr r0, [r0]
- b _do_nonsec_entry
+ENDPROC(psci_cpu_entry)
I'd add a *big* comment at the top of this. ACTLR is implementation-dependent, and while sticking the SMP bit at this location is fairly common among ARM cores, it is by no means a strong guarantee (this is not an architectural feature).
I'd recommend making it override-able.
Tom R has a series which does something along those lines: https://patchwork.ozlabs.org/patch/431587/
I suppose it ought to be used here too one way or another.
Ian.

This algorithm will be useful on Tegra as well, plus we will need it for making _psci_target_pc per-CPU.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/psci.S | 14 ++++++++++++++ arch/arm/cpu/armv7/sunxi/psci.S | 17 +++++++---------- 2 files changed, 21 insertions(+), 10 deletions(-)
diff --git a/arch/arm/cpu/armv7/psci.S b/arch/arm/cpu/armv7/psci.S index 315c20b..99ce490 100644 --- a/arch/arm/cpu/armv7/psci.S +++ b/arch/arm/cpu/armv7/psci.S @@ -170,6 +170,20 @@ ENTRY(psci_cpu_off_common) bx lr ENDPROC(psci_cpu_off_common)
+@ expects CPU ID in r4 (will be overwritten) and returns stack top in r5 +ENTRY(psci_get_cpu_stack_top) + mov r5, #0x400 @ 1kB of stack per CPU + mul r4, r4, r5 + + ldr r5, =psci_text_end @ end of monitor text + add r5, r5, #0x2000 @ Skip two pages + lsr r5, r5, #12 @ Align to start of page + lsl r5, r5, #12 + sub r5, r5, r4 @ here's our stack! + + bx lr +ENDPROC(psci_get_cpu_stack_top) + ENTRY(psci_cpu_entry) @ Set SMP bit mrc p15, 0, r0, c1, c0, 1 @ ACTLR diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 9ea3ce8..53d5e3f 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -214,6 +214,8 @@ psci_cpu_off:
.globl psci_arch_init psci_arch_init: + mov r6, lr + movw r4, #(GICD_BASE & 0xffff) movt r4, #(GICD_BASE >> 16)
@@ -240,16 +242,11 @@ psci_arch_init: isb
armv7_get_cpu_id r4 - mov r5, #0x400 @ 1kB of stack per CPU - mul r4, r4, r5 - - adr r5, text_end @ end of text - add r5, r5, #0x2000 @ Skip two pages - lsr r5, r5, #12 @ Align to start of page - lsl r5, r5, #12 - sub sp, r5, r4 @ here's our stack! + bl psci_get_cpu_stack_top + mov sp, r5
- bx lr + bx r6
-text_end: + .globl psci_text_end +psci_text_end: .popsection

Use a per-CPU variable for saving the target PC during CPU_ON operations. This allows us to run this service independently on targets that have more than 2 cores and also core-local power control.
CC: Marc Zyngier marc.zyngier@arm.com Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/psci.S | 11 +++++------ arch/arm/cpu/armv7/sunxi/psci.S | 9 ++++++--- 2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/arch/arm/cpu/armv7/psci.S b/arch/arm/cpu/armv7/psci.S index 99ce490..d3c623b 100644 --- a/arch/arm/cpu/armv7/psci.S +++ b/arch/arm/cpu/armv7/psci.S @@ -17,6 +17,7 @@
#include <config.h> #include <linux/linkage.h> +#include <asm/macro.h> #include <asm/psci.h>
.pushsection ._secure.text, "ax" @@ -179,6 +180,7 @@ ENTRY(psci_get_cpu_stack_top) add r5, r5, #0x2000 @ Skip two pages lsr r5, r5, #12 @ Align to start of page lsl r5, r5, #12 + sub r5, r5, #4 @ reserve 1 word for target PC sub r5, r5, r4 @ here's our stack!
bx lr @@ -193,13 +195,10 @@ ENTRY(psci_cpu_entry)
bl _nonsec_init
- adr r0, _psci_target_pc - ldr r0, [r0] + armv7_get_cpu_id r4 + bl psci_get_cpu_stack_top + ldr r0, [r5] @ target PC at stack top b _do_nonsec_entry ENDPROC(psci_cpu_entry)
-.globl _psci_target_pc -_psci_target_pc: - .word 0 - .popsection diff --git a/arch/arm/cpu/armv7/sunxi/psci.S b/arch/arm/cpu/armv7/sunxi/psci.S index 53d5e3f..70fc5f7 100644 --- a/arch/arm/cpu/armv7/sunxi/psci.S +++ b/arch/arm/cpu/armv7/sunxi/psci.S @@ -139,8 +139,11 @@ out: mcr p15, 0, r7, c1, c1, 0 @ r2 = target PC .globl psci_cpu_on psci_cpu_on: - ldr r0, =_psci_target_pc - str r2, [r0] + push {lr} + + mov r4, r1 + bl psci_get_cpu_stack_top @ get stack top of target CPU + str r2, [r5] @ store target PC at stack top dsb
movw r0, #(SUN7I_CPUCFG_BASE & 0xffff) @@ -195,7 +198,7 @@ psci_cpu_on: str r6, [r0, #0x1e4]
mov r0, #ARM_PSCI_RET_SUCCESS @ Return PSCI_RET_SUCCESS - mov pc, lr + pop {pc}
.globl psci_cpu_off psci_cpu_off:

From: Ian Campbell ijc@hellion.org.uk
I will need mc_security_cfg0/1 in a future patch and I added the rest while debugging, so thought I might as well commit them.
Signed-off-by: Ian Campbell ijc@hellion.org.uk Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/include/asm/arch-tegra124/mc.h | 35 +++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-)
diff --git a/arch/arm/include/asm/arch-tegra124/mc.h b/arch/arm/include/asm/arch-tegra124/mc.h index d526dfe..5557732 100644 --- a/arch/arm/include/asm/arch-tegra124/mc.h +++ b/arch/arm/include/asm/arch-tegra124/mc.h @@ -35,9 +35,40 @@ struct mc_ctlr { u32 mc_emem_adr_cfg; /* offset 0x54 */ u32 mc_emem_adr_cfg_dev0; /* offset 0x58 */ u32 mc_emem_adr_cfg_dev1; /* offset 0x5C */ - u32 reserved3[12]; /* offset 0x60 - 0x8C */ + u32 reserved3[4]; /* offset 0x60 - 0x6C */ + u32 mc_security_cfg0; /* offset 0x70 */ + u32 mc_security_cfg1; /* offset 0x74 */ + u32 reserved4[6]; /* offset 0x7C - 0x8C */ u32 mc_emem_arb_reserved[28]; /* offset 0x90 - 0xFC */ - u32 reserved4[338]; /* offset 0x100 - 0x644 */ + u32 reserved5[74]; /* offset 0x100 - 0x224 */ + u32 mc_smmu_translation_enable_0; /* offset 0x228 */ + u32 mc_smmu_translation_enable_1; /* offset 0x22C */ + u32 mc_smmu_translation_enable_2; /* offset 0x230 */ + u32 mc_smmu_translation_enable_3; /* offset 0x234 */ + u32 mc_smmu_afi_asid; /* offset 0x238 */ + u32 mc_smmu_avpc_asid; /* offset 0x23C */ + u32 mc_smmu_dc_asid; /* offset 0x240 */ + u32 mc_smmu_dcb_asid; /* offset 0x244 */ + u32 reserved6[2]; /* offset 0x248 - 0x24C */ + u32 mc_smmu_hc_asid; /* offset 0x250 */ + u32 mc_smmu_hda_asid; /* offset 0x254 */ + u32 mc_smmu_isp2_asid; /* offset 0x258 */ + u32 reserved7[2]; /* offset 0x25C - 0x260 */ + u32 mc_smmu_msenc_asid; /* offset 0x264 */ + u32 mc_smmu_nv_asid; /* offset 0x268 */ + u32 mc_smmu_nv2_asid; /* offset 0x26C */ + u32 mc_smmu_ppcs_asid; /* offset 0x270 */ + u32 mc_smmu_sata_asid; /* offset 0x274 */ + u32 reserved8[1]; /* offset 0x278 */ + u32 mc_smmu_vde_asid; /* offset 0x27C */ + u32 mc_smmu_vi_asid; /* offset 0x280 */ + u32 mc_smmu_vic_asid; /* offset 0x284 */ + u32 mc_smmu_xusb_host_asid; /* offset 0x288 */ + u32 mc_smmu_xusb_dev_asid; /* offset 0x28C */ + u32 reserved9[1]; /* offset 0x290 */ + u32 mc_smmu_tsec_asid; /* offset 0x294 */ + u32 mc_smmu_ppcs1_asid; /* offset 0x298 */ + u32 reserved10[235]; 
/* offset 0x29C - 0x644 */ u32 mc_video_protect_bom; /* offset 0x648 */ u32 mc_video_protect_size_mb; /* offset 0x64c */ u32 mc_video_protect_reg_ctrl; /* offset 0x650 */

In this case the secure code lives in RAM, and hence the memory node in the device tree needs to be adjusted. This prevents the OS from mapping and possibly accessing the reservation.
Add support for setting CONFIG_ARMV7_SECURE_RESERVE_SIZE to carve out such a region. For simplicity, we only support cutting off memory from the beginning or the end of a RAM bank, as we do not want to increase the number of banks (which would happen when punching a hole).
This will be used in a subsequent patch for Jetson-TK1.
Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/virt-dt.c | 29 +++++++++++++++++++++++++++++ arch/arm/include/asm/armv7.h | 1 + arch/arm/lib/bootm-fdt.c | 5 +++++ 3 files changed, 35 insertions(+)
diff --git a/arch/arm/cpu/armv7/virt-dt.c b/arch/arm/cpu/armv7/virt-dt.c index ad19e4c..06edeec 100644 --- a/arch/arm/cpu/armv7/virt-dt.c +++ b/arch/arm/cpu/armv7/virt-dt.c @@ -16,6 +16,7 @@ */
#include <common.h> +#include <errno.h> #include <stdio_dev.h> #include <linux/ctype.h> #include <linux/types.h> @@ -88,6 +89,34 @@ static int fdt_psci(void *fdt) return 0; }
+int armv7_apply_memory_carveout(u64 *start, u64 *size) +{ +#ifdef CONFIG_ARMV7_SECURE_RESERVE_SIZE + if (*start + *size < CONFIG_ARMV7_SECURE_BASE || + *start >= (u64)CONFIG_ARMV7_SECURE_BASE + + CONFIG_ARMV7_SECURE_RESERVE_SIZE) + return 0; + + /* carveout must be at the beginning or the end of the bank */ + if (*start == CONFIG_ARMV7_SECURE_BASE || + *start + *size == (u64)CONFIG_ARMV7_SECURE_BASE + + CONFIG_ARMV7_SECURE_RESERVE_SIZE) { + if (*size < CONFIG_ARMV7_SECURE_RESERVE_SIZE) { + debug("Secure monitor larger than RAM bank!?\n"); + return -EINVAL; + } + *size -= CONFIG_ARMV7_SECURE_RESERVE_SIZE; + if (*start == CONFIG_ARMV7_SECURE_BASE) + *start += CONFIG_ARMV7_SECURE_RESERVE_SIZE; + return 0; + } + debug("Secure monitor not located at beginning or end of RAM bank\n"); + return -EINVAL; +#else /* !CONFIG_ARMV7_SECURE_RESERVE_SIZE */ + return 0; +#endif +} + int armv7_update_dt(void *fdt) { if (!armv7_boot_nonsec()) diff --git a/arch/arm/include/asm/armv7.h b/arch/arm/include/asm/armv7.h index a13da23..e06dfc9 100644 --- a/arch/arm/include/asm/armv7.h +++ b/arch/arm/include/asm/armv7.h @@ -79,6 +79,7 @@ void v7_outer_cache_inval_range(u32 start, u32 end); #if defined(CONFIG_ARMV7_NONSEC) || defined(CONFIG_ARMV7_VIRT)
int armv7_init_nonsec(void); +int armv7_apply_memory_carveout(u64 *start, u64 *size); int armv7_update_dt(void *fdt); bool armv7_boot_nonsec(void);
diff --git a/arch/arm/lib/bootm-fdt.c b/arch/arm/lib/bootm-fdt.c index d4f1578..7b88739 100644 --- a/arch/arm/lib/bootm-fdt.c +++ b/arch/arm/lib/bootm-fdt.c @@ -31,6 +31,11 @@ int arch_fixup_fdt(void *blob) for (bank = 0; bank < CONFIG_NR_DRAM_BANKS; bank++) { start[bank] = bd->bi_dram[bank].start; size[bank] = bd->bi_dram[bank].size; +#if defined(CONFIG_ARMV7_NONSEC) || defined(CONFIG_ARMV7_VIRT) + ret = armv7_apply_memory_carveout(&start[bank], &size[bank]); + if (ret) + return ret; +#endif }
ret = fdt_fixup_memory_banks(blob, start, size, CONFIG_NR_DRAM_BANKS);

Will be used for unpowergating CPUs.
Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/include/asm/arch-tegra/powergate.h | 1 + arch/arm/mach-tegra/powergate.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/arm/include/asm/arch-tegra/powergate.h b/arch/arm/include/asm/arch-tegra/powergate.h index 130b58b..2e491f1 100644 --- a/arch/arm/include/asm/arch-tegra/powergate.h +++ b/arch/arm/include/asm/arch-tegra/powergate.h @@ -33,6 +33,7 @@ enum tegra_powergate {
int tegra_powergate_sequence_power_up(enum tegra_powergate id, enum periph_id periph); +int tegra_powergate_power_on(enum tegra_powergate id); int tegra_powergate_power_off(enum tegra_powergate id);
#endif diff --git a/arch/arm/mach-tegra/powergate.c b/arch/arm/mach-tegra/powergate.c index 439cff3..6331cd4 100644 --- a/arch/arm/mach-tegra/powergate.c +++ b/arch/arm/mach-tegra/powergate.c @@ -44,7 +44,7 @@ static int tegra_powergate_set(enum tegra_powergate id, bool state) return -ETIMEDOUT; }
-static int tegra_powergate_power_on(enum tegra_powergate id) +int tegra_powergate_power_on(enum tegra_powergate id) { return tegra_powergate_set(id, true); }

This function will be used to initialize CPU power management for Tegra SOCs. For now it does nothing.
Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/include/asm/arch-tegra/ap.h | 5 +++++ board/nvidia/common/board.c | 4 ++++ 2 files changed, 9 insertions(+)
diff --git a/arch/arm/include/asm/arch-tegra/ap.h b/arch/arm/include/asm/arch-tegra/ap.h index 5c8be94..208db90 100644 --- a/arch/arm/include/asm/arch-tegra/ap.h +++ b/arch/arm/include/asm/arch-tegra/ap.h @@ -63,6 +63,11 @@ int tegra_get_chip(void); */ int tegra_get_sku_info(void);
+/** + * Initialize power management for application processors + */ +void ap_pm_init(void); + /* Do any chip-specific cache config */ void config_cache(void);
diff --git a/board/nvidia/common/board.c b/board/nvidia/common/board.c index 80ef8fd..c62b3da 100644 --- a/board/nvidia/common/board.c +++ b/board/nvidia/common/board.c @@ -21,6 +21,7 @@ #include <asm/arch/pwm.h> #endif #include <asm/arch/tegra.h> +#include <asm/arch-tegra/ap.h> #include <asm/arch-tegra/board.h> #include <asm/arch-tegra/clk_rst.h> #include <asm/arch-tegra/pmc.h> @@ -56,6 +57,7 @@ const struct tegra_sysinfo sysinfo = { CONFIG_TEGRA_BOARD_STRING };
+__weak void ap_pm_init(void) {} __weak void pinmux_init(void) {} __weak void pin_mux_usb(void) {} __weak void pin_mux_spi(void) {} @@ -96,6 +98,8 @@ int board_init(void) clock_init(); clock_verify();
+ ap_pm_init(); + #ifdef CONFIG_TEGRA_SPI pin_mux_spi(); #endif

This is based on Thierry Reding's work and uses Ian Campbell's preparatory patches. It comes with full support for CPU_ON/OFF PSCI services. The algorithm used in this version for turning CPUs on and off was proposed by Peter De Schrijver and Thierry Reding in http://thread.gmane.org/gmane.comp.boot-loaders.u-boot/210881. It consists of first enabling CPU1..3 via the PMC, just to powergate them again with the help of the Flow Controller. Once the Flow Controller is in place, we can leave the PMC alone while processing CPU_ON and CPU_OFF PSCI requests.
Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/include/asm/arch-tegra124/flow.h | 6 ++ arch/arm/mach-tegra/Makefile | 4 ++ arch/arm/mach-tegra/psci.S | 101 ++++++++++++++++++++++++++++++ arch/arm/mach-tegra/tegra124/Makefile | 4 ++ arch/arm/mach-tegra/tegra124/ap.c | 55 ++++++++++++++++ 5 files changed, 170 insertions(+) create mode 100644 arch/arm/mach-tegra/psci.S create mode 100644 arch/arm/mach-tegra/tegra124/ap.c
diff --git a/arch/arm/include/asm/arch-tegra124/flow.h b/arch/arm/include/asm/arch-tegra124/flow.h index 0db1881..d6f515f 100644 --- a/arch/arm/include/asm/arch-tegra124/flow.h +++ b/arch/arm/include/asm/arch-tegra124/flow.h @@ -37,4 +37,10 @@ struct flow_ctlr { /* FLOW_CTLR_CLUSTER_CONTROL_0 0x2c */ #define ACTIVE_LP (1 << 0)
+/* CPUn_CSR_0 */ +#define CSR_ENABLE (1 << 0) +#define CSR_IMMEDIATE_WAKE (1 << 3) +#define CSR_WAIT_WFI_SHIFT 8 +#define CSR_PWR_OFF_STS (1 << 16) + #endif /* _TEGRA124_FLOW_H_ */ diff --git a/arch/arm/mach-tegra/Makefile b/arch/arm/mach-tegra/Makefile index 04cef0a..0779086 100644 --- a/arch/arm/mach-tegra/Makefile +++ b/arch/arm/mach-tegra/Makefile @@ -25,6 +25,10 @@ obj-y += xusb-padctl.o obj-$(CONFIG_DISPLAY_CPUINFO) += sys_info.o obj-$(CONFIG_TEGRA124) += vpr.o
+ifndef CONFIG_SPL_BUILD +obj-$(CONFIG_ARMV7_PSCI) += psci.o +endif + obj-$(CONFIG_TEGRA20) += tegra20/ obj-$(CONFIG_TEGRA30) += tegra30/ obj-$(CONFIG_TEGRA114) += tegra114/ diff --git a/arch/arm/mach-tegra/psci.S b/arch/arm/mach-tegra/psci.S new file mode 100644 index 0000000..8b69abc --- /dev/null +++ b/arch/arm/mach-tegra/psci.S @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2014, NVIDIA + * Copyright (C) 2015, Siemens AG + * + * Authors: + * Thierry Reding treding@nvidia.com + * Jan Kiszka jan.kiszka@siemens.com + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#include <linux/linkage.h> +#include <asm/macro.h> +#include <asm/psci.h> + + .pushsection ._secure.text, "ax" + .arch_extension sec + +#define TEGRA_SB_CSR_0 0x6000c200 +#define NS_RST_VEC_WR_DIS (1 << 1) + +#define TEGRA_RESET_EXCEPTION_VECTOR 0x6000f100 + +#define TEGRA_FLOW_CTRL_BASE 0x60007000 +#define FLOW_CTRL_CPU_CSR 0x08 +#define CSR_ENABLE (1 << 0) +#define CSR_IMMEDIATE_WAKE (1 << 3) +#define CSR_WAIT_WFI_SHIFT 8 +#define FLOW_CTRL_CPU1_CSR 0x18 + +@ converts CPU ID into FLOW_CTRL_CPUn_CSR offset +.macro get_csr_reg cpu, ofs, tmp + cmp \cpu, #0 @ CPU0? 
+ lsl \tmp, \cpu, #3 @ multiple by 8 (register offset CPU1-3) + moveq \ofs, #FLOW_CTRL_CPU_CSR + addne \ofs, \tmp, #FLOW_CTRL_CPU1_CSR - 8 +.endm + +ENTRY(psci_arch_init) + mov r6, lr + + mrc p15, 0, r5, c1, c1, 0 @ Read SCR + bic r5, r5, #1 @ Secure mode + mcr p15, 0, r5, c1, c1, 0 @ Write SCR + isb + + @ lock reset vector for non-secure + ldr r4, =TEGRA_SB_CSR_0 + ldr r5, [r4] + orr r5, r5, #NS_RST_VEC_WR_DIS + str r5, [r4] + + armv7_get_cpu_id r4 + bl psci_get_cpu_stack_top + mov sp, r5 + + bx r6 +ENDPROC(psci_arch_init) + +ENTRY(psci_cpu_off) + bl psci_cpu_off_common + + armv7_get_cpu_id r1 + + get_csr_reg r1, r2, r3 + + ldr r6, =TEGRA_FLOW_CTRL_BASE + mov r5, #(CSR_ENABLE) + mov r4, #(1 << CSR_WAIT_WFI_SHIFT) + add r5, r4, lsl r1 + str r5, [r6, r2] + +_loop: wfi + b _loop +ENDPROC(psci_cpu_off) + +ENTRY(psci_cpu_on) + push {lr} + + mov r4, r1 + bl psci_get_cpu_stack_top @ get stack top of target CPU + str r2, [r5] @ store target PC at stack top + dsb + + ldr r6, =TEGRA_RESET_EXCEPTION_VECTOR + ldr r5, =psci_cpu_entry + str r5, [r6] + + get_csr_reg r1, r2, r3 + + ldr r6, =TEGRA_FLOW_CTRL_BASE + mov r5, #(CSR_IMMEDIATE_WAKE | CSR_ENABLE) + str r5, [r6, r2] + + mov r0, #ARM_PSCI_RET_SUCCESS @ Return PSCI_RET_SUCCESS + pop {pc} +ENDPROC(psci_cpu_on) + + .globl psci_text_end +psci_text_end: + .popsection diff --git a/arch/arm/mach-tegra/tegra124/Makefile b/arch/arm/mach-tegra/tegra124/Makefile index ef2da29..d19ddb3 100644 --- a/arch/arm/mach-tegra/tegra124/Makefile +++ b/arch/arm/mach-tegra/tegra124/Makefile @@ -11,3 +11,7 @@ obj-y += clock.o obj-y += funcmux.o obj-y += pinmux.o obj-y += xusb-padctl.o + +ifndef CONFIG_SPL_BUILD +obj-$(CONFIG_ARMV7_PSCI) += ap.o +endif diff --git a/arch/arm/mach-tegra/tegra124/ap.c b/arch/arm/mach-tegra/tegra124/ap.c new file mode 100644 index 0000000..d729c16 --- /dev/null +++ b/arch/arm/mach-tegra/tegra124/ap.c @@ -0,0 +1,55 @@ +/* + * (C) Copyright 2015, Siemens AG + * Author: Jan Kiszka jan.kiszka@siemens.com + * + * 
SPDX-License-Identifier: GPL-2.0+ + */ + +#include <common.h> +#include <asm/io.h> +#include <asm/arch/flow.h> +#include <asm/arch/powergate.h> +#include <asm/arch-tegra/ap.h> +#include <asm/arch-tegra/pmc.h> + +static void park_cpu(void) +{ + while (1) + asm volatile("wfi"); +} + +void ap_pm_init(void) +{ + struct flow_ctlr *flow = (struct flow_ctlr *)NV_PA_FLOW_BASE; + + writel((u32)park_cpu, EXCEP_VECTOR_CPU_RESET_VECTOR); + + /* + * The naturally expected order of putting these CPUs under Flow + * Controller regime would be + * - configure the Flow Controller + * - power up the CPUs + * - wait for the CPUs to hit wfi and be powered down again + * + * However, this doesn't work in practice. We rather need to power them + * up first and park them in wfi. While they are waiting there, we can + * indeed program the Flow Controller to powergate them on wfi, which + * will then happen immediately as they are already in that state. + */ + tegra_powergate_power_on(TEGRA_POWERGATE_CPU1); + tegra_powergate_power_on(TEGRA_POWERGATE_CPU2); + tegra_powergate_power_on(TEGRA_POWERGATE_CPU3); + + writel((2 << CSR_WAIT_WFI_SHIFT) | CSR_ENABLE, &flow->cpu1_csr); + writel((4 << CSR_WAIT_WFI_SHIFT) | CSR_ENABLE, &flow->cpu2_csr); + writel((8 << CSR_WAIT_WFI_SHIFT) | CSR_ENABLE, &flow->cpu3_csr); + + writel(EVENT_MODE_STOP, &flow->halt_cpu1_events); + writel(EVENT_MODE_STOP, &flow->halt_cpu2_events); + writel(EVENT_MODE_STOP, &flow->halt_cpu3_events); + + while (!(readl(&flow->cpu1_csr) & CSR_PWR_OFF_STS) || + !(readl(&flow->cpu2_csr) & CSR_PWR_OFF_STS) || + !(readl(&flow->cpu3_csr) & CSR_PWR_OFF_STS)) + /* wait */; +}

From: Ian Campbell ijc@hellion.org.uk
The secure world code is relocated to the MB just below the top of 4G. We reserve it in the FDT (by setting CONFIG_ARMV7_SECURE_RESERVE_SIZE), but it is not protected in h/w. See next patch.
Signed-off-by: Ian Campbell ijc@hellion.org.uk Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/mach-tegra/tegra124/Kconfig | 2 ++ include/configs/jetson-tk1.h | 5 +++++ 2 files changed, 7 insertions(+)
diff --git a/arch/arm/mach-tegra/tegra124/Kconfig b/arch/arm/mach-tegra/tegra124/Kconfig index 88f627c..5114299 100644 --- a/arch/arm/mach-tegra/tegra124/Kconfig +++ b/arch/arm/mach-tegra/tegra124/Kconfig @@ -5,6 +5,8 @@ choice
config TARGET_JETSON_TK1 bool "NVIDIA Tegra124 Jetson TK1 board" + select CPU_V7_HAS_NONSEC if !SPL_BUILD + select CPU_V7_HAS_VIRT if !SPL_BUILD
config TARGET_NYAN_BIG bool "Google/NVIDIA Nyan-big Chrombook" diff --git a/include/configs/jetson-tk1.h b/include/configs/jetson-tk1.h index 0a79c7c..80c2952 100644 --- a/include/configs/jetson-tk1.h +++ b/include/configs/jetson-tk1.h @@ -81,4 +81,9 @@ #include "tegra-common-usb-gadget.h" #include "tegra-common-post.h"
+#define CONFIG_ARMV7_PSCI 1 +/* Reserve top 1M for secure RAM */ +#define CONFIG_ARMV7_SECURE_BASE 0xfff00000 +#define CONFIG_ARMV7_SECURE_RESERVE_SIZE 0x00100000 + #endif /* __CONFIG_H */

From: Ian Campbell ijc@hellion.org.uk
These registers can be used to prevent the non-secure world from accessing a megabyte-aligned region of RAM. Use them to protect the U-Boot secure monitor code.
At first I tried to do this from s_init(); however, this inexplicably caused U-Boot's networking (e.g. DHCP) to fail, while networking under Linux was fine.
So instead I have added a new weak arch function protect_secure_section() called from relocate_secure_section() and reserved the region there. This is better overall since it defers the reservation until after the sec vs. non-sec decision (which can be influenced by an envvar) has been made when booting the os.
Signed-off-by: Ian Campbell ijc@hellion.org.uk [Jan: tiny style adjustment] Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/cpu/armv7/virt-v7.c | 5 +++++ arch/arm/include/asm/system.h | 1 + arch/arm/mach-tegra/ap.c | 15 +++++++++++++++ 3 files changed, 21 insertions(+)
diff --git a/arch/arm/cpu/armv7/virt-v7.c b/arch/arm/cpu/armv7/virt-v7.c index b69fd37..eb6195c 100644 --- a/arch/arm/cpu/armv7/virt-v7.c +++ b/arch/arm/cpu/armv7/virt-v7.c @@ -46,6 +46,10 @@ static unsigned long get_gicd_base_address(void) #endif }
+/* Define a specific version of this function to enable any available + * hardware protections for the reserved region */ +void __weak protect_secure_section(void) {} + static void relocate_secure_section(void) { #ifdef CONFIG_ARMV7_SECURE_BASE @@ -54,6 +58,7 @@ static void relocate_secure_section(void) memcpy((void *)CONFIG_ARMV7_SECURE_BASE, __secure_start, sz); flush_dcache_range(CONFIG_ARMV7_SECURE_BASE, CONFIG_ARMV7_SECURE_BASE + sz + 1); + protect_secure_section(); invalidate_icache_all(); #endif } diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h index 2a5bed2..d6dfddd 100644 --- a/arch/arm/include/asm/system.h +++ b/arch/arm/include/asm/system.h @@ -77,6 +77,7 @@ void armv8_switch_to_el1(void); void gic_init(void); void gic_send_sgi(unsigned long sgino); void wait_for_wakeup(void); +void protect_secure_region(void); void smp_kick_all_cpus(void);
void flush_l3_cache(void); diff --git a/arch/arm/mach-tegra/ap.c b/arch/arm/mach-tegra/ap.c index a17dfd1..869a2ed 100644 --- a/arch/arm/mach-tegra/ap.c +++ b/arch/arm/mach-tegra/ap.c @@ -10,6 +10,7 @@ #include <common.h> #include <asm/io.h> #include <asm/arch/gp_padctrl.h> +#include <asm/arch/mc.h> #include <asm/arch-tegra/ap.h> #include <asm/arch-tegra/clock.h> #include <asm/arch-tegra/fuse.h> @@ -154,6 +155,20 @@ static void init_pmc_scratch(void) writel(odmdata, &pmc->pmc_scratch20); }
+#ifdef CONFIG_ARMV7_SECURE_RESERVE_SIZE +void protect_secure_section(void) +{ + struct mc_ctlr *mc = (struct mc_ctlr *)NV_PA_MC_BASE; + + /* Must be MB aligned */ + BUILD_BUG_ON(CONFIG_ARMV7_SECURE_BASE & 0xFFFFF); + BUILD_BUG_ON(CONFIG_ARMV7_SECURE_RESERVE_SIZE & 0xFFFFF); + + writel(CONFIG_ARMV7_SECURE_BASE, &mc->mc_security_cfg0); + writel(CONFIG_ARMV7_SECURE_RESERVE_SIZE >> 20, &mc->mc_security_cfg1); +} +#endif + void s_init(void) { /* Init PMC scratch memory */

We only set CNTFRQ in arch_timer_init for the boot CPU. But this has to happen for all cores.
Fixing this resolves problems that KVM has with emulating the generic timer/counter.
Signed-off-by: Jan Kiszka jan.kiszka@siemens.com --- arch/arm/mach-tegra/psci.S | 13 +++++++++++++ 1 file changed, 13 insertions(+)
diff --git a/arch/arm/mach-tegra/psci.S b/arch/arm/mach-tegra/psci.S index 8b69abc..ba1664b 100644 --- a/arch/arm/mach-tegra/psci.S +++ b/arch/arm/mach-tegra/psci.S @@ -51,12 +51,25 @@ ENTRY(psci_arch_init) str r5, [r4]
armv7_get_cpu_id r4 + + adr r5, _sys_clock_freq + cmp r4, #0 + + mrceq p15, 0, r7, c14, c0, 0 @ read CNTFRQ from CPU0 + streq r7, [r5] + + ldrne r7, [r5] + mcrne p15, 0, r7, c14, c0, 0 @ write CNTFRQ to CPU1..3 + bl psci_get_cpu_stack_top mov sp, r5
bx r6 ENDPROC(psci_arch_init)
+_sys_clock_freq: + .word 0 + ENTRY(psci_cpu_off) bl psci_cpu_off_common

On Fri, 2015-02-27 at 14:27 +0100, Jan Kiszka wrote:
CC: Ian Campbell ijc@hellion.org.uk
I've been running with these on my Jetson (and booting Xen on top) just fine. So, FWIW:
Tested-by: Ian Campbell ijc@hellion.org.uk
Cheers, Ian.

On 2015-03-08 21:08, Ian Campbell wrote:
On Fri, 2015-02-27 at 14:27 +0100, Jan Kiszka wrote:
CC: Ian Campbell ijc@hellion.org.uk
I've been running with these on my Jetson (and booting Xen on top) just fine. So, FWIW:
Tested-by: Ian Campbell ijc@hellion.org.uk
Great, thanks! I'll roll out v5 soon to address the pending review comments. Would be perfect if you could test that as well.
Jan
participants (3)
-
Ian Campbell
-
Jan Kiszka
-
Marc Zyngier