[U-Boot] [Patch v1 1/4] ARMv8/ls2085a: Fix CPU_RELEASE_ADDR

From: Arnab Basu <arnab.basu@freescale.com>
Remove the trailing UL from CONFIG_SYS_SDRAM_BASE so it can be used from assembly. Fix CPU_RELEASE_ADDR to use the beginning of SDRAM.
Signed-off-by: Arnab Basu <arnab.basu@freescale.com>
Signed-off-by: York Sun <yorksun@freescale.com>
---
This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
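For context on why the suffix matters: config headers like this one are included from both C and assembly sources, and GAS rejects C literal suffixes. A minimal sketch of the alternative idiom (illustrative only, assuming a header shared via #include <config.h>; the patch simply drops the suffix):

	/* Sketch, not part of this patch: a constant usable from both C
	 * and assembly when the suffix must be kept for C code. */
	#ifdef __ASSEMBLY__
	#define CONFIG_SYS_DDR_SDRAM_BASE	0x80000000	/* no suffix for GAS */
	#else
	#define CONFIG_SYS_DDR_SDRAM_BASE	0x80000000UL	/* typed literal for C */
	#endif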
include/configs/ls2085a_common.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/include/configs/ls2085a_common.h b/include/configs/ls2085a_common.h
index 2bd5a47..49e2971 100644
--- a/include/configs/ls2085a_common.h
+++ b/include/configs/ls2085a_common.h
@@ -45,14 +45,18 @@
#define CONFIG_SYS_FSL_DDR_INTLV_256B /* force 256 byte interleaving */
-/* SMP Definitions */
-#define CPU_RELEASE_ADDR	CONFIG_SYS_INIT_SP_ADDR
-
-#define CONFIG_SYS_DDR_SDRAM_BASE	0x80000000UL
+#define CONFIG_SYS_DDR_SDRAM_BASE	0x80000000
 #define CONFIG_SYS_FSL_DDR_SDRAM_BASE_PHY	0
 #define CONFIG_SYS_SDRAM_BASE	CONFIG_SYS_DDR_SDRAM_BASE
 #define CONFIG_SYS_DDR_BLOCK2_BASE	0x8080000000ULL
+/*
+ * SMP Definitions
+ * Spin table is at the beginning of boot page.
+ */
+#define SECONDARY_CPU_BOOT_PAGE	(CONFIG_SYS_SDRAM_BASE)
+#define CPU_RELEASE_ADDR	SECONDARY_CPU_BOOT_PAGE
+
 /* Generic Timer Definitions */
 #define COUNTER_FREQUENCY	12000000	/* 12MHz */

Secondary cores need to be released from holdoff via the boot release registers. With the GPP bootrom, they can boot from main memory directly. An individual spin table is used for each core. If a single release address is needed, defining the macro CONFIG_FSL_SMP_RELEASE_ALL makes all cores use CPU_RELEASE_ADDR. The spin table and boot page are reserved in the device tree so the OS won't overwrite them.
Signed-off-by: York Sun <yorksun@freescale.com>
Signed-off-by: Arnab Basu <arnab.basu@freescale.com>
---
This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
arch/arm/cpu/armv8/fsl-lsch3/Makefile | 2 + arch/arm/cpu/armv8/fsl-lsch3/cpu.c | 13 ++ arch/arm/cpu/armv8/fsl-lsch3/cpu.h | 1 + arch/arm/cpu/armv8/fsl-lsch3/fdt.c | 56 +++++++ arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S | 119 +++++++++++--- arch/arm/cpu/armv8/fsl-lsch3/mp.c | 171 +++++++++++++++++++++ arch/arm/cpu/armv8/fsl-lsch3/mp.h | 36 +++++ arch/arm/cpu/armv8/transition.S | 63 +------- arch/arm/include/asm/arch-fsl-lsch3/config.h | 3 +- arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h | 35 +++++ arch/arm/include/asm/macro.h | 81 ++++++++++ arch/arm/lib/gic_64.S | 10 +- common/board_f.c | 2 +- 13 files changed, 502 insertions(+), 90 deletions(-) create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h
diff --git a/arch/arm/cpu/armv8/fsl-lsch3/Makefile b/arch/arm/cpu/armv8/fsl-lsch3/Makefile index 9249537..f920eeb 100644 --- a/arch/arm/cpu/armv8/fsl-lsch3/Makefile +++ b/arch/arm/cpu/armv8/fsl-lsch3/Makefile @@ -7,3 +7,5 @@ obj-y += cpu.o obj-y += lowlevel.o obj-y += speed.o +obj-$(CONFIG_MP) += mp.o +obj-$(CONFIG_OF_LIBFDT) += fdt.o diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c index c129d03..47b947f 100644 --- a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c @@ -11,6 +11,7 @@ #include <asm/io.h> #include <asm/arch-fsl-lsch3/immap_lsch3.h> #include "cpu.h" +#include "mp.h" #include "speed.h" #include <fsl_mc.h>
@@ -434,3 +435,15 @@ int cpu_eth_init(bd_t *bis) #endif return error; } + + +int arch_early_init_r(void) +{ + int rv; + rv = fsl_lsch3_wake_seconday_cores(); + + if (rv) + printf("Did not wake secondary cores\n"); + + return 0; +} diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.h b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h index 28544d7..2e3312b 100644 --- a/arch/arm/cpu/armv8/fsl-lsch3/cpu.h +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h @@ -5,3 +5,4 @@ */
int fsl_qoriq_core_to_cluster(unsigned int core); +u32 cpu_mask(void); diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c new file mode 100644 index 0000000..cd34e16 --- /dev/null +++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c @@ -0,0 +1,56 @@ +/* + * Copyright 2014 Freescale Semiconductor, Inc. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#include <common.h> +#include <libfdt.h> +#include <fdt_support.h> +#include "mp.h" + +#ifdef CONFIG_MP +void ft_fixup_cpu(void *blob) +{ + int off; + __maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr(); + u64 *reg; + u64 val; + + off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4); + while (off != -FDT_ERR_NOTFOUND) { + reg = (u64 *)fdt_getprop(blob, off, "reg", 0); + if (reg) { + val = spin_tbl_addr; +#ifndef CONFIG_FSL_SMP_RELEASE_ALL + val += id_to_core(fdt64_to_cpu(*reg)) * SIZE_BOOT_ENTRY; +#endif + val = cpu_to_fdt64(val); + fdt_setprop_string(blob, off, "enable-method", + "spin-table"); + fdt_setprop(blob, off, "cpu-release-addr", + &val, sizeof(val)); + } else { + puts("cpu NULL\n"); + } + off = fdt_node_offset_by_prop_value(blob, off, "device_type", + "cpu", 4); + } + /* + * Boot page and spin table can be reserved here if not done staticlly + * in device tree. + * + * fdt_add_mem_rsv(blob, bootpg, + * *((u64 *)&(__secondary_boot_page_size))); + * If defined CONFIG_FSL_SMP_RELEASE_ALL, the release address should + * also be reserved. + */ +} +#endif + +void ft_cpu_setup(void *blob, bd_t *bd) +{ +#ifdef CONFIG_MP + ft_fixup_cpu(blob); +#endif +} diff --git a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S index b4720ae..162d3d6 100644 --- a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S +++ b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S @@ -8,7 +8,9 @@
#include <config.h> #include <linux/linkage.h> +#include <asm/gic.h> #include <asm/macro.h> +#include "mp.h"
ENTRY(lowlevel_init) mov x29, lr /* Save LR */ @@ -37,26 +39,12 @@ ENTRY(lowlevel_init)
branch_if_master x0, x1, 1f
- /* - * Slave should wait for master clearing spin table. - * This sync prevent salves observing incorrect - * value of spin table and jumping to wrong place. - */ -#if defined(CONFIG_GICV2) || defined(CONFIG_GICV3) -#ifdef CONFIG_GICV2 - ldr x0, =GICC_BASE -#endif - bl gic_wait_for_interrupt -#endif - - /* - * All processors will enter EL2 and optionally EL1. - */ - bl armv8_switch_to_el2 -#ifdef CONFIG_ARMV8_SWITCH_TO_EL1 - bl armv8_switch_to_el1 -#endif - b 2f + ldr x0, =(SECONDARY_CPU_BOOT_PAGE) + ldr x1, =secondary_boot_func + ldr x2, =secondary_boot_page + sub x1, x1, x2 + add x0, x0, x1 + blr x0
1: /* Set Non Secure access for all devices protected via TZPC */ @@ -119,3 +107,94 @@ ENTRY(lowlevel_init) mov lr, x29 /* Restore LR */ ret ENDPROC(lowlevel_init) + + /* Keep literals not used by the secondary boot page outside it */ + .ltorg + + .align 4 + .global secondary_boot_page +secondary_boot_page: + .global __spin_table +__spin_table: + .space CONFIG_MAX_CPUS*ENTRY_SIZE + + .align 4 + /* Secondary Boot Page starts here */ +ENTRY(secondary_boot_func) + /* + * PIR calculation from MPIDR_EL1 + * MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1) + * MPIDR[7:2] = AFF0_RES + * MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3) + * MPIDR[23:16] = AFF2_CLUSTERID + * MPIDR[24] = MT + * MPIDR[29:25] =RES + * MPIDR[30] = U + * MPIDR[31] = ME + * MPIDR[39:32] = AFF3 + * We only use AFF0_CPUID and AFF1_CLUSTERID for now + * until AFF2_CLUSTERID and AFF3 have non-zero values. + */ + mrs x0, mpidr_el1 + ubfm x1, x0, #8, #15 + ubfm x2, x0, #0, #1 + orr x10, x2, x1, lsl #2 /* x10 has PIR */ + ubfm x9, x0, #0, #15 /* w9 has 16-bit original PIR */ + lsl x1, x10, #6 /* spin table is padded to 64 byte each core */ + ldr x0, =(SECONDARY_CPU_BOOT_PAGE) + ldr x3, =__spin_table + ldr x4, =secondary_boot_page + sub x3, x3, x4 + add x0, x0, x3 + add x11, x1, x0 + + str x9, [x11, #16] /* ENTRY_PIR */ + mov x4, #1 + str x4, [x11] /* ENTRY_ADDR */ + dsb sy + isb +#if defined(CONFIG_GICV3) + gic_wait_for_interrupt_m x0 +#endif + + bl secondary_switch_to_el2 +#ifdef CONFIG_ARMV8_SWITCH_TO_EL1 + secondary_switch_to_el1 +#endif + +slave_cpu: + wfe +#ifdef CONFIG_FSL_SMP_RELEASE_ALL + ldr x1, =CPU_RELEASE_ADDR + ldr x0, [x1] +#else + ldr x0, [x11] + tbnz x0, #0, slave_cpu +#endif + cbz x0, slave_cpu + br x0 /* branch to the given address */ +ENDPROC(secondary_boot_func) + +ENTRY(secondary_switch_to_el2) + switch_el x0, 1f, 0f, 0f +0: ret +1: armv8_switch_to_el2_m x0 +ENDPROC(secondary_switch_to_el2) + +ENTRY(secondary_switch_to_el1) + switch_el x0, 0f, 1f, 0f +0: ret +1: armv8_switch_to_el1_m x0, x1 +ENDPROC(secondary_switch_to_el1) + + /* Ensure that the literals used by the secondary boot page are + * assembled within it + */ + .ltorg + + .align 4 + .globl __secondary_boot_page_size + .type __secondary_boot_page_size, %object + /* Secondary Boot Page ends here */ +__secondary_boot_page_size: + .quad .-secondary_boot_page diff --git a/arch/arm/cpu/armv8/fsl-lsch3/mp.c b/arch/arm/cpu/armv8/fsl-lsch3/mp.c new file mode 100644 index 0000000..1cd36ab --- /dev/null +++ b/arch/arm/cpu/armv8/fsl-lsch3/mp.c @@ -0,0 +1,171 @@ +/* + * Copyright 2014 Freescale Semiconductor, Inc. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#include <common.h> +#include <asm/io.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/arch-fsl-lsch3/immap_lsch3.h> +#include "mp.h" + +DECLARE_GLOBAL_DATA_PTR; + +void *get_spin_tbl_addr(void) +{ + void *ptr = (void *)SECONDARY_CPU_BOOT_PAGE; + + /* + * Spin table is at the beginning of secondary boot page. It is + * copied to SECONDARY_CPU_BOOT_PAGE. 
+ */ + ptr += (u64)&__spin_table - (u64)&secondary_boot_page; + + return ptr; +} + +phys_addr_t determine_mp_bootpg(void) +{ + return (phys_addr_t)SECONDARY_CPU_BOOT_PAGE; +} + +int fsl_lsch3_wake_seconday_cores(void) +{ + struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR); + struct ccsr_reset __iomem *rst = (void *)(CONFIG_SYS_FSL_RST_ADDR); + void *boot_loc = (void *)SECONDARY_CPU_BOOT_PAGE; + size_t *boot_page_size = &(__secondary_boot_page_size); + u32 cores, cpu_up_mask = 1; + int i, timeout = 10; + u64 *table = get_spin_tbl_addr(); + + cores = cpu_mask(); + memcpy(boot_loc, &secondary_boot_page, *boot_page_size); + /* Clear spin table so that secondary processors + * observe the correct value after waking up from wfe. + */ + memset(table, 0, CONFIG_MAX_CPUS*ENTRY_SIZE); + flush_dcache_range((unsigned long)boot_loc, + (unsigned long)boot_loc + *boot_page_size); + + printf("Waking secondary cores to start from %lx\n", gd->relocaddr); + out_le32(&gur->bootlocptrh, (u32)(gd->relocaddr >> 32)); + out_le32(&gur->bootlocptrl, (u32)gd->relocaddr); + out_le32(&gur->scratchrw[6], 1); + asm volatile("dsb st" : : : "memory"); + rst->brrl = cores; + asm volatile("dsb st" : : : "memory"); + + /* fixme: this is only needed for the simulator because secnodary cores + * start to run without waiting for boot release register, then enter + * "wfe" before the scratch register is set above. + */ + asm volatile("sev"); + + while (timeout--) { + flush_dcache_range((unsigned long)table, (unsigned long)table + + CONFIG_MAX_CPUS * 64); + for (i = 1; i < CONFIG_MAX_CPUS; i++) { + if (table[i * NUM_BOOT_ENTRY + BOOT_ENTRY_ADDR]) + cpu_up_mask |= 1 << i; + } + if (hweight32(cpu_up_mask) == hweight32(cores)) + break; + udelay(10); + } + if (timeout <= 0) { + printf("Not all cores (0x%x) are up (0x%x)\n", + cores, cpu_up_mask); + return 1; + } + printf("All (%d) cores are up.\n", hweight32(cores)); + + return 0; +} + +int is_core_valid(unsigned int core) +{ + return !!((1 << core) & cpu_mask()); +} + +int cpu_reset(int nr) +{ + puts("Feature is not implemented.\n"); + + return 0; +} + +int cpu_disable(int nr) +{ + puts("Feature is not implemented.\n"); + + return 0; +} + +int core_to_pos(int nr) +{ + u32 cores = cpu_mask(); + int i, count = 0; + + if (nr == 0) { + return 0; + } else if (nr >= hweight32(cores)) { + puts("Not a valid core number.\n"); + return -1; + } + + for (i = 1; i < 32; i++) { + if (is_core_valid(i)) { + count++; + if (count == nr) + break; + } + } + + return count; +} + +int cpu_status(int nr) +{ + u64 *table; + int pos; + + if (nr == 0) { + table = (u64 *)get_spin_tbl_addr(); + printf("table base @ 0x%p\n", table); + } else { + pos = core_to_pos(nr); + if (pos < 0) + return -1; + table = (u64 *)get_spin_tbl_addr() + pos * NUM_BOOT_ENTRY; + printf("table @ 0x%p\n", table); + printf(" addr - 0x%016llx\n", table[BOOT_ENTRY_ADDR]); + printf(" r3 - 0x%016llx\n", table[BOOT_ENTRY_R3]); + printf(" pir - 0x%016llx\n", table[BOOT_ENTRY_PIR]); + } + + return 0; +} + +int cpu_release(int nr, int argc, char * const argv[]) +{ + u64 boot_addr; + u64 *table = (u64 *)get_spin_tbl_addr(); +#ifndef CONFIG_FSL_SMP_RELEASE_ALL + int pos; + + pos = core_to_pos(nr); + if (pos <= 0) + return -1; + + table += pos * NUM_BOOT_ENTRY; +#endif + boot_addr = simple_strtoull(argv[0], NULL, 16); + table[BOOT_ENTRY_ADDR] = boot_addr; + asm volatile("dsb st"); + smp_kick_all_cpus(); /* only those with entry addr set will run */ + + return 0; +} diff --git a/arch/arm/cpu/armv8/fsl-lsch3/mp.h 
b/arch/arm/cpu/armv8/fsl-lsch3/mp.h new file mode 100644 index 0000000..2153b41 --- /dev/null +++ b/arch/arm/cpu/armv8/fsl-lsch3/mp.h @@ -0,0 +1,36 @@ +/* + * Copyright 2014, Freescale Semiconductor + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#ifndef _FSL_CH3_MP_H +#define _FSL_CH3_MP_H + +/* +* spin table is defined as +* struct { +* uint64_t entry_addr; +* uint64_t r3; +* uint64_t pir; +* }; +* we pad this struct to 64 bytes so each entry is in its own cacheline +*/ +#define ENTRY_SIZE 64 +#define BOOT_ENTRY_ADDR 0 +#define BOOT_ENTRY_R3 1 +#define BOOT_ENTRY_PIR 2 +#define NUM_BOOT_ENTRY 8 /* pad to 64 bytes */ +#define SIZE_BOOT_ENTRY (NUM_BOOT_ENTRY * sizeof(u64)) + +#define id_to_core(x) ((x & 3) | (x >> 8)) +#ifndef __ASSEMBLY__ +extern u64 __spin_table[]; +extern u64 *secondary_boot_page; +extern size_t __secondary_boot_page_size; +int fsl_lsch3_wake_seconday_cores(void); +void *get_spin_tbl_addr(void); +phys_addr_t determine_mp_bootpg(void); +void secondary_boot_func(void); +#endif +#endif /* _FSL_CH3_MP_H */ diff --git a/arch/arm/cpu/armv8/transition.S b/arch/arm/cpu/armv8/transition.S index e0a5946..ade1cde 100644 --- a/arch/arm/cpu/armv8/transition.S +++ b/arch/arm/cpu/armv8/transition.S @@ -14,70 +14,11 @@ ENTRY(armv8_switch_to_el2) switch_el x0, 1f, 0f, 0f 0: ret -1: - mov x0, #0x5b1 /* Non-secure EL0/EL1 | HVC | 64bit EL2 */ - msr scr_el3, x0 - msr cptr_el3, xzr /* Disable coprocessor traps to EL3 */ - mov x0, #0x33ff - msr cptr_el2, x0 /* Disable coprocessor traps to EL2 */ - - /* Initialize SCTLR_EL2 */ - msr sctlr_el2, xzr - - /* Return to the EL2_SP2 mode from EL3 */ - mov x0, sp - msr sp_el2, x0 /* Migrate SP */ - mrs x0, vbar_el3 - msr vbar_el2, x0 /* Migrate VBAR */ - mov x0, #0x3c9 - msr spsr_el3, x0 /* EL2_SP2 | D | A | I | F */ - msr elr_el3, lr - eret +1: armv8_switch_to_el2_m x0 ENDPROC(armv8_switch_to_el2)
ENTRY(armv8_switch_to_el1) switch_el x0, 0f, 1f, 0f 0: ret -1: - /* Initialize Generic Timers */ - mrs x0, cnthctl_el2 - orr x0, x0, #0x3 /* Enable EL1 access to timers */ - msr cnthctl_el2, x0 - msr cntvoff_el2, x0 - mrs x0, cntkctl_el1 - orr x0, x0, #0x3 /* Enable EL0 access to timers */ - msr cntkctl_el1, x0 - - /* Initilize MPID/MPIDR registers */ - mrs x0, midr_el1 - mrs x1, mpidr_el1 - msr vpidr_el2, x0 - msr vmpidr_el2, x1 - - /* Disable coprocessor traps */ - mov x0, #0x33ff - msr cptr_el2, x0 /* Disable coprocessor traps to EL2 */ - msr hstr_el2, xzr /* Disable coprocessor traps to EL2 */ - mov x0, #3 << 20 - msr cpacr_el1, x0 /* Enable FP/SIMD at EL1 */ - - /* Initialize HCR_EL2 */ - mov x0, #(1 << 31) /* 64bit EL1 */ - orr x0, x0, #(1 << 29) /* Disable HVC */ - msr hcr_el2, x0 - - /* SCTLR_EL1 initialization */ - mov x0, #0x0800 - movk x0, #0x30d0, lsl #16 - msr sctlr_el1, x0 - - /* Return to the EL1_SP1 mode from EL2 */ - mov x0, sp - msr sp_el1, x0 /* Migrate SP */ - mrs x0, vbar_el2 - msr vbar_el1, x0 /* Migrate VBAR */ - mov x0, #0x3c5 - msr spsr_el2, x0 /* EL1_SP1 | D | A | I | F */ - msr elr_el2, lr - eret +1: armv8_switch_to_el1_m x0, x1 ENDPROC(armv8_switch_to_el1) diff --git a/arch/arm/include/asm/arch-fsl-lsch3/config.h b/arch/arm/include/asm/arch-fsl-lsch3/config.h index c81ab41..932a21d 100644 --- a/arch/arm/include/asm/arch-fsl-lsch3/config.h +++ b/arch/arm/include/asm/arch-fsl-lsch3/config.h @@ -8,7 +8,7 @@ #define _ASM_ARMV8_FSL_LSCH3_CONFIG_
#include <fsl_ddrc_version.h> - +#define CONFIG_MP #define CONFIG_SYS_FSL_OCRAM_BASE 0x18000000 /* initial RAM */ /* Link Definitions */ #define CONFIG_SYS_INIT_SP_ADDR (CONFIG_SYS_FSL_OCRAM_BASE + 0xfff0) @@ -18,6 +18,7 @@ #define CONFIG_SYS_FSL_DDR2_ADDR (CONFIG_SYS_IMMR + 0x00090000) #define CONFIG_SYS_FSL_GUTS_ADDR (CONFIG_SYS_IMMR + 0x00E00000) #define CONFIG_SYS_FSL_PMU_ADDR (CONFIG_SYS_IMMR + 0x00E30000) +#define CONFIG_SYS_FSL_RST_ADDR (CONFIG_SYS_IMMR + 0x00E60000) #define CONFIG_SYS_FSL_CH3_CLK_GRPA_ADDR (CONFIG_SYS_IMMR + 0x00300000) #define CONFIG_SYS_FSL_CH3_CLK_GRPB_ADDR (CONFIG_SYS_IMMR + 0x00310000) #define CONFIG_SYS_FSL_CH3_CLK_CTRL_ADDR (CONFIG_SYS_IMMR + 0x00370000) diff --git a/arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h b/arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h index 18e66bd..ee1d651 100644 --- a/arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h +++ b/arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h @@ -113,4 +113,39 @@ struct ccsr_clk_ctrl { u8 res_04[0x20-0x04]; } clkcncsr[8]; }; + +struct ccsr_reset { + u32 rstcr; /* 0x000 */ + u32 rstcrsp; /* 0x004 */ + u8 res_008[0x10-0x08]; /* 0x008 */ + u32 rstrqmr1; /* 0x010 */ + u32 rstrqmr2; /* 0x014 */ + u32 rstrqsr1; /* 0x018 */ + u32 rstrqsr2; /* 0x01c */ + u32 rstrqwdtmrl; /* 0x020 */ + u32 rstrqwdtmru; /* 0x024 */ + u8 res_028[0x30-0x28]; /* 0x028 */ + u32 rstrqwdtsrl; /* 0x030 */ + u32 rstrqwdtsru; /* 0x034 */ + u8 res_038[0x60-0x38]; /* 0x038 */ + u32 brrl; /* 0x060 */ + u32 brru; /* 0x064 */ + u8 res_068[0x80-0x68]; /* 0x068 */ + u32 pirset; /* 0x080 */ + u32 pirclr; /* 0x084 */ + u8 res_088[0x90-0x88]; /* 0x088 */ + u32 brcorenbr; /* 0x090 */ + u8 res_094[0x100-0x94]; /* 0x094 */ + u32 rcw_reqr; /* 0x100 */ + u32 rcw_completion; /* 0x104 */ + u8 res_108[0x110-0x108]; /* 0x108 */ + u32 pbi_reqr; /* 0x110 */ + u32 pbi_completion; /* 0x114 */ + u8 res_118[0xa00-0x118]; /* 0x118 */ + u32 qmbm_warmrst; /* 0xa00 */ + u32 soc_warmrst; /* 0xa04 */ + u8 res_a08[0xbf8-0xa08]; /* 0xa08 */ + u32 ip_rev1; /* 0xbf8 */ + u32 ip_rev2; /* 0xbfc */ +}; #endif /* __ARCH_FSL_LSCH3_IMMAP_H */ diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index f77e4b8..16ba76e 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -105,6 +105,87 @@ lr .req x30 cbz \xreg1, \master_label .endm
+.macro armv8_switch_to_el2_m, xreg1 + mov \xreg1, #0x5b1 /* Non-secure EL0/EL1 | HVC | 64bit EL2 */ + msr scr_el3, \xreg1 + msr cptr_el3, xzr /* Disable coprocessor traps to EL3 */ + mov \xreg1, #0x33ff + msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */ + + /* Initialize SCTLR_EL2 */ + msr sctlr_el2, xzr + + /* Return to the EL2_SP2 mode from EL3 */ + mov \xreg1, sp + msr sp_el2, \xreg1 /* Migrate SP */ + mrs \xreg1, vbar_el3 + msr vbar_el2, \xreg1 /* Migrate VBAR */ + mov x0, #0x3c9 + msr spsr_el3, \xreg1 /* EL2_SP2 | D | A | I | F */ + msr elr_el3, lr + eret +.endm + +.macro armv8_switch_to_el1_m, xreg1, xreg2 + /* Initialize Generic Timers */ + mrs \xreg1, cnthctl_el2 + orr \xreg1, \xreg1, #0x3 /* Enable EL1 access to timers */ + msr cnthctl_el2, \xreg1 + msr cntvoff_el2, \xreg1 + mrs \xreg1, cntkctl_el1 + orr \xreg1, \xreg1, #0x3 /* Enable EL0 access to timers */ + msr cntkctl_el1, \xreg1 + + /* Initilize MPID/MPIDR registers */ + mrs \xreg1, midr_el1 + mrs \xreg2, mpidr_el1 + msr vpidr_el2, \xreg1 + msr vmpidr_el2, \xreg2 + + /* Disable coprocessor traps */ + mov \xreg1, #0x33ff + msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */ + msr hstr_el2, xzr /* Disable coprocessor traps to EL2 */ + mov \xreg1, #3 << 20 + msr cpacr_el1, \xreg1 /* Enable FP/SIMD at EL1 */ + + /* Initialize HCR_EL2 */ + mov \xreg1, #(1 << 31) /* 64bit EL1 */ + orr \xreg1, \xreg1, #(1 << 29) /* Disable HVC */ + msr hcr_el2, \xreg1 + + /* SCTLR_EL1 initialization */ + mov \xreg1, #0x0800 + movk \xreg1, #0x30d0, lsl #16 + msr sctlr_el1, \xreg1 + + /* Return to the EL1_SP1 mode from EL2 */ + mov \xreg1, sp + msr sp_el1, \xreg1 /* Migrate SP */ + mrs \xreg1, vbar_el2 + msr vbar_el1, \xreg1 /* Migrate VBAR */ + mov \xreg1, #0x3c5 + msr spsr_el2, \xreg1 /* EL1_SP1 | D | A | I | F */ + msr elr_el2, lr + eret +.endm + +#if defined(CONFIG_GICV3) +.macro gic_wait_for_interrupt_m xreg1 +0 : wfi + mrs \xreg1, ICC_IAR1_EL1 + msr ICC_EOIR1_EL1, \xreg1 + cbnz \xreg1, 0b +.endm +#elif defined(CONFIG_GICV2) +.macro gic_wait_for_interrupt_m xreg1, wreg2 +0 : wfi + ldr \wreg2, [\xreg1, GICC_AIAR] + str \wreg2, [\xreg1, GICC_AEOIR] + cbnz \wreg2, 0b +.endm +#endif + #endif /* CONFIG_ARM64 */
#endif /* __ASSEMBLY__ */ diff --git a/arch/arm/lib/gic_64.S b/arch/arm/lib/gic_64.S index d56396e..a3e18f7 100644 --- a/arch/arm/lib/gic_64.S +++ b/arch/arm/lib/gic_64.S @@ -10,8 +10,8 @@ #include <asm-offsets.h> #include <config.h> #include <linux/linkage.h> -#include <asm/macro.h> #include <asm/gic.h> +#include <asm/macro.h>
/************************************************************************* @@ -181,14 +181,10 @@ ENDPROC(gic_kick_secondary_cpus) * *************************************************************************/ ENTRY(gic_wait_for_interrupt) -0: wfi #if defined(CONFIG_GICV3) - mrs x9, ICC_IAR1_EL1 - msr ICC_EOIR1_EL1, x9 + gic_wait_for_interrupt_m x9 #elif defined(CONFIG_GICV2) - ldr w9, [x0, GICC_AIAR] - str w9, [x0, GICC_AEOIR] + gic_wait_for_interrupt_m x0, w9 #endif - cbnz w9, 0b ret ENDPROC(gic_wait_for_interrupt) diff --git a/common/board_f.c b/common/board_f.c index 4ea4cb2..3b6df18 100644 --- a/common/board_f.c +++ b/common/board_f.c @@ -41,7 +41,7 @@ #include <watchdog.h> #include <asm/errno.h> #include <asm/io.h> -#ifdef CONFIG_MP +#if defined(CONFIG_MP) && (defined(CONFIG_MPC86xx) || defined(CONFIG_E500)) #include <asm/mp.h> #endif #include <asm/sections.h>

Hi York,
I spotted a couple of generic issues below. Most of these are issues with the existing code that you happen to be moving around, rather than with the new code this patch introduces.
There are a couple of gotchas around secondary startup that are painful with the bootwrapper for arm64 at present, and I think that we can avoid them by construction for U-Boot. More on that below.
On Fri, Jun 27, 2014 at 05:54:08PM +0100, York Sun wrote:
Secondary cores need to be released from holdoff via the boot release registers. With the GPP bootrom, they can boot from main memory directly. An individual spin table is used for each core. If a single release address is needed, defining the macro CONFIG_FSL_SMP_RELEASE_ALL makes all cores use CPU_RELEASE_ADDR. The spin table and boot page are reserved in the device tree so the OS won't overwrite them.
Signed-off-by: York Sun <yorksun@freescale.com>
Signed-off-by: Arnab Basu <arnab.basu@freescale.com>
This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
arch/arm/cpu/armv8/fsl-lsch3/Makefile | 2 + arch/arm/cpu/armv8/fsl-lsch3/cpu.c | 13 ++ arch/arm/cpu/armv8/fsl-lsch3/cpu.h | 1 + arch/arm/cpu/armv8/fsl-lsch3/fdt.c | 56 +++++++ arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S | 119 +++++++++++--- arch/arm/cpu/armv8/fsl-lsch3/mp.c | 171 +++++++++++++++++++++ arch/arm/cpu/armv8/fsl-lsch3/mp.h | 36 +++++ arch/arm/cpu/armv8/transition.S | 63 +------- arch/arm/include/asm/arch-fsl-lsch3/config.h | 3 +- arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h | 35 +++++ arch/arm/include/asm/macro.h | 81 ++++++++++ arch/arm/lib/gic_64.S | 10 +- common/board_f.c | 2 +- 13 files changed, 502 insertions(+), 90 deletions(-) create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h
[...]
diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
new file mode 100644
index 0000000..cd34e16
--- /dev/null
+++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2014 Freescale Semiconductor, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+#include <common.h> +#include <libfdt.h> +#include <fdt_support.h> +#include "mp.h"
+#ifdef CONFIG_MP +void ft_fixup_cpu(void *blob) +{
int off;
__maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
u64 *reg;
u64 val;
off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
while (off != -FDT_ERR_NOTFOUND) {
reg = (u64 *)fdt_getprop(blob, off, "reg", 0);
if (reg) {
val = spin_tbl_addr;
+#ifndef CONFIG_FSL_SMP_RELEASE_ALL
val += id_to_core(fdt64_to_cpu(*reg)) * SIZE_BOOT_ENTRY;
In Linux we read /cpus/#address-cells to determine the size of a CPU's reg property (and have dts where this is 1 cell). Will the above work for that?
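(For illustration, a hedged sketch of what honouring #address-cells could look like here, reusing blob, reg, val, id_to_core and SIZE_BOOT_ENTRY from the patch context; fdt_address_cells() is assumed to be available in the libfdt in use, and none of this is code from the series:)

	/* Sketch: size the cpu node's "reg" read by /cpus/#address-cells */
	int addr_cells = fdt_address_cells(blob, fdt_path_offset(blob, "/cpus"));
	u64 core_id;

	if (addr_cells == 1)
		core_id = fdt32_to_cpu(*(u32 *)reg);	/* 1-cell reg */
	else
		core_id = fdt64_to_cpu(*(u64 *)reg);	/* 2-cell reg */
	val += id_to_core(core_id) * SIZE_BOOT_ENTRY;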
+#endif
val = cpu_to_fdt64(val);
fdt_setprop_string(blob, off, "enable-method",
"spin-table");
fdt_setprop(blob, off, "cpu-release-addr",
&val, sizeof(val));
} else {
puts("cpu NULL\n");
}
off = fdt_node_offset_by_prop_value(blob, off, "device_type",
"cpu", 4);
}
/*
* Boot page and spin table can be reserved here if not done staticlly
* in device tree.
*
* fdt_add_mem_rsv(blob, bootpg,
* *((u64 *)&(__secondary_boot_page_size)));
* If defined CONFIG_FSL_SMP_RELEASE_ALL, the release address should
* also be reserved.
*/
I think that this reservation should _always_ be added by U-Boot unless specifically overridden.
A problem I had with the arm64 bootwrapper when adding PSCI support and now (as I am moving stuff about) was that the DTS in the kernel tree had a memreserve out-of-sync with what the wrapper actually needed. While I can add a new reservation, I can't remove any in case they are for something else, so I end up protecting too much, wasting memory.
Given that the reservation is to protect data which U-Boot is in control of choosing the address for, I think the only sane thing to do is for U-Boot to always add the reservation.
That way U-Boot can change and existing DTBs will just work. We won't end up protecting too much or too little.
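(A minimal sketch of the unconditional reservation being argued for, using names from this patch and standard libfdt calls; error handling is illustrative:)

	/* Sketch: always reserve boot page + spin table so any DTB, with or
	 * without a static /memreserve/, stays protected from the OS. */
	int err = fdt_add_mem_rsv(blob, (u64)SECONDARY_CPU_BOOT_PAGE,
				  *(u64 *)&__secondary_boot_page_size);
	if (err < 0)
		printf("Failed to reserve spin-table memory: %s\n",
		       fdt_strerror(err));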
[...]
@@ -119,3 +107,94 @@ ENTRY(lowlevel_init) mov lr, x29 /* Restore LR */ ret ENDPROC(lowlevel_init)
/* Keep literals not used by the secondary boot page outside it */
.ltorg
.align 4
That looks like a small alignment for a page.
Should this be larger? Or is the "page" a misnomer here?
.global secondary_boot_page
+secondary_boot_page:
.global __spin_table
+__spin_table:
.space CONFIG_MAX_CPUS*ENTRY_SIZE
.align 4
/* Secondary Boot Page starts here */
+ENTRY(secondary_boot_func)
/*
* PIR calculation from MPIDR_EL1
Sorry if I'm asking a stupid question, but what is "PIR"?
* MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
* MPIDR[7:2] = AFF0_RES
* MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
* MPIDR[23:16] = AFF2_CLUSTERID
* MPIDR[24] = MT
* MPIDR[29:25] =RES
Could we say RES0 here? That would match the documentation in the ARM ARM and make things a bit clearer.
Also, missing space after '='?
* MPIDR[30] = U
* MPIDR[31] = ME
My ARMv8 ARM ARM shows this as RES1, but appears to be self-contradictory. I'll query this internally. I don't think that matters here anyway.
* MPIDR[39:32] = AFF3
* We only use AFF0_CPUID and AFF1_CLUSTERID for now
* until AFF2_CLUSTERID and AFF3 have non-zero values.
*/
mrs x0, mpidr_el1
ubfm x1, x0, #8, #15
ubfm x2, x0, #0, #1
orr x10, x2, x1, lsl #2 /* x10 has PIR */
ubfm x9, x0, #0, #15 /* w9 has 16-bit original PIR */
lsl x1, x10, #6 /* spin table is padded to 64 byte each core */
ldr x0, =(SECONDARY_CPU_BOOT_PAGE)
ldr x3, =__spin_table
ldr x4, =secondary_boot_page
sub x3, x3, x4
add x0, x0, x3
add x11, x1, x0
str x9, [x11, #16] /* ENTRY_PIR */
mov x4, #1
str x4, [x11] /* ENTRY_ADDR */
dsb sy
isb
What is the isb intended to synchronize?
Could we get comments on barriers? Even when coming back to code oneself wrote it's easy to miss a subtlety as to why one is needed.
+#if defined(CONFIG_GICV3)
gic_wait_for_interrupt_m x0
+#endif
bl secondary_switch_to_el2
+#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
secondary_switch_to_el1
+#endif
+slave_cpu:
wfe
+#ifdef CONFIG_FSL_SMP_RELEASE_ALL
ldr x1, =CPU_RELEASE_ADDR
ldr x0, [x1]
+#else
ldr x0, [x11]
tbnz x0, #0, slave_cpu
+#endif
cbz x0, slave_cpu
br x0 /* branch to the given address */
Just to check, I take it CPUs won't ever be in a big-endian mode at this point?
+ENDPROC(secondary_boot_func)
+ENTRY(secondary_switch_to_el2)
switch_el x0, 1f, 0f, 0f
+0:	ret
+1:	armv8_switch_to_el2_m x0
+ENDPROC(secondary_switch_to_el2)
+ENTRY(secondary_switch_to_el1)
switch_el x0, 0f, 1f, 0f
+0:	ret
+1:	armv8_switch_to_el1_m x0, x1
+ENDPROC(secondary_switch_to_el1)
/* Ensure that the literals used by the secondary boot page are
* assembled within it
*/
.ltorg
.align 4
Similarly to above, this looks like a small alignment for a page.
.globl __secondary_boot_page_size
.type __secondary_boot_page_size, %object
/* Secondary Boot Page ends here */
+__secondary_boot_page_size:
.quad .-secondary_boot_page
[...]
+int fsl_lsch3_wake_seconday_cores(void) +{
struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
struct ccsr_reset __iomem *rst = (void *)(CONFIG_SYS_FSL_RST_ADDR);
void *boot_loc = (void *)SECONDARY_CPU_BOOT_PAGE;
size_t *boot_page_size = &(__secondary_boot_page_size);
u32 cores, cpu_up_mask = 1;
int i, timeout = 10;
u64 *table = get_spin_tbl_addr();
cores = cpu_mask();
memcpy(boot_loc, &secondary_boot_page, *boot_page_size);
/* Clear spin table so that secondary processors
* observe the correct value after waking up from wfe.
*/
memset(table, 0, CONFIG_MAX_CPUS*ENTRY_SIZE);
flush_dcache_range((unsigned long)boot_loc,
(unsigned long)boot_loc + *boot_page_size);
printf("Waking secondary cores to start from %lx\n", gd->relocaddr);
out_le32(&gur->bootlocptrh, (u32)(gd->relocaddr >> 32));
out_le32(&gur->bootlocptrl, (u32)gd->relocaddr);
out_le32(&gur->scratchrw[6], 1);
asm volatile("dsb st" : : : "memory");
rst->brrl = cores;
asm volatile("dsb st" : : : "memory");
/* fixme: this is only needed for the simulator because secnodary cores
* start to run without waiting for boot release register, then enter
* "wfe" before the scratch register is set above.
*/
asm volatile("sev");
That feels a little dodgy; a number of things could generate an event before we got here. Is there no way to block them until we've set that up?
while (timeout--) {
flush_dcache_range((unsigned long)table, (unsigned long)table +
CONFIG_MAX_CPUS * 64);
for (i = 1; i < CONFIG_MAX_CPUS; i++) {
if (table[i * NUM_BOOT_ENTRY + BOOT_ENTRY_ADDR])
cpu_up_mask |= 1 << i;
}
if (hweight32(cpu_up_mask) == hweight32(cores))
break;
udelay(10);
}
Surely we need this before we expect the CPUs to read the values in the table?
Or have I misunderstood?
if (timeout <= 0) {
printf("Not all cores (0x%x) are up (0x%x)\n",
cores, cpu_up_mask);
return 1;
}
printf("All (%d) cores are up.\n", hweight32(cores));
return 0;
+}
[...]
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index f77e4b8..16ba76e 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -105,6 +105,87 @@ lr .req x30 cbz \xreg1, \master_label .endm
+.macro armv8_switch_to_el2_m, xreg1
mov \xreg1, #0x5b1 /* Non-secure EL0/EL1 | HVC | 64bit EL2 */
msr scr_el3, \xreg1
msr cptr_el3, xzr /* Disable coprocessor traps to EL3 */
mov \xreg1, #0x33ff
msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */
/* Initialize SCTLR_EL2 */
msr sctlr_el2, xzr
What about the RES1 bits (e.g. bits 29 & 28)?
We don't seem to initialise them before the eret.
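(For reference, a hedged sketch of what initialising those bits could look like, written as C inline assembly; the mask 0x30c50830 -- bits 29, 28, 23, 22, 18, 16, 11, 5 and 4 -- is the ARMv8.0 SCTLR_EL2 RES1 set from the ARM ARM, an assumption rather than anything this patch defines:)

	/* Sketch: program SCTLR_EL2's RES1 bits instead of zeroing it */
	asm volatile("msr sctlr_el2, %0" : : "r" (0x30c50830UL));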
/* Return to the EL2_SP2 mode from EL3 */
mov \xreg1, sp
msr sp_el2, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el3
msr vbar_el2, \xreg1 /* Migrate VBAR */
mov x0, #0x3c9
msr spsr_el3, \xreg1 /* EL2_SP2 | D | A | I | F */
msr elr_el3, lr
eret
+.endm
+.macro armv8_switch_to_el1_m, xreg1, xreg2
/* Initialize Generic Timers */
mrs \xreg1, cnthctl_el2
orr \xreg1, \xreg1, #0x3 /* Enable EL1 access to timers */
msr cnthctl_el2, \xreg1
Is there any reason this can't be set to a precise known value? This currently leaves EVNTDIR and EVNTEN in UNKNOWN states (which could differ across CPUs).
An EL1 OS can enable the event stream if it wants in CNTKCTL_EL1, so is there any reason to enable it at EL2?
msr cntvoff_el2, \xreg1
Please initialize cntvoff_el2 using xzr. Due to the aforementioned UNKNOWN bits, this could leave CPUs with differing view of time, and there's no point differing in the first place.
An EL1 OS will not be able to fix this.
I fixed this elsewhere in b924d586d70b (arm64: zero cntvoff_el2).
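(A hedged sketch of the known-value initialisation being suggested, as C inline assembly; 0x3 is EL1PCEN | EL1PCTEN, which grants EL1 access to the physical counter and timer while leaving the event stream disabled:)

	/* Sketch: write precise known values instead of read-modify-write */
	asm volatile("msr cnthctl_el2, %0" : : "r" (0x3UL));
	/* zero the virtual offset so all CPUs share one view of time */
	asm volatile("msr cntvoff_el2, xzr");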
mrs \xreg1, cntkctl_el1
orr \xreg1, \xreg1, #0x3 /* Enable EL0 access to timers */
msr cntkctl_el1, \xreg1
Likewise this leaves many bits UNKNOWN and potentially differing across CPUs, though the OS at EL1 should be able to fix this up (and Linux will).
/* Initilize MPID/MPIDR registers */
mrs \xreg1, midr_el1
mrs \xreg2, mpidr_el1
msr vpidr_el2, \xreg1
msr vmpidr_el2, \xreg2
/* Disable coprocessor traps */
mov \xreg1, #0x33ff
msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */
msr hstr_el2, xzr /* Disable coprocessor traps to EL2 */
mov \xreg1, #3 << 20
msr cpacr_el1, \xreg1 /* Enable FP/SIMD at EL1 */
/* Initialize HCR_EL2 */
mov \xreg1, #(1 << 31) /* 64bit EL1 */
orr \xreg1, \xreg1, #(1 << 29) /* Disable HVC */
msr hcr_el2, \xreg1
/* SCTLR_EL1 initialization */
mov \xreg1, #0x0800
movk \xreg1, #0x30d0, lsl #16
msr sctlr_el1, \xreg1
That doesn't seem to set up all the RES1 bits (e.g. bit 29).
/* Return to the EL1_SP1 mode from EL2 */
mov \xreg1, sp
msr sp_el1, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el2
msr vbar_el1, \xreg1 /* Migrate VBAR */
mov \xreg1, #0x3c5
msr spsr_el2, \xreg1 /* EL1_SP1 | D | A | I | F */
msr elr_el2, lr
eret
+.endm
+#if defined(CONFIG_GICV3)
+.macro gic_wait_for_interrupt_m xreg1
+0 : wfi
mrs \xreg1, ICC_IAR1_EL1
msr ICC_EOIR1_EL1, \xreg1
cbnz \xreg1, 0b
+.endm
+#elif defined(CONFIG_GICV2)
+.macro gic_wait_for_interrupt_m xreg1, wreg2
+0 : wfi
ldr \wreg2, [\xreg1, GICC_AIAR]
str \wreg2, [\xreg1, GICC_AEOIR]
cbnz \wreg2, 0b
+.endm
Assuming I've understood correctly, here we block until we receive SGI 0 from the CPU with GIC ID 0? Do we have a guarantee that the boot CPU will have GIC ID 0?
Thanks, Mark.

On 07/04/2014 05:31 AM, Mark Rutland wrote:
Hi York,
I spotted a couple of generic issues below. Most of these are issues with the existing code that you happen to be moving around, rather than with the new code this patch introduces.
There are a couple of gotchas around secondary startup that are painful with the bootwrapper for arm64 at present, and I think that we can avoid them by construction for U-Boot. More on that below.
On Fri, Jun 27, 2014 at 05:54:08PM +0100, York Sun wrote:
Secondary cores need to be released from holdoff via the boot release registers. With the GPP bootrom, they can boot from main memory directly. An individual spin table is used for each core. If a single release address is needed, defining the macro CONFIG_FSL_SMP_RELEASE_ALL makes all cores use CPU_RELEASE_ADDR. The spin table and boot page are reserved in the device tree so the OS won't overwrite them.
Signed-off-by: York Sun <yorksun@freescale.com>
Signed-off-by: Arnab Basu <arnab.basu@freescale.com>
This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
arch/arm/cpu/armv8/fsl-lsch3/Makefile | 2 + arch/arm/cpu/armv8/fsl-lsch3/cpu.c | 13 ++ arch/arm/cpu/armv8/fsl-lsch3/cpu.h | 1 + arch/arm/cpu/armv8/fsl-lsch3/fdt.c | 56 +++++++ arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S | 119 +++++++++++--- arch/arm/cpu/armv8/fsl-lsch3/mp.c | 171 +++++++++++++++++++++ arch/arm/cpu/armv8/fsl-lsch3/mp.h | 36 +++++ arch/arm/cpu/armv8/transition.S | 63 +------- arch/arm/include/asm/arch-fsl-lsch3/config.h | 3 +- arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h | 35 +++++ arch/arm/include/asm/macro.h | 81 ++++++++++ arch/arm/lib/gic_64.S | 10 +- common/board_f.c | 2 +- 13 files changed, 502 insertions(+), 90 deletions(-) create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h
[...]
diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
new file mode 100644
index 0000000..cd34e16
--- /dev/null
+++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2014 Freescale Semiconductor, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+#include <common.h> +#include <libfdt.h> +#include <fdt_support.h> +#include "mp.h"
+#ifdef CONFIG_MP +void ft_fixup_cpu(void *blob) +{
int off;
__maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
u64 *reg;
u64 val;
off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
while (off != -FDT_ERR_NOTFOUND) {
reg = (u64 *)fdt_getprop(blob, off, "reg", 0);
if (reg) {
val = spin_tbl_addr;
+#ifndef CONFIG_FSL_SMP_RELEASE_ALL
val += id_to_core(fdt64_to_cpu(*reg)) * SIZE_BOOT_ENTRY;
In Linux we read /cpus/#address-cells to determine the size of a CPU's reg property (and have dts where this is 1 cell). Will the above work for that?
I don't think so. Will have to add the same size check.
+#endif
val = cpu_to_fdt64(val);
fdt_setprop_string(blob, off, "enable-method",
"spin-table");
fdt_setprop(blob, off, "cpu-release-addr",
&val, sizeof(val));
} else {
puts("cpu NULL\n");
}
off = fdt_node_offset_by_prop_value(blob, off, "device_type",
"cpu", 4);
}
/*
* Boot page and spin table can be reserved here if not done staticlly
* in device tree.
*
* fdt_add_mem_rsv(blob, bootpg,
* *((u64 *)&(__secondary_boot_page_size)));
* If defined CONFIG_FSL_SMP_RELEASE_ALL, the release address should
* also be reserved.
*/
I think that this reservation should _always_ be added by U-Boot unless specifically overridden.
A problem I had with the arm64 bootwrapper when adding PSCI support and now (as I am moving stuff about) was that the DTS in the kernel tree had a memreserve out-of-sync with what the wrapper actually needed. While I can add a new reservation, I can't remove any in case they are for something else, so I end up protecting too much, wasting memory.
Given that the reservation is to protect data which U-Boot is in control of choosing the address for, I think the only sane thing to do is for U-Boot to always add the reservation.
That way U-Boot can change and existing DTBs will just work. We won't end up protecting too much or too little.
That's the same problem I am facing. I can add the memory reservation in u-boot, but it may overlap with one already in the device tree. I guess it should be OK if the range u-boot adds is the same or smaller. Will debug to see.
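(One way to sidestep the overlap worry is to scan the existing reservations first; a sketch using standard libfdt calls, where bootpg and bootpg_size are stand-ins for the values from this patch, not code from the series:)

	/* Sketch: only add our reservation if the DTB doesn't already
	 * cover the boot page + spin table range. */
	int i, total = fdt_num_mem_rsv(blob);
	uint64_t addr, size;
	int covered = 0;

	for (i = 0; i < total; i++) {
		fdt_get_mem_rsv(blob, i, &addr, &size);
		if (addr <= bootpg && addr + size >= bootpg + bootpg_size)
			covered = 1;	/* existing reservation is enough */
	}
	if (!covered)
		fdt_add_mem_rsv(blob, bootpg, bootpg_size);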
[...]
@@ -119,3 +107,94 @@ ENTRY(lowlevel_init) mov lr, x29 /* Restore LR */ ret ENDPROC(lowlevel_init)
/* Keep literals not used by the secondary boot page outside it */
.ltorg
.align 4
That looks like a small alignment for a page.
Should this be larger? Or is the "page" a misnomer here?
I think as long as it is aligned to the instruction size and keeps "ldr" happy, it is OK. The code will be copied to the beginning of DDR to run. Any concern here?
.global secondary_boot_page
+secondary_boot_page:
.global __spin_table
+__spin_table:
.space CONFIG_MAX_CPUS*ENTRY_SIZE
.align 4
/* Secondary Boot Page starts here */
+ENTRY(secondary_boot_func)
/*
* PIR calculation from MPIDR_EL1
Sorry if I'm asking a stupid question, but what is "PIR"?
It is a term we use on all Power SoCs: the Processor ID Register, which uniquely identifies each core. Since the spin table code follows the same idea, I just borrowed the term. It can be rewritten to fit ARM's context.
* MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
* MPIDR[7:2] = AFF0_RES
* MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
* MPIDR[23:16] = AFF2_CLUSTERID
* MPIDR[24] = MT
* MPIDR[29:25] =RES
Could we say RES0 here? That would match the documentation in the ARM ARM and make things a bit clearer.
Sure.
Also, missing space after '='?
Yes.
* MPIDR[30] = U
* MPIDR[31] = ME
My ARMv8 ARM ARM shows this as RES1, but appears to be self-contradictory. I'll query this internally. I don't think that matters here anyway.
* MPIDR[39:32] = AFF3
* We only use AFF0_CPUID and AFF1_CLUSTERID for now
* until AFF2_CLUSTERID and AFF3 have non-zero values.
*/
mrs x0, mpidr_el1
ubfm x1, x0, #8, #15
ubfm x2, x0, #0, #1
orr x10, x2, x1, lsl #2 /* x10 has PIR */
ubfm x9, x0, #0, #15 /* w9 has 16-bit original PIR */
lsl x1, x10, #6 /* spin table is padded to 64 byte each core */
ldr x0, =(SECONDARY_CPU_BOOT_PAGE)
ldr x3, =__spin_table
ldr x4, =secondary_boot_page
sub x3, x3, x4
add x0, x0, x3
add x11, x1, x0
str x9, [x11, #16] /* ENTRY_PIR */
mov x4, #1
str x4, [x11] /* ENTRY_ADDR */
dsb sy
isb
What is the isb intended to synchronize?
Could we get comments on barriers? Even when coming back to code oneself wrote it's easy to miss a subtlety as to why one is needed.
Probably we don't need the "isb" here. It is a carry-over from the Power SoC spin table code; I think it was originally used to synchronize out-of-order execution.
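(To make the intent reviewable, a commented C sketch of the publish step as described -- the names mirror the patch's spin-table layout, and the comments carry the barrier rationale Mark asked for:)

	/* Sketch: what each secondary publishes before parking in wfe */
	static void publish_cpu_up(u64 *table, u64 pir)
	{
		table[BOOT_ENTRY_PIR] = pir;	/* record who we are */
		table[BOOT_ENTRY_ADDR] = 1;	/* "up, not yet released" */
		/* dsb: push both stores out to memory (the d-cache is off
		 * on secondaries) so the primary's poll observes them; an
		 * isb adds nothing here, since no instruction fetch or
		 * system-register side effect follows that depends on it. */
		asm volatile("dsb sy" : : : "memory");
	}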
+#if defined(CONFIG_GICV3)
gic_wait_for_interrupt_m x0
+#endif
bl secondary_switch_to_el2
+#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
secondary_switch_to_el1
+#endif
+slave_cpu:
wfe
+#ifdef CONFIG_FSL_SMP_RELEASE_ALL
ldr x1, =CPU_RELEASE_ADDR
ldr x0, [x1]
+#else
ldr x0, [x11]
tbnz x0, #0, slave_cpu
+#endif
cbz x0, slave_cpu
br x0 /* branch to the given address */
Just to check, I take it CPUs won't ever be in a big-endian mode at this point?
Don't know yet. Any concern if big-endian here?
+ENDPROC(secondary_boot_func)
+ENTRY(secondary_switch_to_el2)
switch_el x0, 1f, 0f, 0f
+0:	ret
+1:	armv8_switch_to_el2_m x0
+ENDPROC(secondary_switch_to_el2)
+ENTRY(secondary_switch_to_el1)
switch_el x0, 0f, 1f, 0f
+0:	ret
+1:	armv8_switch_to_el1_m x0, x1
+ENDPROC(secondary_switch_to_el1)
/* Ensure that the literals used by the secondary boot page are
* assembled within it
*/
.ltorg
.align 4
Similarly to above, this looks like a small alignment for a page.
Please suggest a proper alignment.
.globl __secondary_boot_page_size
.type __secondary_boot_page_size, %object
/* Secondary Boot Page ends here */
+__secondary_boot_page_size:
.quad .-secondary_boot_page
[...]
+int fsl_lsch3_wake_seconday_cores(void) +{
struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
struct ccsr_reset __iomem *rst = (void *)(CONFIG_SYS_FSL_RST_ADDR);
void *boot_loc = (void *)SECONDARY_CPU_BOOT_PAGE;
size_t *boot_page_size = &(__secondary_boot_page_size);
u32 cores, cpu_up_mask = 1;
int i, timeout = 10;
u64 *table = get_spin_tbl_addr();
cores = cpu_mask();
memcpy(boot_loc, &secondary_boot_page, *boot_page_size);
/* Clear spin table so that secondary processors
* observe the correct value after waking up from wfe.
*/
memset(table, 0, CONFIG_MAX_CPUS*ENTRY_SIZE);
flush_dcache_range((unsigned long)boot_loc,
(unsigned long)boot_loc + *boot_page_size);
printf("Waking secondary cores to start from %lx\n", gd->relocaddr);
out_le32(&gur->bootlocptrh, (u32)(gd->relocaddr >> 32));
out_le32(&gur->bootlocptrl, (u32)gd->relocaddr);
out_le32(&gur->scratchrw[6], 1);
asm volatile("dsb st" : : : "memory");
rst->brrl = cores;
asm volatile("dsb st" : : : "memory");
/* fixme: this is only needed for the simulator because secnodary cores
* start to run without waiting for boot release register, then enter
* "wfe" before the scratch register is set above.
*/
asm volatile("sev");
That feels a little dodgy; a number of things could generate an event before we got here. Is there no way to block them until we've set that up?
This is backward. The secondary cores are expected to be released here and to run into the GPP bootrom. The bootlocptrh/l registers point to the location in u-boot where they continue to run. But on the current simulator, I found the secondary cores don't wait to be released. They won't get far, though: the GPP bootrom code puts them to sleep unless the scratchrw register above is set. For this situation, they need to be woken up here.
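(If the workaround has to stay, one option is to compile it only for simulator builds; CONFIG_FSL_EMU below is a hypothetical symbol for illustration, not one this series defines:)

	#ifdef CONFIG_FSL_EMU	/* hypothetical: simulator-only builds */
		/* On the simulator, secondaries ignore the boot release
		 * register and are already parked in wfe inside the GPP
		 * bootrom, so kick them explicitly. */
		asm volatile("sev");
	#endif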
while (timeout--) {
flush_dcache_range((unsigned long)table, (unsigned long)table +
CONFIG_MAX_CPUS * 64);
for (i = 1; i < CONFIG_MAX_CPUS; i++) {
if (table[i * NUM_BOOT_ENTRY + BOOT_ENTRY_ADDR])
cpu_up_mask |= 1 << i;
}
if (hweight32(cpu_up_mask) == hweight32(cores))
break;
udelay(10);
}
Surely we need this before we expect the CPUs to read the values in the table?
Or have I misunderstood?
I don't know how other ARM SoCs set up spin tables. I borrowed the code from Power. The spin tables (one per core) are set up by the secondary cores, not by the primary core. When the secondary cores come up, they write their spin table entries and wait there. Because they don't enable the d-cache, all tables live in main memory. The primary core has the d-cache enabled, so we have to invalidate the d-cache in order to load from main memory; flush_dcache_range() serves that purpose.
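(Strictly, the operation the primary needs before each poll is an invalidate; a sketch with U-Boot's invalidate_dcache_range(), assuming the range meets the usual cache-line alignment requirements:)

	/* Sketch: discard any stale cached copy of the spin table before
	 * reading what the secondaries wrote straight to DRAM. */
	invalidate_dcache_range((unsigned long)table,
				(unsigned long)table +
				CONFIG_MAX_CPUS * ENTRY_SIZE);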
if (timeout <= 0) {
printf("Not all cores (0x%x) are up (0x%x)\n",
cores, cpu_up_mask);
return 1;
}
printf("All (%d) cores are up.\n", hweight32(cores));
return 0;
+}
[...]
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index f77e4b8..16ba76e 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -105,6 +105,87 @@ lr .req x30 cbz \xreg1, \master_label .endm
+.macro armv8_switch_to_el2_m, xreg1
mov \xreg1, #0x5b1 /* Non-secure EL0/EL1 | HVC | 64bit EL2 */
msr scr_el3, \xreg1
msr cptr_el3, xzr /* Disable coprocessor traps to EL3 */
mov \xreg1, #0x33ff
msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */
/* Initialize SCTLR_EL2 */
msr sctlr_el2, xzr
What about the RES1 bits (e.g. bits 29 & 28)?
We don't seem to initialise them before the eret.
I can't answer this question or the ones below. Adding Arnab as the original author of these changes.
York
/* Return to the EL2_SP2 mode from EL3 */
mov \xreg1, sp
msr sp_el2, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el3
msr vbar_el2, \xreg1 /* Migrate VBAR */
mov x0, #0x3c9
msr spsr_el3, \xreg1 /* EL2_SP2 | D | A | I | F */
msr elr_el3, lr
eret
+.endm
+.macro armv8_switch_to_el1_m, xreg1, xreg2
/* Initialize Generic Timers */
mrs \xreg1, cnthctl_el2
orr \xreg1, \xreg1, #0x3 /* Enable EL1 access to timers */
msr cnthctl_el2, \xreg1
Is there any reason this can't be set to a precise known value? This currently leaves EVNTDIR and EVNTEN in UNKNOWN states (which could differ across CPUs).
An EL1 OS can enable the event stream if it wants in CNTKCTL_EL1, so is there any reason to enable it at EL2?
msr cntvoff_el2, \xreg1
Please initialize cntvoff_el2 using xzr. Due to the aforementioned UNKNOWN bits, this could leave CPUs with differing view of time, and there's no point differing in the first place.
An EL1 OS will not be able to fix this.
I fixed this elsewhere in b924d586d70b (arm64: zero cntvoff_el2).
mrs \xreg1, cntkctl_el1
orr \xreg1, \xreg1, #0x3 /* Enable EL0 access to timers */
msr cntkctl_el1, \xreg1
Likewise this leaves many bits UNKNOWN and potentially differing across CPUs, though the OS at EL1 should be able to fix this up (and Linux will).
/* Initilize MPID/MPIDR registers */
mrs \xreg1, midr_el1
mrs \xreg2, mpidr_el1
msr vpidr_el2, \xreg1
msr vmpidr_el2, \xreg2
/* Disable coprocessor traps */
mov \xreg1, #0x33ff
msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */
msr hstr_el2, xzr /* Disable coprocessor traps to EL2 */
mov \xreg1, #3 << 20
msr cpacr_el1, \xreg1 /* Enable FP/SIMD at EL1 */
/* Initialize HCR_EL2 */
mov \xreg1, #(1 << 31) /* 64bit EL1 */
orr \xreg1, \xreg1, #(1 << 29) /* Disable HVC */
msr hcr_el2, \xreg1
/* SCTLR_EL1 initialization */
mov \xreg1, #0x0800
movk \xreg1, #0x30d0, lsl #16
msr sctlr_el1, \xreg1
That doesn't seem to set up all the RES1 bits (e.g. bit 29).
/* Return to the EL1_SP1 mode from EL2 */
mov \xreg1, sp
msr sp_el1, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el2
msr vbar_el1, \xreg1 /* Migrate VBAR */
mov \xreg1, #0x3c5
msr spsr_el2, \xreg1 /* EL1_SP1 | D | A | I | F */
msr elr_el2, lr
eret
+.endm
+#if defined(CONFIG_GICV3)
+.macro gic_wait_for_interrupt_m xreg1
+0 : wfi
mrs \xreg1, ICC_IAR1_EL1
msr ICC_EOIR1_EL1, \xreg1
cbnz \xreg1, 0b
+.endm
+#elif defined(CONFIG_GICV2)
+.macro gic_wait_for_interrupt_m xreg1, wreg2
+0 : wfi
ldr \wreg2, [\xreg1, GICC_AIAR]
str \wreg2, [\xreg1, GICC_AEOIR]
cbnz \wreg2, 0b
+.endm
Assuming I've understood correctly, here we block until we receive SGI 0 from the CPU with GIC ID 0? Do we have a guarantee that the boot CPU will have GIC ID 0?

-----Original Message----- From: Sun York-R58495 Sent: Tuesday, July 08, 2014 11:26 PM To: Mark Rutland Cc: u-boot@lists.denx.de; trini@ti.com; Basu Arnab-B45036 Subject: Re: [U-Boot] [Patch v1 2/4] armv8/fsl-lsch3: Release secondary cores from boot hold off with Boot Page
On 07/04/2014 05:31 AM, Mark Rutland wrote:
Hi York,
I spotted a couple of generic issues below. Most of these are issues with the existing code that you happen to be moving around, rather than with the new code this patch introduces.
There are a couple of gotchas around secondary startup that are painful with the bootwrapper for arm64 at present, and I think that we can avoid them by construction for U-Boot. More on that below.
On Fri, Jun 27, 2014 at 05:54:08PM +0100, York Sun wrote:
Secondary cores need to be released from holdoff via the boot release registers. With the GPP bootrom, they can boot from main memory directly. An individual spin table is used for each core. If a single release address is needed, defining the macro CONFIG_FSL_SMP_RELEASE_ALL makes all cores use CPU_RELEASE_ADDR. The spin table and boot page are reserved in the device tree so the OS won't overwrite them.
Signed-off-by: York Sun <yorksun@freescale.com>
Signed-off-by: Arnab Basu <arnab.basu@freescale.com>
This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
arch/arm/cpu/armv8/fsl-lsch3/Makefile | 2 + arch/arm/cpu/armv8/fsl-lsch3/cpu.c | 13 ++ arch/arm/cpu/armv8/fsl-lsch3/cpu.h | 1 + arch/arm/cpu/armv8/fsl-lsch3/fdt.c | 56 +++++++ arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S | 119 +++++++++++--- arch/arm/cpu/armv8/fsl-lsch3/mp.c | 171 +++++++++++++++++++++ arch/arm/cpu/armv8/fsl-lsch3/mp.h | 36 +++++ arch/arm/cpu/armv8/transition.S | 63 +------- arch/arm/include/asm/arch-fsl-lsch3/config.h | 3 +- arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h | 35 +++++ arch/arm/include/asm/macro.h | 81 ++++++++++ arch/arm/lib/gic_64.S | 10 +- common/board_f.c | 2 +- 13 files changed, 502 insertions(+), 90 deletions(-) create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h
[...]
diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
new file mode 100644
index 0000000..cd34e16
--- /dev/null
+++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2014 Freescale Semiconductor, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+#include <common.h> +#include <libfdt.h> +#include <fdt_support.h> +#include "mp.h"
+#ifdef CONFIG_MP +void ft_fixup_cpu(void *blob) +{
int off;
__maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
u64 *reg;
u64 val;
off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
while (off != -FDT_ERR_NOTFOUND) {
reg = (u64 *)fdt_getprop(blob, off, "reg", 0);
if (reg) {
val = spin_tbl_addr;
#ifndef CONFIG_FSL_SMP_RELEASE_ALL
val += id_to_core(fdt64_to_cpu(*reg)) * SIZE_BOOT_ENTRY;
In Linux we read /cpus/#address-cells to determine the size of a CPU's reg property (and have dts where this is 1 cell). Will the above work for that?
I don't think so. Will have to add the same size check.
+#endif
val = cpu_to_fdt64(val);
fdt_setprop_string(blob, off, "enable-method",
"spin-table");
fdt_setprop(blob, off, "cpu-release-addr",
&val, sizeof(val));
} else {
puts("cpu NULL\n");
}
off = fdt_node_offset_by_prop_value(blob, off, "device_type", "cpu", 4);
}
/*
* Boot page and spin table can be reserved here if not done staticlly
* in device tree.
*
* fdt_add_mem_rsv(blob, bootpg,
* *((u64 *)&(__secondary_boot_page_size)));
* If defined CONFIG_FSL_SMP_RELEASE_ALL, the release address should
* also be reserved.
*/
I think that this reservation should _always_ be added by U-Boot unless specifically overridden.
A problem I had with the arm64 bootwrapper when adding PSCI support and now (as I am moving stuff about) was that the DTS in the kernel tree had a memreserve out-of-sync with what the wrapper actually needed. While I can add a new reservation, I can't remove any in case they are for something else, so I end up protecting too much, wasting
memory.
Given that the reservation is to protect data which U-Boot is in control of choosing the address for, I think the only sane thing to do is for U-Boot to always add the reservation.
That way U-Boot can change and existing DTBs will just work. We won't end up protecting too much or too little.
That's the same problem I am facing. I can add the memory reservation in u-boot, but it may overlap with one already in the device tree. I guess it should be OK if the range u-boot adds is the same or smaller. Will debug to see.
[...]
@@ -119,3 +107,94 @@ ENTRY(lowlevel_init) mov lr, x29 /* Restore LR */ ret ENDPROC(lowlevel_init)
/* Keep literals not used by the secondary boot page outside it */
.ltorg
.align 4
That looks like a small alignment for a page.
Should this be larger? Or is the "page" a misnomer here?
I think as long as it is aligned to the instruction size and keeps "ldr" happy, it is OK. The code will be copied to the beginning of DDR to run. Any concern here?
"page" is definitely a misnomer here, the comment (and maybe the label below) should probably be altered. The align directive is fine I guess.
.global secondary_boot_page
+secondary_boot_page:
.global __spin_table
+__spin_table:
.space CONFIG_MAX_CPUS*ENTRY_SIZE
.align 4
/* Secondary Boot Page starts here */
+ENTRY(secondary_boot_func)
/*
* PIR calculation from MPIDR_EL1
Sorry if I'm asking a stupid question, but what is "PIR"?
It is a term we use on all Power SoCs: the Processor ID Register, which uniquely identifies each core. Since the spin table code follows the same idea, I just borrowed the term. It can be rewritten to fit ARM's context.
* MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
* MPIDR[7:2] = AFF0_RES
* MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
* MPIDR[23:16] = AFF2_CLUSTERID
* MPIDR[24] = MT
* MPIDR[29:25] =RES
Could we say RES0 here? That would match the documentation in the ARM ARM and make things a bit clearer.
Sure.
Also, missing space after '='?
Yes.
* MPIDR[30] = U
* MPIDR[31] = ME
My ARMv8 ARM ARM shows this as RES1, but appears to be self-contradictory. I'll query this internally. I don't think that matters here anyway.
* MPIDR[39:32] = AFF3
* We only use AFF0_CPUID and AFF1_CLUSTERID for now
* until AFF2_CLUSTERID and AFF3 have non-zero values.
*/
mrs x0, mpidr_el1
ubfm x1, x0, #8, #15
ubfm x2, x0, #0, #1
orr x10, x2, x1, lsl #2 /* x10 has PIR */
ubfm x9, x0, #0, #15 /* w9 has 16-bit original PIR */
lsl x1, x10, #6 /* spin table is padded to 64 byte each core */
ldr x0, =(SECONDARY_CPU_BOOT_PAGE)
ldr x3, =__spin_table
ldr x4, =secondary_boot_page
sub x3, x3, x4
add x0, x0, x3
add x11, x1, x0
str x9, [x11, #16] /* ENTRY_PIR */
mov x4, #1
str x4, [x11] /* ENTRY_ADDR */
dsb sy
isb
What is the isb intended to synchronize?
Could we get comments on barriers? Even when coming back to code oneself wrote it's easy to miss a subtlety as to why one is needed.
Probably we don't need the "isb" here. It is a carry-over from the Power SoC spin table code; I think it was originally used to synchronize out-of-order execution.
+#if defined(CONFIG_GICV3)
gic_wait_for_interrupt_m x0
+#endif
bl secondary_switch_to_el2
+#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
secondary_switch_to_el1
+#endif
+slave_cpu:
wfe
+#ifdef CONFIG_FSL_SMP_RELEASE_ALL
ldr x1, =CPU_RELEASE_ADDR
ldr x0, [x1]
+#else
ldr x0, [x11]
tbnz x0, #0, slave_cpu
+#endif
cbz x0, slave_cpu
br x0 /* branch to the given address */
Just to check, I take it CPUs won't ever be in a big-endian mode at this point?
Don't know yet. Any concern if big-endian here?
I think we missed something here. Mark, please correct me if I am wrong. If the CPU is big-endian then we will need to convert the address contained at "CPU_RELEASE_ADDR", since it will always be written as a "single 64-bit little-endian value" (quoting from Documentation/arm64/booting.txt).
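(In C terms, that rule means any reader of the slot must treat it as little-endian regardless of the core's data endianness; a sketch with a hypothetical helper, assuming the usual byteorder macros from <asm/byteorder.h>:)

	/* Sketch: the release address is defined by booting.txt as a
	 * little-endian 64-bit value at all times. */
	static inline u64 read_release_addr(volatile u64 *slot)
	{
		return le64_to_cpu(*slot);	/* byte-swap only on BE cores */
	}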
+ENDPROC(secondary_boot_func)
+ENTRY(secondary_switch_to_el2)
switch_el x0, 1f, 0f, 0f
+0:	ret
+1:	armv8_switch_to_el2_m x0
+ENDPROC(secondary_switch_to_el2)
+ENTRY(secondary_switch_to_el1)
switch_el x0, 0f, 1f, 0f
+0:	ret
+1:	armv8_switch_to_el1_m x0, x1
+ENDPROC(secondary_switch_to_el1)
/* Ensure that the literals used by the secondary boot page
are
* assembled within it
*/
.ltorg
.align 4
Similarly to above, this looks like a small alignment for a page.
Please suggest a proper alignment.
I think this is confusion caused by our use of the term "secondary boot page". I don't think it needs to be sized or aligned as a page. We should probably change our terminology.
.globl __secondary_boot_page_size
.type __secondary_boot_page_size, %object
/* Secondary Boot Page ends here */
+__secondary_boot_page_size:
.quad .-secondary_boot_page
[...]
+int fsl_lsch3_wake_seconday_cores(void)
+{
struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
struct ccsr_reset __iomem *rst = (void *)(CONFIG_SYS_FSL_RST_ADDR);
void *boot_loc = (void *)SECONDARY_CPU_BOOT_PAGE;
size_t *boot_page_size = &(__secondary_boot_page_size);
u32 cores, cpu_up_mask = 1;
int i, timeout = 10;
u64 *table = get_spin_tbl_addr();
cores = cpu_mask();
memcpy(boot_loc, &secondary_boot_page, *boot_page_size);
/* Clear spin table so that secondary processors
* observe the correct value after waking up from wfe.
*/
memset(table, 0, CONFIG_MAX_CPUS*ENTRY_SIZE);
flush_dcache_range((unsigned long)boot_loc,
(unsigned long)boot_loc + *boot_page_size);
printf("Waking secondary cores to start from %lx\n", gd->relocaddr);
out_le32(&gur->bootlocptrh, (u32)(gd->relocaddr >> 32));
out_le32(&gur->bootlocptrl, (u32)gd->relocaddr);
out_le32(&gur->scratchrw[6], 1);
asm volatile("dsb st" : : : "memory");
rst->brrl = cores;
asm volatile("dsb st" : : : "memory");
/* fixme: this is only needed for the simulator because secondary cores
 * start to run without waiting for boot release register, then enter
 * "wfe" before the scratch register is set above.
 */
asm volatile("sev");
That feels a little dodgy; a number of things could generate an event before we got here. Is there no way to block them until we've set that up?
This is backward. The secondary cores are expected to be released to run here into GPP bootrom. The bootlocptrh/l points to the location in U-Boot where they continue to run. But for the current simulator, I found the secondary cores don't wait to be released. But they won't get far. The GPP bootrom code puts them into sleep unless the scratchrw above is set. For this situation, they need to be woken up here.
while (timeout--) {
flush_dcache_range((unsigned long)table,
(unsigned long)table + CONFIG_MAX_CPUS * 64);
for (i = 1; i < CONFIG_MAX_CPUS; i++) {
if (table[i * NUM_BOOT_ENTRY + BOOT_ENTRY_ADDR])
cpu_up_mask |= 1 << i;
}
if (hweight32(cpu_up_mask) == hweight32(cores))
break;
udelay(10);
}
Surely we need this before we expect the CPUs to read the values in the table?
Or have I misunderstood?
I don't know how other ARM SoCs set up spin tables. I borrowed the code from Power. The spin tables (one table for each core) are set up by secondary cores, not by the primary core. When the secondary cores are up, they write the spin table and wait there. Because they don't enable d-cache, all tables are in main memory. For the primary core, d-cache is enabled. We have to invalidate the d-cache in order to load from main memory. flush_dcache_range() serves the purpose.
if (timeout <= 0) {
printf("Not all cores (0x%x) are up (0x%x)\n",
cores, cpu_up_mask);
return 1;
}
printf("All (%d) cores are up.\n", hweight32(cores));
return 0;
+}
[...]
diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h index f77e4b8..16ba76e 100644 --- a/arch/arm/include/asm/macro.h +++ b/arch/arm/include/asm/macro.h @@ -105,6 +105,87 @@ lr .req x30 cbz \xreg1, \master_label .endm
+.macro armv8_switch_to_el2_m, xreg1
mov \xreg1, #0x5b1 /* Non-secure EL0/EL1 | HVC | 64bit EL2 */
msr scr_el3, \xreg1
msr cptr_el3, xzr /* Disable coprocessor traps to EL3 */
mov \xreg1, #0x33ff
msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */
/* Initialize SCTLR_EL2 */
msr sctlr_el2, xzr
What about the RES1 bits (e.g. bits 29 & 28)?
We don't seem to initialise them before the eret.
I can't answer this question and below. Adding Arnab as the original author for these changes.
York
You are right, will fix this. According to the ARMv8 ARM, RES1 bits "should be one or preserved". What should the preferred approach be here? Write one or read modify and update the required bits?
/* Return to the EL2_SP2 mode from EL3 */
mov \xreg1, sp
msr sp_el2, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el3
msr vbar_el2, \xreg1 /* Migrate VBAR */
mov x0, #0x3c9
Just noticed a bug here, x0 should in fact be \xreg1. My bad!!
msr spsr_el3, \xreg1 /* EL2_SP2 | D | A | I | F */
msr elr_el3, lr
eret
+.endm
+.macro armv8_switch_to_el1_m, xreg1, xreg2
/* Initialize Generic Timers */
mrs \xreg1, cnthctl_el2
orr \xreg1, \xreg1, #0x3 /* Enable EL1 access to timers */
msr cnthctl_el2, \xreg1
Is there any reason this can't be set to a precise known value? This currently leaves EVNTDIR and EVNTEN in UNKNOWN states (which could differ across CPUs).
You mean EVENTDIR and EVNTI? EVNTEN resets to 0 according to my copy of the documentation. Since events are disabled will the value of these bits matter? I would expect any code that sets EVNTEN to first set these bits to sane values, right? That said, there is no harm in setting them here I guess, does 0 seem like a reasonable value to go with?
An EL1 OS can enable the event stream if it wants in CNTKCTL_EL1, so is there any reason to enable it at EL2?
Are we enabling it at EL2? We are only setting EL1PCEN and EL1PCTEN. EVNTEN resets to 0 so we are leaving events disabled, right?
msr cntvoff_el2, \xreg1
Please initialize cntvoff_el2 using xzr. Due to the aforementioned UNKNOWN bits, this could leave CPUs with differing view of time, and there's no point differing in the first place.
An EL1 OS will not be able to fix this.
I fixed this elsewhere in b924d586d70b (arm64: zero cntvoff_el2).
Will do
mrs \xreg1, cntkctl_el1
orr \xreg1, \xreg1, #0x3 /* Enable EL0 access to timers */
msr cntkctl_el1, \xreg1
Likewise this leaves many bits UNKNOWN and potentially differing across CPUs, though the OS at EL1 should be able to fix this up (and Linux will).
As you said, Linux sets this register and the value used (0x2) is different from what we use here (0x3). So, should we get rid of this section of code?
/* Initialize MPID/MPIDR registers */
mrs \xreg1, midr_el1
mrs \xreg2, mpidr_el1
msr vpidr_el2, \xreg1
msr vmpidr_el2, \xreg2
/* Disable coprocessor traps */
mov \xreg1, #0x33ff
msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */
msr hstr_el2, xzr /* Disable coprocessor traps to EL2 */
mov \xreg1, #3 << 20
msr cpacr_el1, \xreg1 /* Enable FP/SIMD at EL1 */
/* Initialize HCR_EL2 */
mov \xreg1, #(1 << 31) /* 64bit EL1 */
orr \xreg1, \xreg1, #(1 << 29) /* Disable HVC */
msr hcr_el2, \xreg1
/* SCTLR_EL1 initialization */
mov \xreg1, #0x0800
movk \xreg1, #0x30d0, lsl #16
msr sctlr_el1, \xreg1
That doesn't seem to set up all the RES1 bits (e.g. bit 29).
Will fix.
/* Return to the EL1_SP1 mode from EL2 */
mov \xreg1, sp
msr sp_el1, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el2
msr vbar_el1, \xreg1 /* Migrate VBAR */
mov \xreg1, #0x3c5
msr spsr_el2, \xreg1 /* EL1_SP1 | D | A | I | F */
msr elr_el2, lr
eret
+.endm
+#if defined(CONFIG_GICV3)
+.macro gic_wait_for_interrupt_m xreg1
+0 : wfi
mrs \xreg1, ICC_IAR1_EL1
msr ICC_EOIR1_EL1, \xreg1
cbnz \xreg1, 0b
+.endm
+#elif defined(CONFIG_GICV2)
+.macro gic_wait_for_interrupt_m xreg1, wreg2
+0 : wfi
ldr \wreg2, [\xreg1, GICC_AIAR]
str \wreg2, [\xreg1, GICC_AEOIR]
cbnz \wreg2, 0b
+.endm
Assuming I've understood correctly, here we block until we receive SGI 0 from the CPU with GIC ID 0? Do we have a guarantee that the boot CPU will have GIC ID 0?
You are right, for GICv2 there is an assumption that the boot CPU will have CPU interface ID 0. As far as I know there is no such guarantee. It is probably ok to check bits 9:0 and ignore 12:10 here right? The assumption being that whoever is sending SGI 0 is the boot CPU.
Thanks Arnab

@@ -119,3 +107,94 @@ ENTRY(lowlevel_init) mov lr, x29 /* Restore LR */ ret ENDPROC(lowlevel_init)
/* Keep literals not used by the secondary boot page outside it */
.ltorg
.align 4
That looks like a small alignment for a page.
Should this be larger? Or is the "page" a misnomer here?
I think as long as it is aligned to the instruction size and keeps "ldr" happy, it is OK. The code will be copied to the beginning of DDR to run. Any concern here?
"page" is definitely a misnomer here, the comment (and maybe the label below) should probably be altered. The align directive is fine I guess.
I think it can be dropped to .align 2 if it's just to keep the instruction stream aligned as per York's comment, but it's not harmful to have greater alignment (just slightly confusing when trying to figure out why the .align is there).
[...]
+#if defined(CONFIG_GICV3)
gic_wait_for_interrupt_m x0
+#endif
bl secondary_switch_to_el2
+#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
secondary_switch_to_el1
+#endif
+slave_cpu:
wfe
+#ifdef CONFIG_FSL_SMP_RELEASE_ALL
ldr x1, =CPU_RELEASE_ADDR
ldr x0, [x1]
+#else
ldr x0, [x11]
tbnz x0, #0, slave_cpu
+#endif
cbz x0, slave_cpu
br x0 /* branch to the given address */
Just to check, I take it CPUs won't ever be in a big-endian mode at this point?
Don't know yet. Any concern if big-endian here?
I think we missed something here. Mark please correct me if I am wrong. If the CPU is big-endian then we will need to convert the address contained at "CPU_RELEASE_ADDR", since it will always be written as a "single 64-bit little-endian value" (quoting from Documentation/arm64/booting.txt).
It sounds like you figured it out.
As far as I am aware, all you need to do is byte-swap the value if CPUs are big-endian at this point. Linux will configure the CPUs to the endianness it desires before it makes any explicit memory accesses.
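For illustration, a minimal sketch of that byte-swap (assuming the U-Boot image itself is built big-endian, so the compiler predefines __AARCH64EB__; this is not part of the posted patch):

+#ifdef __AARCH64EB__
	rev	x0, x0		/* mailbox is written as a little-endian
				 * 64-bit value, so byte-swap before use
				 * when running big-endian */
+#endif
	br	x0		/* branch to the given address */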
+ENDPROC(secondary_boot_func)
+ENTRY(secondary_switch_to_el2)
switch_el x0, 1f, 0f, 0f
+0: ret
+1: armv8_switch_to_el2_m x0
+ENDPROC(secondary_switch_to_el2)
+ENTRY(secondary_switch_to_el1)
switch_el x0, 0f, 1f, 0f
+0: ret
+1: armv8_switch_to_el1_m x0, x1
+ENDPROC(secondary_switch_to_el1)
/* Ensure that the literals used by the secondary boot page are
 * assembled within it
 */
.ltorg
.align 4
Similarly to above, this looks like a small alignment for a page.
Please suggest a proper alignment.
Think this is confusion caused by our use of the term "secondary boot page". I don't think it needs to be sized or aligned as a page. We should probably change our terminology.
If there's some better terminology we could use, it would certainly make things clearer. I must admit that the alternatives I came up with weren't much better. "Secondary boot region", perhaps?
[...]
/* Initialize SCTLR_EL2 */
msr sctlr_el2, xzr
What about the RES1 bits (e.g. bits 29 & 28)?
We don't seem to initialise them before the eret.
I can't answer this question and below. Adding Arnab as the original author for these changes.
York
You are right, will fix this. According to the ARMv8 ARM, RES1 bits "should be one or preserved". What should the preferred approach be here? Write one or read modify and update the required bits?
As this seems to be the first initialisation of sctlr_el2, I believe the correct thing to do is to write one for those bits, as their value may be UNKNOWN (if not hardwired).
Per my reading of the ARM ARM's description of SBOP, after initialization read-modify-write preserving the value of those bits is the preferred way of modifying the register.
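As a sketch of that first write (assuming ARMv8.0, where the SCTLR_EL2 RES1 bits, including bits 29 and 28, collect to the constant 0x30C50830):

	/* Set SCTLR_EL2 to a known state: all RES1 bits one,
	 * MMU and caches off, everything else zero. */
	mov	\xreg1, #0x0830
	movk	\xreg1, #0x30C5, lsl #16
	msr	sctlr_el2, \xreg1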
/* Return to the EL2_SP2 mode from EL3 */
mov \xreg1, sp
msr sp_el2, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el3
msr vbar_el2, \xreg1 /* Migrate VBAR */
mov x0, #0x3c9
Just noticed a bug here, x0 should in fact be \xreg1. My bad!!
msr spsr_el3, \xreg1 /* EL2_SP2 | D | A | I | F */
msr elr_el3, lr
eret
+.endm
+.macro armv8_switch_to_el1_m, xreg1, xreg2
/* Initialize Generic Timers */
mrs \xreg1, cnthctl_el2
orr \xreg1, \xreg1, #0x3 /* Enable EL1 access to timers */
msr cnthctl_el2, \xreg1
Is there any reason this can't be set to a precise known value? This currently leaves EVNTDIR and EVNTEN in UNKNOWN states (which could differ across CPUs).
You mean EVENTDIR and EVNTI? EVNTEN resets to 0 according to my copy of the documentation.
Sorry, my bad, I mixed up EVENTEN and EVENTI when reading the ARM ARM.
The problem I thought I'd spotted does not exist, so feel free to ignore the above.
Since events are disabled will the value of these bits matter?
I don't think so. I think EVENTEN being zero is sufficient to prevent anything unexpected.
I would expect any code that sets EVNTEN to first set these bits to sane values, right? That said, there is no harm in setting them here I guess, does 0 seem like a reasonable value to go with?
I think leaving them as-is should be fine given EVENTEN resets to 0. As you say, we should expect anything wanting to make use of the event stream to configure EVENTI and EVENTDIR.
An EL1 OS can enable the event stream if it wants in CNTKCTL_EL1, so is there any reason to enable it at EL2?
Are we enabling it at EL2? We are only setting EL1PCEN and EL1PCTEN. EVNTEN resets to 0 so we are leaving events disabled, right?
Yes. Sorry for the noise.
msr cntvoff_el2, \xreg1
Please initialize cntvoff_el2 using xzr. Due to the aforementioned UNKNOWN bits, this could leave CPUs with differing view of time, and there's no point differing in the first place.
An EL1 OS will not be able to fix this.
I fixed this elsewhere in b924d586d70b (arm64: zero cntvoff_el2).
Will do
Cheers.
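In other words, a one-line replacement along these lines:

	msr	cntvoff_el2, xzr	/* zero the virtual counter offset so
					 * all CPUs share one view of time */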
mrs \xreg1, cntkctl_el1
orr \xreg1, \xreg1, #0x3 /* Enable EL0 access to timers */
msr cntkctl_el1, \xreg1
Likewise this leaves many bits UNKNOWN and potentially differing across CPUs, though the OS at EL1 should be able to fix this up (and Linux will).
As you said, Linux sets this register and the value used (0x2) is different from what we use here (0x3). So, should we get rid of this section of code?
If U-Boot isn't doing anything at EL0 and doesn't need an event stream, then I'd recommend dropping this and leaving it up to the OS to configure.
/* Initialize MPID/MPIDR registers */
mrs \xreg1, midr_el1
mrs \xreg2, mpidr_el1
msr vpidr_el2, \xreg1
msr vmpidr_el2, \xreg2
/* Disable coprocessor traps */
mov \xreg1, #0x33ff
msr cptr_el2, \xreg1 /* Disable coprocessor traps to EL2 */
msr hstr_el2, xzr /* Disable coprocessor traps to EL2 */
mov \xreg1, #3 << 20
msr cpacr_el1, \xreg1 /* Enable FP/SIMD at EL1 */
/* Initialize HCR_EL2 */
mov \xreg1, #(1 << 31) /* 64bit EL1 */
orr \xreg1, \xreg1, #(1 << 29) /* Disable HVC */
msr hcr_el2, \xreg1
/* SCTLR_EL1 initialization */
mov \xreg1, #0x0800
movk \xreg1, #0x30d0, lsl #16
msr sctlr_el1, \xreg1
That doesn't seem to set up all the RES1 bits (e.g. bit 29).
Will fix.
Cheers.
/* Return to the EL1_SP1 mode from EL2 */
mov \xreg1, sp
msr sp_el1, \xreg1 /* Migrate SP */
mrs \xreg1, vbar_el2
msr vbar_el1, \xreg1 /* Migrate VBAR */
mov \xreg1, #0x3c5
msr spsr_el2, \xreg1 /* EL1_SP1 | D | A | I | F */
msr elr_el2, lr
eret
+.endm
+#if defined(CONFIG_GICV3)
+.macro gic_wait_for_interrupt_m xreg1
+0 : wfi
mrs \xreg1, ICC_IAR1_EL1
msr ICC_EOIR1_EL1, \xreg1
cbnz \xreg1, 0b
+.endm
+#elif defined(CONFIG_GICV2)
+.macro gic_wait_for_interrupt_m xreg1, wreg2
+0 : wfi
ldr \wreg2, [\xreg1, GICC_AIAR]
str \wreg2, [\xreg1, GICC_AEOIR]
cbnz \wreg2, 0b
+.endm
Assuming I've understood correctly, here we block until we receive SGI 0 from the CPU with GIC ID 0? Do we have a guarantee that the boot CPU will have GIC ID 0?
You are right, for GICv2 there is an assumption that the boot CPU will have CPU interface ID 0. As far as I know there is no such guarantee. It is probably ok to check bits 9:0 and ignore 12:10 here right? The assumption being that whoever is sending SGI 0 is the boot CPU.
That sounds fine to me.
I assume that by the time we get to an OS no CPUs should be waiting on an interrupt?
For 32-bit Linux we broadcast SGIs until each CPU has read its GIC CPU ID from the GIC (necessary on platforms where SGIs form part of the secondary boot protocol). On 64-bit we don't do that currently but it would be nice to know that if we did happen to broadcast an SGI it wouldn't cause bad things to happen inside U-Boot for some secondary CPUs.
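To make the suggestion above concrete, one possible shape for the GICv2 wait, masking the interrupt ID in bits [9:0] for the comparison while ignoring the source CPU ID in [12:10] (a sketch, not the posted code):

+0 : wfi
	ldr	\wreg2, [\xreg1, GICC_AIAR]
	str	\wreg2, [\xreg1, GICC_AEOIR]	/* EOI with the full value */
	and	\wreg2, \wreg2, #0x3ff		/* compare interrupt ID only */
	cbnz	\wreg2, 0b			/* wait for SGI 0, any sender */

Writing the unmasked value back to AEOIR matters: for SGIs the GIC expects the source CPU ID bits to be included in the end-of-interrupt write.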
Cheers, Mark.

On Tue, Jul 08, 2014 at 06:56:26PM +0100, York Sun wrote:
On 07/04/2014 05:31 AM, Mark Rutland wrote:
Hi York,
I spotted a couple of generic issues below. Most of these are issues with the existing code that you happen to be moving around, rather than with the new code this patch introduces.
There are a couple of gotchas around secondary startup that are painful with the bootwrapper for arm64 at present, and I think that we can avoid them by construction for U-Boot. More on that below.
On Fri, Jun 27, 2014 at 05:54:08PM +0100, York Sun wrote:
Secondary cores need to be released from holdoff by boot release registers. With GPP bootrom, they can boot from main memory directly. Individual spin table is used for each core. If a single release address is needed, defining macro CONFIG_FSL_SMP_RELEASE_ALL will use the CPU_RELEASE_ADDR. Spin table and the boot page is reserved in device tree so OS won't overwrite.
Signed-off-by: York Sun yorksun@freescale.com Signed-off-by: Arnab Basu arnab.basu@freescale.com
This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
arch/arm/cpu/armv8/fsl-lsch3/Makefile | 2 + arch/arm/cpu/armv8/fsl-lsch3/cpu.c | 13 ++ arch/arm/cpu/armv8/fsl-lsch3/cpu.h | 1 + arch/arm/cpu/armv8/fsl-lsch3/fdt.c | 56 +++++++ arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S | 119 +++++++++++--- arch/arm/cpu/armv8/fsl-lsch3/mp.c | 171 +++++++++++++++++++++ arch/arm/cpu/armv8/fsl-lsch3/mp.h | 36 +++++ arch/arm/cpu/armv8/transition.S | 63 +------- arch/arm/include/asm/arch-fsl-lsch3/config.h | 3 +- arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h | 35 +++++ arch/arm/include/asm/macro.h | 81 ++++++++++ arch/arm/lib/gic_64.S | 10 +- common/board_f.c | 2 +- 13 files changed, 502 insertions(+), 90 deletions(-) create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h
[...]
diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c new file mode 100644 index 0000000..cd34e16 --- /dev/null +++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c @@ -0,0 +1,56 @@ +/*
+ * Copyright 2014 Freescale Semiconductor, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+#include <common.h> +#include <libfdt.h> +#include <fdt_support.h> +#include "mp.h"
+#ifdef CONFIG_MP
+void ft_fixup_cpu(void *blob)
+{
int off;
__maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
u64 *reg;
u64 val;
off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
while (off != -FDT_ERR_NOTFOUND) {
reg = (u64 *)fdt_getprop(blob, off, "reg", 0);
if (reg) {
val = spin_tbl_addr;
+#ifndef CONFIG_FSL_SMP_RELEASE_ALL
val += id_to_core(fdt64_to_cpu(*reg)) * SIZE_BOOT_ENTRY;
In Linux we read /cpus/#address-cells to determine the size of a CPU's reg property (and have dts where this is 1 cell). Will the above work for that?
I don't think so. Will have to add the same size check.
Cheers.
+#endif
val = cpu_to_fdt64(val);
fdt_setprop_string(blob, off, "enable-method",
"spin-table");
fdt_setprop(blob, off, "cpu-release-addr",
&val, sizeof(val));
} else {
puts("cpu NULL\n");
}
off = fdt_node_offset_by_prop_value(blob, off, "device_type",
"cpu", 4);
}
/*
* Boot page and spin table can be reserved here if not done statically
* in device tree.
*
* fdt_add_mem_rsv(blob, bootpg,
* *((u64 *)&(__secondary_boot_page_size)));
* If defined CONFIG_FSL_SMP_RELEASE_ALL, the release address should
* also be reserved.
*/
I think that this reservation should _always_ be added by U-Boot unless specifically overridden.
A problem I had with the arm64 bootwrapper when adding PSCI support and now (as I am moving stuff about) was that the DTS in the kernel tree had a memreserve out-of-sync with what the wrapper actually needed. While I can add a new reservation, I can't remove any in case they are for something else, so I end up protecting too much, wasting memory.
Given that the reservation is to protect data which U-Boot is in control of choosing the address for, I think the only sane thing to do is for U-Boot to always add the reservation.
That way U-Boot can change and existing DTBs will just work. We won't end up protecting too much or too little.
That's the same problem I am facing. I can add the memory reservation in u-boot, but it may overlap with one already in the device tree. I guess it should be OK if the range u-boot adds is the same or smaller. Will debug to see.
As far as I am aware, Linux should handle overlapping memreserves sanely (we just iterate over them and mark that range of memory unusable).
If that doesn't happen to work at the moment then I'm happy to have a go at making it so.
[...]
@@ -119,3 +107,94 @@ ENTRY(lowlevel_init) mov lr, x29 /* Restore LR */ ret ENDPROC(lowlevel_init)
/* Keep literals not used by the secondary boot page outside it */
.ltorg
.align 4
That looks like a small alignment for a page.
Should this be larger? Or is the "page" a misnomer here?
I think as long as it is aligned to the instruction size and keeps "ldr" happy, it is OK. The code will be copied to the beginning of DDR to run. Any concern here?
I think this is fine.
Looking over this again I was inferring the wrong thing from the comment above the .ltorg and the .align.
Sorry for the noise.
That said, for aarch64 .align takes a power of two and instructions are 32-bit, so wouldn't .align 2 be sufficient?
If so, could the comment be updated to say something like "keep relevant literals close, don't break alignment of the instruction stream"? That would avoid my confusion.
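Perhaps something like this sketch:

	/* Keep literals not used by the secondary boot page outside it,
	 * and keep the following instruction stream 4-byte aligned
	 * (AArch64 instructions are 32 bits; .align takes a power of two).
	 */
	.ltorg
	.align 2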
.global secondary_boot_page
+secondary_boot_page:
.global __spin_table
+__spin_table:
.space CONFIG_MAX_CPUS*ENTRY_SIZE
.align 4
/* Secondary Boot Page starts here */
+ENTRY(secondary_boot_func)
/*
* PIR calculation from MPIDR_EL1
Sorry if I'm asking a stupid question, but what is "PIR"?
It is a term we use for all Power SoCs, processor ID register, to uniquely identify each core. Since the spin table code is the same idea, I just borrowed the term. It can be rewritten to fit ARM's context.
I see. Are we just calculating a linear index for each CPU? It might be better to state that for the sake of those of us without a background in Power development. :)
* MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
* MPIDR[7:2] = AFF0_RES
* MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
* MPIDR[23:16] = AFF2_CLUSTERID
* MPIDR[24] = MT
* MPIDR[29:25] =RES
Could we say RES0 here? That would match the documentation in the ARM ARM and make things a bit clearer.
Sure.
Also, missing space after '='?
Yes.
* MPIDR[30] = U
* MPIDR[31] = ME
My ARMv8 ARM ARM shows this as RES1, but appears to be self-contradictory. I'll query this internally. I don't think that matters here anyway.
* MPIDR[39:32] = AFF3
* We only use AFF0_CPUID and AFF1_CLUSTERID for now
* until AFF2_CLUSTERID and AFF3 have non-zero values.
*/
mrs x0, mpidr_el1
ubfm x1, x0, #8, #15
ubfm x2, x0, #0, #1
orr x10, x2, x1, lsl #2 /* x10 has PIR */
ubfm x9, x0, #0, #15 /* w9 has 16-bit original PIR */
lsl x1, x10, #6 /* spin table is padded to 64 bytes per core */
ldr x0, =(SECONDARY_CPU_BOOT_PAGE)
ldr x3, =__spin_table
ldr x4, =secondary_boot_page
sub x3, x3, x4
add x0, x0, x3
add x11, x1, x0
str x9, [x11, #16] /* ENTRY_PIR */
mov x4, #1
str x4, [x11] /* ENTRY_ADDR */
dsb sy
isb
What is the isb intended to synchronize?
Could we get comments on barriers? Even when coming back to code oneself wrote it's easy to miss a subtlety as to why one is needed.
Probably we don't need the "isb" here. It is a translation from Power SoC spin table code. I think originally it was used to sync out-of-order execution.
I see. As far as I can see the dsb should be sufficient here unless we're trying to synchronise with a prior write to a system register. The dsb should halt execution until the prior memory accesses are complete (while there could be stale prefetched instructions we don't seem to be changing any execution context so that should be ok).
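So the publishing sequence could carry comments on the barrier and drop the isb, roughly (a sketch of what such comments might look like):

	str	x9, [x11, #16]	/* ENTRY_PIR */
	mov	x4, #1
	str	x4, [x11]	/* ENTRY_ADDR: mark this entry valid */
	dsb	sy		/* complete the table writes before the
				 * master core polls them and before we
				 * sleep in wfe */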
+#if defined(CONFIG_GICV3)
gic_wait_for_interrupt_m x0
+#endif
bl secondary_switch_to_el2
+#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
secondary_switch_to_el1
+#endif
+slave_cpu:
wfe
+#ifdef CONFIG_FSL_SMP_RELEASE_ALL
ldr x1, =CPU_RELEASE_ADDR
ldr x0, [x1]
+#else
ldr x0, [x11]
tbnz x0, #0, slave_cpu
+#endif
cbz x0, slave_cpu
br x0 /* branch to the given address */
Just to check, I take it CPUs won't ever be in a big-endian mode at this point?
Don't know yet. Any concern if big-endian here?
The value written to the spin-table mailbox by Linux is always little-endian, so if the CPU is in a big-endian mode it will need to reverse the bytes before branching. Linux will configure the endianness it wants before it makes any explicit memory accesses, so shouldn't require anything else special with regards to running big-endian (other than all CPUs it's told about supporting BE, of course).
+ENDPROC(secondary_boot_func)
+ENTRY(secondary_switch_to_el2)
switch_el x0, 1f, 0f, 0f
+0: ret +1: armv8_switch_to_el2_m x0 +ENDPROC(secondary_switch_to_el2)
+ENTRY(secondary_switch_to_el1)
switch_el x0, 0f, 1f, 0f
+0: ret +1: armv8_switch_to_el1_m x0, x1 +ENDPROC(secondary_switch_to_el1)
/* Ensure that the literals used by the secondary boot page are
* assembled within it
*/
.ltorg
.align 4
Similarly to above, this looks like a small alignment for a page.
Please suggest a proper alignment.
Again, this was a misreading on my part. Apologies for the noise.
I think my comments above apply here too.
.globl __secondary_boot_page_size
.type __secondary_boot_page_size, %object
/* Secondary Boot Page ends here */
+__secondary_boot_page_size:
.quad .-secondary_boot_page
[...]
+int fsl_lsch3_wake_seconday_cores(void)
+{
struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
struct ccsr_reset __iomem *rst = (void *)(CONFIG_SYS_FSL_RST_ADDR);
void *boot_loc = (void *)SECONDARY_CPU_BOOT_PAGE;
size_t *boot_page_size = &(__secondary_boot_page_size);
u32 cores, cpu_up_mask = 1;
int i, timeout = 10;
u64 *table = get_spin_tbl_addr();
cores = cpu_mask();
memcpy(boot_loc, &secondary_boot_page, *boot_page_size);
/* Clear spin table so that secondary processors
* observe the correct value after waking up from wfe.
*/
memset(table, 0, CONFIG_MAX_CPUS*ENTRY_SIZE);
flush_dcache_range((unsigned long)boot_loc,
(unsigned long)boot_loc + *boot_page_size);
printf("Waking secondary cores to start from %lx\n", gd->relocaddr);
out_le32(&gur->bootlocptrh, (u32)(gd->relocaddr >> 32));
out_le32(&gur->bootlocptrl, (u32)gd->relocaddr);
out_le32(&gur->scratchrw[6], 1);
asm volatile("dsb st" : : : "memory");
rst->brrl = cores;
asm volatile("dsb st" : : : "memory");
/* fixme: this is only needed for the simulator because secondary cores
* start to run without waiting for boot release register, then enter
* "wfe" before the scratch register is set above.
*/
asm volatile("sev");
That feels a little dodgy; a number of things could generate an event before we got here. Is there no way to block them until we've set that up?
This is backward. The secondary cores are expected to be released to run here into GPP bootrom. The bootlocptrh/l points to the location in U-Boot where they continue to run. But for the current simulator, I found the secondary cores don't wait to be released. But they won't get far. The GPP bootrom code puts them into sleep unless the scratchrw above is set. For this situation, they need to be woken up here.
Ok. I take it the explicit sev will get removed at some point?
It's just worth bearing in mind that if those CPUs in the simulator are blocked on a wfe they may wake up for reasons other than an explicit sev.
while (timeout--) {
flush_dcache_range((unsigned long)table, (unsigned long)table +
CONFIG_MAX_CPUS * 64);
for (i = 1; i < CONFIG_MAX_CPUS; i++) {
if (table[i * NUM_BOOT_ENTRY + BOOT_ENTRY_ADDR])
cpu_up_mask |= 1 << i;
}
if (hweight32(cpu_up_mask) == hweight32(cores))
break;
udelay(10);
}
Surely we need this before we expect the CPUs to read the values in the table?
Or have I misunderstood?
I don't know how other ARM SoCs set up spin tables. I borrowed the code from Power. The spin tables (one table for each core) are set up by secondary cores, not by the primary core. When the secondary cores are up, they write the spin table and wait there. Because they don't enable d-cache, all tables are in main memory. For the primary core, d-cache is enabled. We have to invalidate the d-cache in order to load from main memory. flush_dcache_range() serves the purpose.
I see. So here we're just waiting for the secondary CPUs to setup their tables.
Sorry for the noise here, and thanks for the clarification.
Thanks, Mark.

Spin table is at the very beginning of boot page. All cores are released by the first entry address. If individual entry address is needed, undef CONFIG_FSL_SMP_RELEASE_ALL. FDT fixup is called to update spin table address.
Signed-off-by: York Sun yorksun@freescale.com --- This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
board/freescale/ls2085a/ls2085a.c | 2 ++ include/configs/ls2085a_common.h | 2 ++ 2 files changed, 4 insertions(+)
diff --git a/board/freescale/ls2085a/ls2085a.c b/board/freescale/ls2085a/ls2085a.c index a18db1d..3daa787 100644 --- a/board/freescale/ls2085a/ls2085a.c +++ b/board/freescale/ls2085a/ls2085a.c @@ -88,6 +88,8 @@ void ft_board_setup(void *blob, bd_t *bd) phys_addr_t base; phys_size_t size;
+ ft_cpu_setup(blob, bd); + /* limit the memory size to bank 1 until Linux can handle 40-bit PA */ base = getenv_bootm_low(); size = getenv_bootm_size(); diff --git a/include/configs/ls2085a_common.h b/include/configs/ls2085a_common.h index 49e2971..6da431f 100644 --- a/include/configs/ls2085a_common.h +++ b/include/configs/ls2085a_common.h @@ -56,6 +56,7 @@ */ #define SECONDARY_CPU_BOOT_PAGE (CONFIG_SYS_SDRAM_BASE) #define CPU_RELEASE_ADDR SECONDARY_CPU_BOOT_PAGE +#define CONFIG_FSL_SMP_RELEASE_ALL
/* Generic Timer Definitions */ #define COUNTER_FREQUENCY 12000000 /* 12MHz */ @@ -171,6 +172,7 @@
/* Miscellaneous configurable options */ #define CONFIG_SYS_LOAD_ADDR (CONFIG_SYS_DDR_SDRAM_BASE + 0x10000000) +#define CONFIG_ARCH_EARLY_INIT_R
/* Physical Memory Map */ /* fixme: these need to be checked against the board */

When booting with SP, RCW resides at the beginning of IFC NOR flash.
Signed-off-by: York Sun yorksun@freescale.com --- This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
include/configs/ls2085a_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/configs/ls2085a_common.h b/include/configs/ls2085a_common.h index 6da431f..049329d 100644 --- a/include/configs/ls2085a_common.h +++ b/include/configs/ls2085a_common.h @@ -15,7 +15,7 @@ #define CONFIG_GICV3
/* Link Definitions */ -#define CONFIG_SYS_TEXT_BASE 0x30000000 +#define CONFIG_SYS_TEXT_BASE 0x30001000
#define CONFIG_SYS_NO_FLASH