[PATCH 0/2] A64/R40 DRAM controller dual-rank-related changes

This patchset contains two patches.
The first one enables asymmetric dual rank DRAM on A64. This is needed for 3GiB PinePhone, which has 2GiB rank 0 and 1GiB rank 1. This patch is already used by the firmware flashed to PinePhone by factory.
The second one enables dual rank (and asymmetric dual rank, although not tested because of lack of real board) on R40. In order to support single rank and dual rank at the same time, a new rank detection code is implemented (because PIR_QSGATE-based one does not work on R40). The code enables some error report facility of the DRAM controller, and then tries to access rank 1 and then check for error. It's placed at 2nd patch because it depends on the function that calculates rank 0 size (and rank 1 base address) introduced in PATCH 1.
Icenowy Zheng (2): sunxi: support asymmetric dual rank DRAM on A64/R40 sunxi: enable dual rank memory on R40
.../include/asm/arch-sunxi/dram_sunxi_dw.h | 11 +- arch/arm/mach-sunxi/dram_sunxi_dw.c | 149 +++++++++++++++--- 2 files changed, 131 insertions(+), 29 deletions(-)

Previously we have known that R40 has a configuration register for its rank 1, which allows different configuration than rank 0. Reverse engineering of newest libdram of A64 from Allwinner shows that A64 has this register too. It's bit 0 (which enables dual rank in rank 0 configuration register) means a dedicated rank size setup is used for rank 1.
Now, Pine64 scheduled to use a 3GiB LPDDR3 DRAM chip (which has 2GiB rank 0 and 1GiB rank 1) on PinePhone, that makes asymmetric dual rank DRAM support necessary.
Add this support. The code could support both A64 and R40, but because dual rank detection is broken on R40 now, we cannot really use it on R40 currently.
Signed-off-by: Icenowy Zheng icenowy@aosc.io --- .../include/asm/arch-sunxi/dram_sunxi_dw.h | 11 ++- arch/arm/mach-sunxi/dram_sunxi_dw.c | 94 +++++++++++++++---- 2 files changed, 82 insertions(+), 23 deletions(-)
diff --git a/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h b/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h index a5a7ebde44..e843c14202 100644 --- a/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h +++ b/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h @@ -215,12 +215,17 @@ struct sunxi_mctl_ctl_reg { #define NR_OF_BYTE_LANES (32 / BITS_PER_BYTE) /* The eight data lines (DQn) plus DM, DQS and DQSN */ #define LINES_PER_BYTE_LANE (BITS_PER_BYTE + 3) -struct dram_para { + +struct rank_para { u16 page_size; - u8 bus_full_width; - u8 dual_rank; u8 row_bits; u8 bank_bits; +}; + +struct dram_para { + u8 dual_rank; + u8 bus_full_width; + struct rank_para ranks[2]; const u8 dx_read_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE]; const u8 dx_write_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE]; const u8 ac_delays[31]; diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c index d0600011ff..2b9d631d49 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -399,11 +399,19 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para) #else #error Unsupported DRAM type! #endif - (para->bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) | + (para->ranks[0].bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) | MCTL_CR_BUS_FULL_WIDTH(para->bus_full_width) | (para->dual_rank ? MCTL_CR_DUAL_RANK : MCTL_CR_SINGLE_RANK) | - MCTL_CR_PAGE_SIZE(para->page_size) | - MCTL_CR_ROW_BITS(para->row_bits), &mctl_com->cr); + MCTL_CR_PAGE_SIZE(para->ranks[0].page_size) | + MCTL_CR_ROW_BITS(para->ranks[0].row_bits), &mctl_com->cr); + + if (para->dual_rank && (socid == SOCID_A64 || socid == SOCID_R40)) { + writel((para->ranks[1].bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) | + MCTL_CR_BUS_FULL_WIDTH(para->bus_full_width) | + MCTL_CR_DUAL_RANK | + MCTL_CR_PAGE_SIZE(para->ranks[1].page_size) | + MCTL_CR_ROW_BITS(para->ranks[1].row_bits), &mctl_com->cr_r1); + }
if (socid == SOCID_R40) { if (para->dual_rank) @@ -646,35 +654,63 @@ static int mctl_channel_init(uint16_t socid, struct dram_para *para) return 0; }
-static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) +/* + * Test if memory at offset offset matches memory at a certain base + */ +static bool mctl_mem_matches_base(u32 offset, ulong base) +{ + /* Try to write different values to RAM at two addresses */ + writel(0, base); + writel(0xaa55aa55, base + offset); + dsb(); + /* Check if the same value is actually observed when reading back */ + return readl(base) == + readl(base + offset); +} + +static void mctl_auto_detect_dram_size_rank(uint16_t socid, struct dram_para *para, ulong base, struct rank_para *rank) { /* detect row address bits */ - para->page_size = 512; - para->row_bits = 16; - para->bank_bits = 2; + rank->page_size = 512; + rank->row_bits = 16; + rank->bank_bits = 2; mctl_set_cr(socid, para);
- for (para->row_bits = 11; para->row_bits < 16; para->row_bits++) - if (mctl_mem_matches((1 << (para->row_bits + para->bank_bits)) * para->page_size)) + for (rank->row_bits = 11; rank->row_bits < 16; rank->row_bits++) + if (mctl_mem_matches_base((1 << (rank->row_bits + rank->bank_bits)) * rank->page_size, base)) break;
/* detect bank address bits */ - para->bank_bits = 3; + rank->bank_bits = 3; mctl_set_cr(socid, para);
- for (para->bank_bits = 2; para->bank_bits < 3; para->bank_bits++) - if (mctl_mem_matches((1 << para->bank_bits) * para->page_size)) + for (rank->bank_bits = 2; rank->bank_bits < 3; rank->bank_bits++) + if (mctl_mem_matches_base((1 << rank->bank_bits) * rank->page_size, base)) break;
/* detect page size */ - para->page_size = 8192; + rank->page_size = 8192; mctl_set_cr(socid, para);
- for (para->page_size = 512; para->page_size < 8192; para->page_size *= 2) - if (mctl_mem_matches(para->page_size)) + for (rank->page_size = 512; rank->page_size < 8192; rank->page_size *= 2) + if (mctl_mem_matches_base(rank->page_size, base)) break; }
+static unsigned long mctl_calc_rank_size(struct rank_para *rank) +{ + return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size; +} + +static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) +{ + mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]); + + if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) { + mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE + mctl_calc_rank_size(¶->ranks[0]), ¶->ranks[1]); + } +} + /* * The actual values used here are taken from Allwinner provided boot0 * binaries, though they are probably board specific, so would likely benefit @@ -769,12 +805,23 @@ unsigned long sunxi_dram_init(void) struct sunxi_mctl_ctl_reg * const mctl_ctl = (struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
+ unsigned long size; + struct dram_para para = { .dual_rank = 1, .bus_full_width = 1, - .row_bits = 15, - .bank_bits = 3, - .page_size = 4096, + .ranks = { + { + .row_bits = 15, + .bank_bits = 3, + .page_size = 4096, + }, + { + .row_bits = 15, + .bank_bits = 3, + .page_size = 4096, + } + },
#if defined(CONFIG_MACH_SUN8I_H3) .dx_read_delays = SUN8I_H3_DX_READ_DELAYS, @@ -846,6 +893,13 @@ unsigned long sunxi_dram_init(void) mctl_auto_detect_dram_size(socid, ¶); mctl_set_cr(socid, ¶);
- return (1UL << (para.row_bits + para.bank_bits)) * para.page_size * - (para.dual_rank ? 2 : 1); + size = mctl_calc_rank_size(¶.ranks[0]); + if (socid == SOCID_A64 || socid == SOCID_R40) { + if (para.dual_rank) + size += mctl_calc_rank_size(¶.ranks[1]); + } else if (para.dual_rank) { + size *= 2; + } + + return size; }

On Fri, 26 Feb 2021 00:13:24 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Previously we have known that R40 has a configuration register for its rank 1, which allows different configuration than rank 0. Reverse engineering of newest libdram of A64 from Allwinner shows that A64 has this register too. It's bit 0 (which enables dual rank in rank 0 configuration register) means a dedicated rank size setup is used for rank 1.
Now, Pine64 scheduled to use a 3GiB LPDDR3 DRAM chip (which has 2GiB rank 0 and 1GiB rank 1) on PinePhone, that makes asymmetric dual rank DRAM support necessary.
Add this support. The code could support both A64 and R40, but because dual rank detection is broken on R40 now, we cannot really use it on R40 currently.
Signed-off-by: Icenowy Zheng icenowy@aosc.io
So this looks alright to me. I tried to re-use the existing mctl_mem_matches() implementation for the _base() version you introduce, but interestingly this increases the code size - I guess we reach the limits of the toolchain garbage collection here. So it's some code duplication, but for the sake of keeping the SPL small, I am OK with that.
I couldn't test this, but reportedly this makes quite some Pinephone people happy, so:
Reviewed-by: Andre Przywara andre.przywara@arm.com
Cheers, Andre
.../include/asm/arch-sunxi/dram_sunxi_dw.h | 11 ++- arch/arm/mach-sunxi/dram_sunxi_dw.c | 94 +++++++++++++++---- 2 files changed, 82 insertions(+), 23 deletions(-)
diff --git a/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h b/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h index a5a7ebde44..e843c14202 100644 --- a/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h +++ b/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h @@ -215,12 +215,17 @@ struct sunxi_mctl_ctl_reg { #define NR_OF_BYTE_LANES (32 / BITS_PER_BYTE) /* The eight data lines (DQn) plus DM, DQS and DQSN */ #define LINES_PER_BYTE_LANE (BITS_PER_BYTE + 3) -struct dram_para {
+struct rank_para { u16 page_size;
- u8 bus_full_width;
- u8 dual_rank; u8 row_bits; u8 bank_bits;
+};
+struct dram_para {
- u8 dual_rank;
- u8 bus_full_width;
- struct rank_para ranks[2]; const u8 dx_read_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE]; const u8 dx_write_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE]; const u8 ac_delays[31];
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c index d0600011ff..2b9d631d49 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -399,11 +399,19 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para) #else #error Unsupported DRAM type! #endif
(para->bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) |
(para->ranks[0].bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) | MCTL_CR_BUS_FULL_WIDTH(para->bus_full_width) | (para->dual_rank ? MCTL_CR_DUAL_RANK : MCTL_CR_SINGLE_RANK) |
MCTL_CR_PAGE_SIZE(para->page_size) |
MCTL_CR_ROW_BITS(para->row_bits), &mctl_com->cr);
MCTL_CR_PAGE_SIZE(para->ranks[0].page_size) |
MCTL_CR_ROW_BITS(para->ranks[0].row_bits), &mctl_com->cr);
if (para->dual_rank && (socid == SOCID_A64 || socid == SOCID_R40)) {
writel((para->ranks[1].bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) |
MCTL_CR_BUS_FULL_WIDTH(para->bus_full_width) |
MCTL_CR_DUAL_RANK |
MCTL_CR_PAGE_SIZE(para->ranks[1].page_size) |
MCTL_CR_ROW_BITS(para->ranks[1].row_bits), &mctl_com->cr_r1);
}
if (socid == SOCID_R40) { if (para->dual_rank)
@@ -646,35 +654,63 @@ static int mctl_channel_init(uint16_t socid, struct dram_para *para) return 0; }
-static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) +/*
- Test if memory at offset offset matches memory at a certain base
- */
+static bool mctl_mem_matches_base(u32 offset, ulong base) +{
- /* Try to write different values to RAM at two addresses */
- writel(0, base);
- writel(0xaa55aa55, base + offset);
- dsb();
- /* Check if the same value is actually observed when reading back */
- return readl(base) ==
readl(base + offset);
+}
+static void mctl_auto_detect_dram_size_rank(uint16_t socid, struct dram_para *para, ulong base, struct rank_para *rank) { /* detect row address bits */
- para->page_size = 512;
- para->row_bits = 16;
- para->bank_bits = 2;
- rank->page_size = 512;
- rank->row_bits = 16;
- rank->bank_bits = 2; mctl_set_cr(socid, para);
- for (para->row_bits = 11; para->row_bits < 16; para->row_bits++)
if (mctl_mem_matches((1 << (para->row_bits + para->bank_bits)) * para->page_size))
for (rank->row_bits = 11; rank->row_bits < 16; rank->row_bits++)
if (mctl_mem_matches_base((1 << (rank->row_bits + rank->bank_bits)) * rank->page_size, base)) break;
/* detect bank address bits */
- para->bank_bits = 3;
- rank->bank_bits = 3; mctl_set_cr(socid, para);
- for (para->bank_bits = 2; para->bank_bits < 3; para->bank_bits++)
if (mctl_mem_matches((1 << para->bank_bits) * para->page_size))
for (rank->bank_bits = 2; rank->bank_bits < 3; rank->bank_bits++)
if (mctl_mem_matches_base((1 << rank->bank_bits) * rank->page_size, base)) break;
/* detect page size */
- para->page_size = 8192;
- rank->page_size = 8192; mctl_set_cr(socid, para);
- for (para->page_size = 512; para->page_size < 8192; para->page_size *= 2)
if (mctl_mem_matches(para->page_size))
- for (rank->page_size = 512; rank->page_size < 8192; rank->page_size *= 2)
if (mctl_mem_matches_base(rank->page_size, base)) break;
}
+static unsigned long mctl_calc_rank_size(struct rank_para *rank) +{
- return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size;
+}
+static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) +{
- mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
- if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) {
mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE + mctl_calc_rank_size(¶->ranks[0]), ¶->ranks[1]);
- }
+}
/*
- The actual values used here are taken from Allwinner provided boot0
- binaries, though they are probably board specific, so would likely benefit
@@ -769,12 +805,23 @@ unsigned long sunxi_dram_init(void) struct sunxi_mctl_ctl_reg * const mctl_ctl = (struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
- unsigned long size;
- struct dram_para para = { .dual_rank = 1, .bus_full_width = 1,
.row_bits = 15,
.bank_bits = 3,
.page_size = 4096,
.ranks = {
{
.row_bits = 15,
.bank_bits = 3,
.page_size = 4096,
},
{
.row_bits = 15,
.bank_bits = 3,
.page_size = 4096,
}
},
#if defined(CONFIG_MACH_SUN8I_H3) .dx_read_delays = SUN8I_H3_DX_READ_DELAYS, @@ -846,6 +893,13 @@ unsigned long sunxi_dram_init(void) mctl_auto_detect_dram_size(socid, ¶); mctl_set_cr(socid, ¶);
- return (1UL << (para.row_bits + para.bank_bits)) * para.page_size *
(para.dual_rank ? 2 : 1);
- size = mctl_calc_rank_size(¶.ranks[0]);
- if (socid == SOCID_A64 || socid == SOCID_R40) {
if (para.dual_rank)
size += mctl_calc_rank_size(¶.ranks[1]);
- } else if (para.dual_rank) {
size *= 2;
- }
- return size;
}

On Thu, Feb 25, 2021 at 4:14 PM Icenowy Zheng icenowy@aosc.io wrote:
Previously we have known that R40 has a configuration register for its rank 1, which allows different configuration than rank 0. Reverse engineering of newest libdram of A64 from Allwinner shows that A64 has this register too. It's bit 0 (which enables dual rank in rank 0 configuration register) means a dedicated rank size setup is used for rank 1.
Now, Pine64 scheduled to use a 3GiB LPDDR3 DRAM chip (which has 2GiB rank 0 and 1GiB rank 1) on PinePhone, that makes asymmetric dual rank DRAM support necessary.
Add this support. The code could support both A64 and R40, but because dual rank detection is broken on R40 now, we cannot really use it on R40 currently.
Signed-off-by: Icenowy Zheng icenowy@aosc.io
Tested-by: Peter Robinson pbrobinson@gmail.com
Tested on Pinephone Braveheart and 3Gb variants plus a Pine64 and all work as expected.
.../include/asm/arch-sunxi/dram_sunxi_dw.h | 11 ++- arch/arm/mach-sunxi/dram_sunxi_dw.c | 94 +++++++++++++++---- 2 files changed, 82 insertions(+), 23 deletions(-)
diff --git a/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h b/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h index a5a7ebde44..e843c14202 100644 --- a/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h +++ b/arch/arm/include/asm/arch-sunxi/dram_sunxi_dw.h @@ -215,12 +215,17 @@ struct sunxi_mctl_ctl_reg { #define NR_OF_BYTE_LANES (32 / BITS_PER_BYTE) /* The eight data lines (DQn) plus DM, DQS and DQSN */ #define LINES_PER_BYTE_LANE (BITS_PER_BYTE + 3) -struct dram_para {
+struct rank_para { u16 page_size;
u8 bus_full_width;
u8 dual_rank; u8 row_bits; u8 bank_bits;
+};
+struct dram_para {
u8 dual_rank;
u8 bus_full_width;
struct rank_para ranks[2]; const u8 dx_read_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE]; const u8 dx_write_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE]; const u8 ac_delays[31];
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c index d0600011ff..2b9d631d49 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -399,11 +399,19 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para) #else #error Unsupported DRAM type! #endif
(para->bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) |
(para->ranks[0].bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) | MCTL_CR_BUS_FULL_WIDTH(para->bus_full_width) | (para->dual_rank ? MCTL_CR_DUAL_RANK : MCTL_CR_SINGLE_RANK) |
MCTL_CR_PAGE_SIZE(para->page_size) |
MCTL_CR_ROW_BITS(para->row_bits), &mctl_com->cr);
MCTL_CR_PAGE_SIZE(para->ranks[0].page_size) |
MCTL_CR_ROW_BITS(para->ranks[0].row_bits), &mctl_com->cr);
if (para->dual_rank && (socid == SOCID_A64 || socid == SOCID_R40)) {
writel((para->ranks[1].bank_bits == 3 ? MCTL_CR_EIGHT_BANKS : MCTL_CR_FOUR_BANKS) |
MCTL_CR_BUS_FULL_WIDTH(para->bus_full_width) |
MCTL_CR_DUAL_RANK |
MCTL_CR_PAGE_SIZE(para->ranks[1].page_size) |
MCTL_CR_ROW_BITS(para->ranks[1].row_bits), &mctl_com->cr_r1);
} if (socid == SOCID_R40) { if (para->dual_rank)
@@ -646,35 +654,63 @@ static int mctl_channel_init(uint16_t socid, struct dram_para *para) return 0; }
-static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) +/*
- Test if memory at offset offset matches memory at a certain base
- */
+static bool mctl_mem_matches_base(u32 offset, ulong base) +{
/* Try to write different values to RAM at two addresses */
writel(0, base);
writel(0xaa55aa55, base + offset);
dsb();
/* Check if the same value is actually observed when reading back */
return readl(base) ==
readl(base + offset);
+}
+static void mctl_auto_detect_dram_size_rank(uint16_t socid, struct dram_para *para, ulong base, struct rank_para *rank) { /* detect row address bits */
para->page_size = 512;
para->row_bits = 16;
para->bank_bits = 2;
rank->page_size = 512;
rank->row_bits = 16;
rank->bank_bits = 2; mctl_set_cr(socid, para);
for (para->row_bits = 11; para->row_bits < 16; para->row_bits++)
if (mctl_mem_matches((1 << (para->row_bits + para->bank_bits)) * para->page_size))
for (rank->row_bits = 11; rank->row_bits < 16; rank->row_bits++)
if (mctl_mem_matches_base((1 << (rank->row_bits + rank->bank_bits)) * rank->page_size, base)) break; /* detect bank address bits */
para->bank_bits = 3;
rank->bank_bits = 3; mctl_set_cr(socid, para);
for (para->bank_bits = 2; para->bank_bits < 3; para->bank_bits++)
if (mctl_mem_matches((1 << para->bank_bits) * para->page_size))
for (rank->bank_bits = 2; rank->bank_bits < 3; rank->bank_bits++)
if (mctl_mem_matches_base((1 << rank->bank_bits) * rank->page_size, base)) break; /* detect page size */
para->page_size = 8192;
rank->page_size = 8192; mctl_set_cr(socid, para);
for (para->page_size = 512; para->page_size < 8192; para->page_size *= 2)
if (mctl_mem_matches(para->page_size))
for (rank->page_size = 512; rank->page_size < 8192; rank->page_size *= 2)
if (mctl_mem_matches_base(rank->page_size, base)) break;
}
+static unsigned long mctl_calc_rank_size(struct rank_para *rank) +{
return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size;
+}
+static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) +{
mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) {
mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE + mctl_calc_rank_size(¶->ranks[0]), ¶->ranks[1]);
}
+}
/*
- The actual values used here are taken from Allwinner provided boot0
- binaries, though they are probably board specific, so would likely benefit
@@ -769,12 +805,23 @@ unsigned long sunxi_dram_init(void) struct sunxi_mctl_ctl_reg * const mctl_ctl = (struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
unsigned long size;
struct dram_para para = { .dual_rank = 1, .bus_full_width = 1,
.row_bits = 15,
.bank_bits = 3,
.page_size = 4096,
.ranks = {
{
.row_bits = 15,
.bank_bits = 3,
.page_size = 4096,
},
{
.row_bits = 15,
.bank_bits = 3,
.page_size = 4096,
}
},
#if defined(CONFIG_MACH_SUN8I_H3) .dx_read_delays = SUN8I_H3_DX_READ_DELAYS, @@ -846,6 +893,13 @@ unsigned long sunxi_dram_init(void) mctl_auto_detect_dram_size(socid, ¶); mctl_set_cr(socid, ¶);
return (1UL << (para.row_bits + para.bank_bits)) * para.page_size *
(para.dual_rank ? 2 : 1);
size = mctl_calc_rank_size(¶.ranks[0]);
if (socid == SOCID_A64 || socid == SOCID_R40) {
if (para.dual_rank)
size += mctl_calc_rank_size(¶.ranks[1]);
} else if (para.dual_rank) {
size *= 2;
}
return size;
}
2.30.0

Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to us, we need to have a way to do memory rank detection to support that board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and then enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
Signed-off-by: Icenowy Zheng icenowy@aosc.io --- arch/arm/mach-sunxi/dram_sunxi_dw.c | 55 +++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para) }
if (socid == SOCID_R40) { - if (para->dual_rank) - panic("Dual rank memory not supported\n"); - /* Mux pin to A15 address line for single rank memory. */ - setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); + if (!para->dual_rank) + setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); } }
@@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct rank_para *rank) return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size; }
+/* + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which leads + * to failure), it's needed to detect the rank count of R40 in another way. + * + * The code here is modelled after time_out_detect() in BSP, which tries to + * access the memory and check for error code. + * + * TODO: auto detect half DQ width here + */ +static void mctl_r40_detect_rank_count(struct dram_para *para) +{ + ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE + + mctl_calc_rank_size(¶->ranks[0]); + struct sunxi_mctl_ctl_reg * const mctl_ctl = + (struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE; + + /* Enable read time out */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25); + + (void) readl((void *) rank1_base); + udelay(10); + + if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) { + clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24); + para->dual_rank = 0; + } + + /* Reset PHY FIFO to clear it */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26); + udelay(100); + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26); + + /* Clear error status */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24); + + /* Clear time out flag */ + clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13); + + /* Disable read time out */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25); +} + static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) { + if (socid == SOCID_R40) { + mctl_r40_detect_rank_count(para); + mctl_set_cr(socid, para); + } + mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) { @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40; - /* Currently we cannot support R40 with dual rank memory */ - para.dual_rank = 0; #elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

I have tested this patchset on both OKA40i-C and our custom carrier board which uses the same SoM (Forlinx FETA40i-C), and can confirm that: 1) It does enable functioning of U-Boot on the FETA40i SoM, and 2) I have not experienced any immediately apparent problems with DRAM in either U-Boot itself or the loaded OS during my testing.
Tested-by: Ivan Uvarov i.uvarov@cognitivepilot.com
On 2/25/21 7:13 PM, Icenowy Zheng wrote:
Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to us, we need to have a way to do memory rank detection to support that board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and then enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
Signed-off-by: Icenowy Zheng icenowy@aosc.io
arch/arm/mach-sunxi/dram_sunxi_dw.c | 55 +++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para) }
if (socid == SOCID_R40) {
if (para->dual_rank)
panic("Dual rank memory not supported\n");
- /* Mux pin to A15 address line for single rank memory. */
setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
if (!para->dual_rank)
}setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
}
@@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct rank_para *rank) return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size; }
+/*
- Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which leads
- to failure), it's needed to detect the rank count of R40 in another way.
- The code here is modelled after time_out_detect() in BSP, which tries to
- access the memory and check for error code.
- TODO: auto detect half DQ width here
- */
+static void mctl_r40_detect_rank_count(struct dram_para *para) +{
- ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
mctl_calc_rank_size(¶->ranks[0]);
- struct sunxi_mctl_ctl_reg * const mctl_ctl =
(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
- /* Enable read time out */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
- (void) readl((void *) rank1_base);
- udelay(10);
- if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
para->dual_rank = 0;
- }
- /* Reset PHY FIFO to clear it */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- udelay(100);
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- /* Clear error status */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
- /* Clear time out flag */
- clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
- /* Disable read time out */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+}
static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) {
if (socid == SOCID_R40) {
mctl_r40_detect_rank_count(para);
mctl_set_cr(socid, para);
}
mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) {
@@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40;
- /* Currently we cannot support R40 with dual rank memory */
- para.dual_rank = 0;
#elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

On Fri, 26 Feb 2021 00:13:25 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to us, we need to have a way to do memory rank detection to support that board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and then enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
So when looking at the SPL size I realised that this patch breaks the socid constant parameter propagation, especially for mctl_set_cr(). I don't see immediately why, this whole code here should be compiled out on any A64 builds, for instance. Instead GCC turns "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689 around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const everywhere, to amplify the constant nature of this value. Patch 1/2 added to the code size, but kept the const propagation (only one instance of 0x1689 in the disassembly). This patch here should be for free on A64, but adds 104 bytes.
Does anyone have a clue why this is happening? Is this a compiler issue?
Cheers, Andre
Signed-off-by: Icenowy Zheng icenowy@aosc.io
arch/arm/mach-sunxi/dram_sunxi_dw.c | 55 +++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para) }
if (socid == SOCID_R40) {
if (para->dual_rank)
panic("Dual rank memory not supported\n");
- /* Mux pin to A15 address line for single rank memory. */
setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
if (!para->dual_rank)
}setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
}
@@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct rank_para *rank) return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size; }
+/*
- Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which leads
- to failure), it's needed to detect the rank count of R40 in another way.
- The code here is modelled after time_out_detect() in BSP, which tries to
- access the memory and check for error code.
- TODO: auto detect half DQ width here
- */
+static void mctl_r40_detect_rank_count(struct dram_para *para) +{
- ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
mctl_calc_rank_size(¶->ranks[0]);
- struct sunxi_mctl_ctl_reg * const mctl_ctl =
(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
- /* Enable read time out */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
- (void) readl((void *) rank1_base);
- udelay(10);
- if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
para->dual_rank = 0;
- }
- /* Reset PHY FIFO to clear it */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- udelay(100);
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- /* Clear error status */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
- /* Clear time out flag */
- clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
- /* Disable read time out */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+}
static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para) {
if (socid == SOCID_R40) {
mctl_r40_detect_rank_count(para);
mctl_set_cr(socid, para);
}
mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) {
@@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40;
- /* Currently we cannot support R40 with dual rank memory */
- para.dual_rank = 0;
#elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara andre.przywara@arm.com 写到:
On Fri, 26 Feb 2021 00:13:25 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to
us,
we need to have a way to do memory rank detection to support that
board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and
then
enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
So when looking at the SPL size I realised that this patch breaks the socid constant parameter propagation, especially for mctl_set_cr(). I don't see immediately why, this whole code here should be compiled out on any A64 builds, for instance. Instead GCC turns "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689 around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const everywhere, to amplify the constant nature of this value. Patch 1/2 added to the code size, but kept the const propagation (only one instance of 0x1689 in the disassembly). This patch here should be for free on A64, but adds 104 bytes.
Does anyone have a clue why this is happening? Is this a compiler issue?
Maybe the issue is that I assume this codepath is only for R40 and I didn't add socid to it?
Could you try to add a socid parameter to mctl_r40_detect_rank_count()?
Or maybe it makes mctl_calc_rank_size() not inlined?
(Well I think the code looks noop on non-R40)
BTW, original rank/width detection part won't get called on R40. But R40 is not where we are tight on code size, so I didn't bother to ignore it on R40.
Or should we switch back to #if's and do not rely on compiler behavior any longer?
Cheers, Andre
Signed-off-by: Icenowy Zheng icenowy@aosc.io
arch/arm/mach-sunxi/dram_sunxi_dw.c | 55
+++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c
b/arch/arm/mach-sunxi/dram_sunxi_dw.c
index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct
dram_para *para)
}
if (socid == SOCID_R40) {
if (para->dual_rank)
panic("Dual rank memory not supported\n");
- /* Mux pin to A15 address line for single rank memory. */
setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
if (!para->dual_rank)
}setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
}
@@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct
rank_para *rank)
return (1UL << (rank->row_bits + rank->bank_bits)) *
rank->page_size;
}
+/*
- Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which
leads
- to failure), it's needed to detect the rank count of R40 in
another way.
- The code here is modelled after time_out_detect() in BSP, which
tries to
- access the memory and check for error code.
- TODO: auto detect half DQ width here
- */
+static void mctl_r40_detect_rank_count(struct dram_para *para) +{
- ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
mctl_calc_rank_size(¶->ranks[0]);
- struct sunxi_mctl_ctl_reg * const mctl_ctl =
(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
- /* Enable read time out */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
- (void) readl((void *) rank1_base);
- udelay(10);
- if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
para->dual_rank = 0;
- }
- /* Reset PHY FIFO to clear it */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- udelay(100);
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- /* Clear error status */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
- /* Clear time out flag */
- clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
- /* Disable read time out */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+}
static void mctl_auto_detect_dram_size(uint16_t socid, struct
dram_para *para)
{
- if (socid == SOCID_R40) {
mctl_r40_detect_rank_count(para);
mctl_set_cr(socid, para);
- }
- mctl_auto_detect_dram_size_rank(socid, para,
(ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank)
{
@@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40;
- /* Currently we cannot support R40 with dual rank memory */
- para.dual_rank = 0;
#elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

On Tue, 02 Mar 2021 21:50:49 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Hi Icenowy,
于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara andre.przywara@arm.com 写到:
On Fri, 26 Feb 2021 00:13:25 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to
us,
we need to have a way to do memory rank detection to support that
board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and
then
enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
So when looking at the SPL size I realised that this patch breaks the socid constant parameter propagation, especially for mctl_set_cr(). I don't see immediately why, this whole code here should be compiled out on any A64 builds, for instance. Instead GCC turns "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689 around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const everywhere, to amplify the constant nature of this value. Patch 1/2 added to the code size, but kept the const propagation (only one instance of 0x1689 in the disassembly). This patch here should be for free on A64, but adds 104 bytes.
Does anyone have a clue why this is happening? Is this a compiler issue?
Maybe the issue is that I assume this codepath is only for R40 and I didn't add socid to it?
But that's clearly visible by this function only being called inside "if (socid == SOCID_R40)". And that works very well for the H3 ZQ calibration quirk, for instance.
Could you try to add a socid parameter to mctl_r40_detect_rank_count()?
I just tried that, and apart from looking weird it didn't do anything.
To be clear: Your code is absolutely fine, exactly the way it should be written. It's just that the compiler is playing stupid suddenly. I was thinking that your dummy readl might have upset it, but even with that commented out it's still happening.
Or maybe it makes mctl_calc_rank_size() not inlined?
So the assembly looks like everything apart from mctl_set_cr() and mctl_auto_detect_dram_size_rank() is inlined. Those are extra because they are called multiple times and we are using -Os.
(Well I think the code looks noop on non-R40)
Exactly that was my thinking: when compiling for the A64, it should be totally compiled out, and the object file should be the same before and after.
BTW, original rank/width detection part won't get called on R40. But R40 is not where we are tight on code size, so I didn't bother to ignore it on R40.
I see. Yeah, I am not concerned about R40 either, but I want to avoid the A64 bloating up.
Or should we switch back to #if's and do not rely on compiler behavior any longer?
I'd rather not do that. We are doing the right thing, and it works marvellously so far.
To be clear: it's not a show stopper, I was just curious why this happens. The code size is not really critical on the A64 at the moment, so I'd merge it as it, even if we don't find a solution. Maybe just leave a hint about it in the code should people need to come back to this.
I also asked some compiler buffs about it, but it's not exactly the simple reproducer case they like to deal with ;-)
Cheers, Andre
Cheers, Andre
Signed-off-by: Icenowy Zheng icenowy@aosc.io
arch/arm/mach-sunxi/dram_sunxi_dw.c | 55
+++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c
b/arch/arm/mach-sunxi/dram_sunxi_dw.c
index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct
dram_para *para)
}
if (socid == SOCID_R40) {
if (para->dual_rank)
panic("Dual rank memory not supported\n");
- /* Mux pin to A15 address line for single rank memory. */
setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
if (!para->dual_rank)
}setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
}
@@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct
rank_para *rank)
return (1UL << (rank->row_bits + rank->bank_bits)) *
rank->page_size;
}
+/*
- Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which
leads
- to failure), it's needed to detect the rank count of R40 in
another way.
- The code here is modelled after time_out_detect() in BSP, which
tries to
- access the memory and check for error code.
- TODO: auto detect half DQ width here
- */
+static void mctl_r40_detect_rank_count(struct dram_para *para) +{
- ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
mctl_calc_rank_size(¶->ranks[0]);
- struct sunxi_mctl_ctl_reg * const mctl_ctl =
(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
- /* Enable read time out */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
- (void) readl((void *) rank1_base);
- udelay(10);
- if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
para->dual_rank = 0;
- }
- /* Reset PHY FIFO to clear it */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- udelay(100);
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
- /* Clear error status */
- setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
- /* Clear time out flag */
- clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
- /* Disable read time out */
- clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+}
static void mctl_auto_detect_dram_size(uint16_t socid, struct
dram_para *para)
{
- if (socid == SOCID_R40) {
mctl_r40_detect_rank_count(para);
mctl_set_cr(socid, para);
- }
- mctl_auto_detect_dram_size_rank(socid, para,
(ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank)
{
@@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40;
- /* Currently we cannot support R40 with dual rank memory */
- para.dual_rank = 0;
#elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

在 2021-03-02星期二的 15:19 +0000,Andre Przywara写道:
On Tue, 02 Mar 2021 21:50:49 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Hi Icenowy,
于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara < andre.przywara@arm.com> 写到:
On Fri, 26 Feb 2021 00:13:25 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to
us,
we need to have a way to do memory rank detection to support that
board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and
then
enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
So when looking at the SPL size I realised that this patch breaks the socid constant parameter propagation, especially for mctl_set_cr(). I don't see immediately why, this whole code here should be compiled out on any A64 builds, for instance. Instead GCC turns "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689 around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const everywhere, to amplify the constant nature of this value. Patch 1/2 added to the code size, but kept the const propagation (only one instance of 0x1689 in the disassembly). This patch here should be for free on A64, but adds 104 bytes.
Does anyone have a clue why this is happening? Is this a compiler issue?
Maybe the issue is that I assume this codepath is only for R40 and I didn't add socid to it?
But that's clearly visible by this function only being called inside "if (socid == SOCID_R40)". And that works very well for the H3 ZQ calibration quirk, for instance.
Could you try to add a socid parameter to mctl_r40_detect_rank_count()?
I just tried that, and apart from looking weird it didn't do anything.
To be clear: Your code is absolutely fine, exactly the way it should be written. It's just that the compiler is playing stupid suddenly. I was thinking that your dummy readl might have upset it, but even with that commented out it's still happening.
Or maybe it makes mctl_calc_rank_size() not inlined?
So the assembly looks like everything apart from mctl_set_cr() and mctl_auto_detect_dram_size_rank() is inlined. Those are extra because they are called multiple times and we are using -Os.
(Well I think the code looks noop on non-R40)
Exactly that was my thinking: when compiling for the A64, it should be totally compiled out, and the object file should be the same before and after.
BTW, original rank/width detection part won't get called on R40. But R40 is not where we are tight on code size, so I didn't bother to ignore it on R40.
I see. Yeah, I am not concerned about R40 either, but I want to avoid the A64 bloating up.
Or should we switch back to #if's and do not rely on compiler behavior any longer?
I'd rather not do that. We are doing the right thing, and it works marvellously so far.
To be clear: it's not a show stopper, I was just curious why this happens. The code size is not really critical on the A64 at the moment, so I'd merge it as it, even if we don't find a solution. Maybe just leave a hint about it in the code should people need to come back to this.
I also asked some compiler buffs about it, but it's not exactly the simple reproducer case they like to deal with ;-)
Cheers, Andre
Cheers, Andre
Signed-off-by: Icenowy Zheng icenowy@aosc.io
arch/arm/mach-sunxi/dram_sunxi_dw.c | 55
+++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c
b/arch/arm/mach-sunxi/dram_sunxi_dw.c
index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct
dram_para *para)
} if (socid == SOCID_R40) { - if (para->dual_rank) - panic("Dual rank memory not supported\n");
/* Mux pin to A15 address line for single rank memory. */ - setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); + if (!para->dual_rank) + setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); } } @@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct
rank_para *rank)
return (1UL << (rank->row_bits + rank->bank_bits)) *
rank->page_size;
} +/*
- Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now
(which
leads
- to failure), it's needed to detect the rank count of R40
in
another way.
- The code here is modelled after time_out_detect() in BSP,
which
tries to
- access the memory and check for error code.
- TODO: auto detect half DQ width here
- */
+static void mctl_r40_detect_rank_count(struct dram_para *para) +{ + ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE + + mctl_calc_rank_size(¶-
ranks[0]);
+ struct sunxi_mctl_ctl_reg * const mctl_ctl = + (struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
+ /* Enable read time out */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+ (void) readl((void *) rank1_base); + udelay(10);
+ if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) { + clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24); + para->dual_rank = 0; + }
+ /* Reset PHY FIFO to clear it */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26); + udelay(100); + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
+ /* Clear error status */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
+ /* Clear time out flag */ + clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
+ /* Disable read time out */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25); +}
static void mctl_auto_detect_dram_size(uint16_t socid, struct
dram_para *para)
{ + if (socid == SOCID_R40) { + mctl_r40_detect_rank_count(para); + mctl_set_cr(socid, para); + }
Trying to move this routine to sunxi_dram_init() fixes the problem.
And I think this is also a valid code sequence because originally the mctl_auto_detect_dram_size only detects page/column (and bank when DDR2 support added).
BTW the mctl_r40_detect_rank_count() call looks innocent. The victim is the mctl_set_cr() call. Maybe the code get complex enough (too many function calls)?
mctl_auto_detect_dram_size_rank(socid, para,
(ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para-
dual_rank)
{
@@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40; - /* Currently we cannot support R40 with dual rank memory */ - para.dual_rank = 0; #elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

On Wed, 03 Mar 2021 01:49:53 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Hi Icenowy,
many thanks for your research on this. I asked some local compiler buffs, see below ...
在 2021-03-02星期二的 15:19 +0000,Andre Przywara写道:
On Tue, 02 Mar 2021 21:50:49 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Hi Icenowy,
于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara < andre.przywara@arm.com> 写到:
On Fri, 26 Feb 2021 00:13:25 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to
us,
we need to have a way to do memory rank detection to support that
board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and
then
enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
So when looking at the SPL size I realised that this patch breaks the socid constant parameter propagation, especially for mctl_set_cr(). I don't see immediately why, this whole code here should be compiled out on any A64 builds, for instance. Instead GCC turns "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689 around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const everywhere, to amplify the constant nature of this value. Patch 1/2 added to the code size, but kept the const propagation (only one instance of 0x1689 in the disassembly). This patch here should be for free on A64, but adds 104 bytes.
Does anyone have a clue why this is happening? Is this a compiler issue?
Maybe the issue is that I assume this codepath is only for R40 and I didn't add socid to it?
But that's clearly visible by this function only being called inside "if (socid == SOCID_R40)". And that works very well for the H3 ZQ calibration quirk, for instance.
Could you try to add a socid parameter to mctl_r40_detect_rank_count()?
I just tried that, and apart from looking weird it didn't do anything.
To be clear: Your code is absolutely fine, exactly the way it should be written. It's just that the compiler is playing stupid suddenly. I was thinking that your dummy readl might have upset it, but even with that commented out it's still happening.
Or maybe it makes mctl_calc_rank_size() not inlined?
So the assembly looks like everything apart from mctl_set_cr() and mctl_auto_detect_dram_size_rank() is inlined. Those are extra because they are called multiple times and we are using -Os.
(Well I think the code looks noop on non-R40)
Exactly that was my thinking: when compiling for the A64, it should be totally compiled out, and the object file should be the same before and after.
BTW, original rank/width detection part won't get called on R40. But R40 is not where we are tight on code size, so I didn't bother to ignore it on R40.
I see. Yeah, I am not concerned about R40 either, but I want to avoid the A64 bloating up.
Or should we switch back to #if's and do not rely on compiler behavior any longer?
I'd rather not do that. We are doing the right thing, and it works marvellously so far.
To be clear: it's not a show stopper, I was just curious why this happens. The code size is not really critical on the A64 at the moment, so I'd merge it as it, even if we don't find a solution. Maybe just leave a hint about it in the code should people need to come back to this.
I also asked some compiler buffs about it, but it's not exactly the simple reproducer case they like to deal with ;-)
Cheers, Andre
Cheers, Andre
Signed-off-by: Icenowy Zheng icenowy@aosc.io
arch/arm/mach-sunxi/dram_sunxi_dw.c | 55
+++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c
b/arch/arm/mach-sunxi/dram_sunxi_dw.c
index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct
dram_para *para)
} if (socid == SOCID_R40) { - if (para->dual_rank) - panic("Dual rank memory not supported\n");
/* Mux pin to A15 address line for single rank memory. */ - setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); + if (!para->dual_rank) + setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); } } @@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct
rank_para *rank)
return (1UL << (rank->row_bits + rank->bank_bits)) *
rank->page_size;
} +/*
- Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now
(which
leads
- to failure), it's needed to detect the rank count of R40
in
another way.
- The code here is modelled after time_out_detect() in BSP,
which
tries to
- access the memory and check for error code.
- TODO: auto detect half DQ width here
- */
+static void mctl_r40_detect_rank_count(struct dram_para *para) +{ + ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE + + mctl_calc_rank_size(¶-
ranks[0]);
+ struct sunxi_mctl_ctl_reg * const mctl_ctl = + (struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
+ /* Enable read time out */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+ (void) readl((void *) rank1_base); + udelay(10);
+ if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) { + clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24); + para->dual_rank = 0; + }
+ /* Reset PHY FIFO to clear it */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26); + udelay(100); + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
+ /* Clear error status */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
+ /* Clear time out flag */ + clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
+ /* Disable read time out */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25); +}
static void mctl_auto_detect_dram_size(uint16_t socid, struct
dram_para *para)
{ + if (socid == SOCID_R40) { + mctl_r40_detect_rank_count(para); + mctl_set_cr(socid, para); + }
Trying to move this routine to sunxi_dram_init() fixes the problem.
And I think this is also a valid code sequence because originally the mctl_auto_detect_dram_size only detects page/column (and bank when DDR2 support added).
Yes, I can confirm this. This is identical in functionality and fixes the code size growth problem.
BTW the mctl_r40_detect_rank_count() call looks innocent. The victim is the mctl_set_cr() call. Maybe the code get complex enough (too many function calls)?
Yes, so this is what a colleague (Tamar Christina) said: " ... so the reason is costing. The patch adds the following bit + if (socid == SOCID_R40) { + mctl_r40_detect_rank_count(para); + mctl_set_cr(socid, para); + } which now makes a direct call to mctl_set_cr with a new constant value for cloning SOCID_R40. Which makes IPA have to evaluate the costs of specializing both SOCID_A64 and SOCID_R40 at the same time. It determines that the benefit of doing so is quite small. It has to keep 2 copies of the function around and it can't replace all usages of mctl_set_cr with the specialized version. The function is too small and the gain from the specialization seemed very little.You can lower the threshold with --param=ipa-cp-eval-threshold=300 if you really want it."
So this seems to be heuristics driven, and in a first steps it tries to create multiple copies of the function, one of which would later be garbage collected. However those multiple functions don't seem worthwhile to the compiler in the first place now, so it discards that idea, *before* it sees that all of them wouldn't be needed later.
I think your fix is good, I will take this version.
Cheers, Andre
P.S. Thanks for taking the time to find the reproducer!
mctl_auto_detect_dram_size_rank(socid, para,
(ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para-
dual_rank)
{
@@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40; - /* Currently we cannot support R40 with dual rank memory */ - para.dual_rank = 0; #elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

在 2021-03-02星期二的 15:19 +0000,Andre Przywara写道:
On Tue, 02 Mar 2021 21:50:49 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Hi Icenowy,
于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara < andre.przywara@arm.com> 写到:
On Fri, 26 Feb 2021 00:13:25 +0800 Icenowy Zheng icenowy@aosc.io wrote:
Previously we do not have proper dual rank memory detection on R40 (because we omitted PIR_QSGATE, which does not work on R40 with our configuration), and dual rank memory is just simply disabled as early R40 boards available (Banana Pi M2 Ultra and Berry) have single rank memory.
As a board with dual rank memory (Forlinx OKA40i-C) is now known to
us,
we need to have a way to do memory rank detection to support that
board.
Add some routine to detect memory rank by trying to access the memory in rank 1 and check for error status of the memory controller, and
then
enable dual rank memory on R40.
Similar routine can be used to detect half DQ width (which is also detected by PIR_QSGATE on other SoCs), but it's left unimplemented because there's no known R40 board with half DQ width now.
So when looking at the SPL size I realised that this patch breaks the socid constant parameter propagation, especially for mctl_set_cr(). I don't see immediately why, this whole code here should be compiled out on any A64 builds, for instance. Instead GCC turns "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689 around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const everywhere, to amplify the constant nature of this value. Patch 1/2 added to the code size, but kept the const propagation (only one instance of 0x1689 in the disassembly). This patch here should be for free on A64, but adds 104 bytes.
Does anyone have a clue why this is happening? Is this a compiler issue?
Maybe the issue is that I assume this codepath is only for R40 and I didn't add socid to it?
But that's clearly visible by this function only being called inside "if (socid == SOCID_R40)". And that works very well for the H3 ZQ calibration quirk, for instance.
Could you try to add a socid parameter to mctl_r40_detect_rank_count()?
I just tried that, and apart from looking weird it didn't do anything.
To be clear: Your code is absolutely fine, exactly the way it should be written. It's just that the compiler is playing stupid suddenly. I was thinking that your dummy readl might have upset it, but even with that commented out it's still happening.
Or maybe it makes mctl_calc_rank_size() not inlined?
So the assembly looks like everything apart from mctl_set_cr() and mctl_auto_detect_dram_size_rank() is inlined. Those are extra because they are called multiple times and we are using -Os.
(Well I think the code looks noop on non-R40)
Exactly that was my thinking: when compiling for the A64, it should be totally compiled out, and the object file should be the same before and after.
BTW, original rank/width detection part won't get called on R40. But R40 is not where we are tight on code size, so I didn't bother to ignore it on R40.
I see. Yeah, I am not concerned about R40 either, but I want to avoid the A64 bloating up.
Or should we switch back to #if's and do not rely on compiler behavior any longer?
I'd rather not do that. We are doing the right thing, and it works marvellously so far.
To be clear: it's not a show stopper, I was just curious why this happens. The code size is not really critical on the A64 at the moment, so I'd merge it as it, even if we don't find a solution. Maybe just leave a hint about it in the code should people need to come back to this.
I also asked some compiler buffs about it, but it's not exactly the simple reproducer case they like to deal with ;-)
Well I tried to make a simpler reproducer:
``` #include <stdint.h>
__attribute__((noinline)) static void mctl_set_cr(uint16_t socid) { if (socid == 0x1701) *((volatile uint16_t *)0x12345678) = 40; else if (socid == 0x1689) *((volatile uint16_t *)0x12345678) = 64; }
static void test(uint16_t socid) { if (socid == 0x1701) mctl_set_cr(socid);
*((volatile uint16_t *)0x12345670) = 0xefe8; }
int sunxi_dram_init() { uint16_t socid = 0x1689;
test(socid); mctl_set_cr(socid);
return 0; } ```
If compile this with `aarch64-linux-gnu-gcc test.c -c -Os`, then the mctl_set_cr won't get constprop optimization.
In the same situation with real dram_sunxi_dw, if judge for 0x1701 is moved from test() to sunxi_dram_init() then mctl_set_cr gets optimized and codepath for setting 40 is omitted.
I think, the conditions that triggers the lose of the optimization is below: - Inside a function inlined to where the constant is defined (not the function that defines the constant itself) - Get called inside a if() block that the expression inside the condition uses the constant
Cheers, Andre
Cheers, Andre
Signed-off-by: Icenowy Zheng icenowy@aosc.io
arch/arm/mach-sunxi/dram_sunxi_dw.c | 55
+++++++++++++++++++++++++----
1 file changed, 49 insertions(+), 6 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c
b/arch/arm/mach-sunxi/dram_sunxi_dw.c
index 2b9d631d49..b86ae7cdf3 100644 --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct
dram_para *para)
} if (socid == SOCID_R40) { - if (para->dual_rank) - panic("Dual rank memory not supported\n");
/* Mux pin to A15 address line for single rank memory. */ - setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); + if (!para->dual_rank) + setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15); } } @@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct
rank_para *rank)
return (1UL << (rank->row_bits + rank->bank_bits)) *
rank->page_size;
} +/*
- Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now
(which
leads
- to failure), it's needed to detect the rank count of R40
in
another way.
- The code here is modelled after time_out_detect() in BSP,
which
tries to
- access the memory and check for error code.
- TODO: auto detect half DQ width here
- */
+static void mctl_r40_detect_rank_count(struct dram_para *para) +{ + ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE + + mctl_calc_rank_size(¶-
ranks[0]);
+ struct sunxi_mctl_ctl_reg * const mctl_ctl = + (struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
+ /* Enable read time out */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+ (void) readl((void *) rank1_base); + udelay(10);
+ if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) { + clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24); + para->dual_rank = 0; + }
+ /* Reset PHY FIFO to clear it */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26); + udelay(100); + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
+ /* Clear error status */ + setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
+ /* Clear time out flag */ + clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
+ /* Disable read time out */ + clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25); +}
static void mctl_auto_detect_dram_size(uint16_t socid, struct
dram_para *para)
{ + if (socid == SOCID_R40) { + mctl_r40_detect_rank_count(para); + mctl_set_cr(socid, para); + }
mctl_auto_detect_dram_size_rank(socid, para,
(ulong)CONFIG_SYS_SDRAM_BASE, ¶->ranks[0]);
if ((socid == SOCID_A64 || socid == SOCID_R40) && para-
dual_rank)
{
@@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void) uint16_t socid = SOCID_H3; #elif defined(CONFIG_MACH_SUN8I_R40) uint16_t socid = SOCID_R40; - /* Currently we cannot support R40 with dual rank memory */ - para.dual_rank = 0; #elif defined(CONFIG_MACH_SUN8I_V3S) uint16_t socid = SOCID_V3S; #elif defined(CONFIG_MACH_SUN50I)

On Fri, 26 Feb 2021 00:13:23 +0800 Icenowy Zheng icenowy@aosc.io wrote:
This patchset contains two patches.
The first one enables asymmetric dual rank DRAM on A64. This is needed for 3GiB PinePhone, which has 2GiB rank 0 and 1GiB rank 1. This patch is already used by the firmware flashed to PinePhone by factory.
The second one enables dual rank (and asymmetric dual rank, although not tested because of lack of real board) on R40. In order to support single rank and dual rank at the same time, a new rank detection code is implemented (because PIR_QSGATE-based one does not work on R40). The code enables some error report facility of the DRAM controller, and then tries to access rank 1 and then check for error. It's placed at 2nd patch because it depends on the function that calculates rank 0 size (and rank 1 base address) introduced in PATCH 1.
Thanks, queued both for -next. I moved the call to mctl_r40_detect_rank_count() and mctl_set_cr() into sunxi_dram_init(), to avoid the size regression.
Cheers, Andre
Icenowy Zheng (2): sunxi: support asymmetric dual rank DRAM on A64/R40 sunxi: enable dual rank memory on R40
.../include/asm/arch-sunxi/dram_sunxi_dw.h | 11 +- arch/arm/mach-sunxi/dram_sunxi_dw.c | 149 +++++++++++++++--- 2 files changed, 131 insertions(+), 29 deletions(-)
participants (4)
-
Andre Przywara
-
Icenowy Zheng
-
Ivan Uvarov
-
Peter Robinson