
The timing registers in the DRAM controller can be programmed in any order, as they will only take effect once the controller is eventually "activated".
Switch the MMIO writes in mctl_set_timing_lpddr3() and mctl_bit_delay_set() over to use writel_relaxed(), since we don't need the stronger ordering guarantee of the normal writel(). We satisfy the overall ordering requirement by ending each function with an explicit DMB barrier.
In this case we are not interested in the performance benefit this usually gives, but in the saved instructions, which add up over the many writes we have in the timing setup. Due to alignment effects this shrinks our chronically tight H6 SPL by a whopping 2KB, which brings it into the same region as the other AArch64 Allwinner SPL builds.
Signed-off-by: Andre Przywara <andre.przywara@arm.com> --- arch/arm/mach-sunxi/dram_sun50i_h6.c | 79 +++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 37 deletions(-)
diff --git a/arch/arm/mach-sunxi/dram_sun50i_h6.c b/arch/arm/mach-sunxi/dram_sun50i_h6.c index 5da90a2835..84a33a63d6 100644 --- a/arch/arm/mach-sunxi/dram_sun50i_h6.c +++ b/arch/arm/mach-sunxi/dram_sun50i_h6.c @@ -241,51 +241,55 @@ static void mctl_set_timing_lpddr3(struct dram_para *para) memcpy(mctl_phy->mr, mr_lpddr3, sizeof(mr_lpddr3));
/* set DRAM timing */ - writel((twtp << 24) | (tfaw << 16) | (trasmax << 8) | tras, - &mctl_ctl->dramtmg[0]); - writel((txp << 16) | (trtp << 8) | trc, &mctl_ctl->dramtmg[1]); - writel((tcwl << 24) | (tcl << 16) | (trd2wr << 8) | twr2rd, - &mctl_ctl->dramtmg[2]); - writel((tmrw << 20) | (tmrd << 12) | tmod, &mctl_ctl->dramtmg[3]); - writel((trcd << 24) | (tccd << 16) | (trrd << 8) | trp, - &mctl_ctl->dramtmg[4]); - writel((tcksrx << 24) | (tcksre << 16) | (tckesr << 8) | tcke, - &mctl_ctl->dramtmg[5]); + writel_relaxed((twtp << 24) | (tfaw << 16) | (trasmax << 8) | tras, + &mctl_ctl->dramtmg[0]); + writel_relaxed((txp << 16) | (trtp << 8) | trc, &mctl_ctl->dramtmg[1]); + writel_relaxed((tcwl << 24) | (tcl << 16) | (trd2wr << 8) | twr2rd, + &mctl_ctl->dramtmg[2]); + writel_relaxed((tmrw << 20) | (tmrd << 12) | tmod, + &mctl_ctl->dramtmg[3]); + writel_relaxed((trcd << 24) | (tccd << 16) | (trrd << 8) | trp, + &mctl_ctl->dramtmg[4]); + writel_relaxed((tcksrx << 24) | (tcksre << 16) | (tckesr << 8) | tcke, + &mctl_ctl->dramtmg[5]); /* Value suggested by ZynqMP manual and used by libdram */ - writel((txp + 2) | 0x02020000, &mctl_ctl->dramtmg[6]); - writel((txsfast << 24) | (txsabort << 16) | (txsdll << 8) | txs, - &mctl_ctl->dramtmg[8]); - writel(txsr, &mctl_ctl->dramtmg[14]); + writel_relaxed((txp + 2) | 0x02020000, &mctl_ctl->dramtmg[6]); + writel_relaxed((txsfast << 24) | (txsabort << 16) | (txsdll << 8) | txs, + &mctl_ctl->dramtmg[8]); + writel_relaxed(txsr, &mctl_ctl->dramtmg[14]);
clrsetbits_le32(&mctl_ctl->init[0], (3 << 30), (1 << 30)); - writel(0, &mctl_ctl->dfimisc); + writel_relaxed(0, &mctl_ctl->dfimisc); clrsetbits_le32(&mctl_ctl->rankctl, 0xff0, 0x660);
/* * Set timing registers of the PHY. * Note: the PHY is clocked 2x from the DRAM frequency. */ - writel((trrd << 25) | (tras << 17) | (trp << 9) | (trtp << 1), + writel_relaxed((trrd << 25) | (tras << 17) | (trp << 9) | (trtp << 1), &mctl_phy->dtpr[0]); - writel((tfaw << 17) | 0x28000400 | (tmrd << 1), &mctl_phy->dtpr[1]); - writel(((txs << 6) - 1) | (tcke << 17), &mctl_phy->dtpr[2]); - writel(((txsdll << 22) - (0x1 << 16)) | twtr_sa | (tcksrea << 8), - &mctl_phy->dtpr[3]); - writel((txp << 1) | (trfc << 17) | 0x800, &mctl_phy->dtpr[4]); - writel((trc << 17) | (trcd << 9) | (twtr << 1), &mctl_phy->dtpr[5]); - writel(0x0505, &mctl_phy->dtpr[6]); + writel_relaxed((tfaw << 17) | 0x28000400 | (tmrd << 1), + &mctl_phy->dtpr[1]); + writel_relaxed(((txs << 6) - 1) | (tcke << 17), &mctl_phy->dtpr[2]); + writel_relaxed(((txsdll << 22) - (0x1 << 16)) | twtr_sa | + (tcksrea << 8), &mctl_phy->dtpr[3]); + writel_relaxed((txp << 1) | (trfc << 17) | 0x800, &mctl_phy->dtpr[4]); + writel_relaxed((trc << 17) | (trcd << 9) | (twtr << 1), + &mctl_phy->dtpr[5]); + writel_relaxed(0x0505, &mctl_phy->dtpr[6]);
/* Configure DFI timing */ - writel(tcl | 0x2000200 | (t_rdata_en << 16) | 0x808000, - &mctl_ctl->dfitmg0); - writel(0x040201, &mctl_ctl->dfitmg1); + writel_relaxed(tcl | 0x2000200 | (t_rdata_en << 16) | 0x808000, + &mctl_ctl->dfitmg0); + writel_relaxed(0x040201, &mctl_ctl->dfitmg1);
/* Configure PHY timing */ - writel(tdinit0 | (tdinit1 << 20), &mctl_phy->ptr[3]); - writel(tdinit2 | (tdinit3 << 18), &mctl_phy->ptr[4]); + writel_relaxed(tdinit0 | (tdinit1 << 20), &mctl_phy->ptr[3]); + writel_relaxed(tdinit2 | (tdinit3 << 18), &mctl_phy->ptr[4]);
/* set refresh timing */ - writel((trefi << 16) | trfc, &mctl_ctl->rfshtmg); + writel_relaxed((trefi << 16) | trfc, &mctl_ctl->rfshtmg); + DMB; }
static void mctl_sys_init(struct dram_para *para) @@ -476,17 +480,17 @@ static void mctl_bit_delay_set(struct dram_para *para) val = readl(&mctl_phy->dx[i].bdlr0); for (j = 0; j < 4; j++) val += para->dx_write_delays[i][j] << (j * 8); - writel(val, &mctl_phy->dx[i].bdlr0); + writel_relaxed(val, &mctl_phy->dx[i].bdlr0);
val = readl(&mctl_phy->dx[i].bdlr1); for (j = 0; j < 4; j++) val += para->dx_write_delays[i][j + 4] << (j * 8); - writel(val, &mctl_phy->dx[i].bdlr1); + writel_relaxed(val, &mctl_phy->dx[i].bdlr1);
val = readl(&mctl_phy->dx[i].bdlr2); for (j = 0; j < 4; j++) val += para->dx_write_delays[i][j + 8] << (j * 8); - writel(val, &mctl_phy->dx[i].bdlr2); + writel_relaxed(val, &mctl_phy->dx[i].bdlr2); } clrbits_le32(&mctl_phy->pgcr[0], BIT(26));
@@ -494,22 +498,22 @@ static void mctl_bit_delay_set(struct dram_para *para) val = readl(&mctl_phy->dx[i].bdlr3); for (j = 0; j < 4; j++) val += para->dx_read_delays[i][j] << (j * 8); - writel(val, &mctl_phy->dx[i].bdlr3); + writel_relaxed(val, &mctl_phy->dx[i].bdlr3);
val = readl(&mctl_phy->dx[i].bdlr4); for (j = 0; j < 4; j++) val += para->dx_read_delays[i][j + 4] << (j * 8); - writel(val, &mctl_phy->dx[i].bdlr4); + writel_relaxed(val, &mctl_phy->dx[i].bdlr4);
val = readl(&mctl_phy->dx[i].bdlr5); for (j = 0; j < 4; j++) val += para->dx_read_delays[i][j + 8] << (j * 8); - writel(val, &mctl_phy->dx[i].bdlr5); + writel_relaxed(val, &mctl_phy->dx[i].bdlr5);
val = readl(&mctl_phy->dx[i].bdlr6); val += (para->dx_read_delays[i][12] << 8) | (para->dx_read_delays[i][13] << 16); - writel(val, &mctl_phy->dx[i].bdlr6); + writel_relaxed(val, &mctl_phy->dx[i].bdlr6); } setbits_le32(&mctl_phy->pgcr[0], BIT(26)); udelay(1); @@ -517,8 +521,9 @@ static void mctl_bit_delay_set(struct dram_para *para) for (i = 1; i < 14; i++) { val = readl(&mctl_phy->acbdlr[i]); val += 0x0a0a0a0a; - writel(val, &mctl_phy->acbdlr[i]); + writel_relaxed(val, &mctl_phy->acbdlr[i]); } + DMB; }
static void mctl_channel_init(struct dram_para *para)