[U-Boot] [PATCH 0/8] rockchip: Improve SPI-NOR read performance for the RK3399-Q7

The SPI-NOR driver has traditionally been slow enough to impact boot-time from SPI-NOR (which is the recommended storage for u-boot.itb on the RK3399-Q7): transfer for the ~890KB exceeded 0.8s even though the SPI-NOR bitrate was configured to 49.5MBit/s.
This series provides some urgently needed tough love for the SPI driver: - contains a few cleanups - implements a new 16bit-wide transfer mode to better utilise the 32x16bit RXFIFO - increases bus-utilisation to > 94% both for 49.5MBit/s (and, in an experimental branch that allows higher bitrates) for 74.25MBit/s
With the entire series applied, we can now load our u-boot.itb in ~0.095s (instead of ~ 0.820s previously). This is still slower than from the on-module eMMC (0.028s) that has an 8-bit wide data-path, yet makes the SPI-NOR finally useful for all customers that want to keep the boot firmware out-of-the-way of the operating system.
Philipp Tomsich (8): rockchip: spi: add debug message for delay in CS toggle rockchip: spi: remove unused code and fields in priv rockchip: spi: fix off-by-one in chunk size computation rockchip: spi: consistently use false/true with rkspi_enable_chip rockchip: spi: only wait for completion, if transmitting rockchip: spi: add optimised receive-only implementation rockchip: spi: add driver-data and a 'rxonly_manages_fifo' flag rockchip: spi: make optimised receive-handler unaligned-safe
drivers/spi/rk_spi.c | 163 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 36 deletions(-)

In analysing delays introduced for large SPI reads, the absence of any indication when a delay was inserted (to ensure the CS toggling is observed by devices) became apparent.
Add an additional debug-only debug message to record the insertion and duration of any delay (note that the debug-message will cause a delay on-top of the delay-duration).
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com ---
drivers/spi/rk_spi.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index 14437c0..4153240 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -130,8 +130,13 @@ static void spi_cs_activate(struct udevice *dev, uint cs) if (plat->deactivate_delay_us && priv->last_transaction_us) { ulong delay_us; /* The delay completed so far */ delay_us = timer_get_us() - priv->last_transaction_us; - if (delay_us < plat->deactivate_delay_us) - udelay(plat->deactivate_delay_us - delay_us); + if (delay_us < plat->deactivate_delay_us) { + ulong additional_delay_us = + plat->deactivate_delay_us - delay_us; + debug("%s: delaying by %ld us\n", + __func__, additional_delay_us); + udelay(additional_delay_us); + } }
debug("activate cs%u\n", cs);

In analysing delays introduced for large SPI reads, the absence of any indication when a delay was inserted (to ensure the CS toggling is observed by devices) became apparent.
Add an additional debug-only debug message to record the insertion and duration of any delay (note that the debug-message will cause a delay on-top of the delay-duration).
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com
drivers/spi/rk_spi.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
Applied to u-boot-rockchip, thanks!

Even though the priv-structure and the claim-bus function contain logic for 16bit frames and for unidirectional transfer modes, neither of these is used anywhere in the driver.
This removes the unused (as in "has no effect") logic and fields.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com ---
drivers/spi/rk_spi.c | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index 4153240..1df497d 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -40,11 +40,8 @@ struct rockchip_spi_priv { unsigned int max_freq; unsigned int mode; ulong last_transaction_us; /* Time of last transaction end */ - u8 bits_per_word; /* max 16 bits per word */ - u8 n_bytes; unsigned int speed_hz; unsigned int last_speed_hz; - unsigned int tmode; uint input_rate; };
@@ -268,8 +265,6 @@ static int rockchip_spi_probe(struct udevice *bus) } priv->input_rate = ret; debug("%s: rate = %u\n", __func__, priv->input_rate); - priv->bits_per_word = 8; - priv->tmode = TMOD_TR; /* Tx & Rx */
return 0; } @@ -279,29 +274,11 @@ static int rockchip_spi_claim_bus(struct udevice *dev) struct udevice *bus = dev->parent; struct rockchip_spi_priv *priv = dev_get_priv(bus); struct rockchip_spi *regs = priv->regs; - u8 spi_dfs, spi_tf; uint ctrlr0;
/* Disable the SPI hardware */ rkspi_enable_chip(regs, 0);
- switch (priv->bits_per_word) { - case 8: - priv->n_bytes = 1; - spi_dfs = DFS_8BIT; - spi_tf = HALF_WORD_OFF; - break; - case 16: - priv->n_bytes = 2; - spi_dfs = DFS_16BIT; - spi_tf = HALF_WORD_ON; - break; - default: - debug("%s: unsupported bits: %dbits\n", __func__, - priv->bits_per_word); - return -EPROTONOSUPPORT; - } - if (priv->speed_hz != priv->last_speed_hz) rkspi_set_clk(priv, priv->speed_hz);
@@ -309,7 +286,7 @@ static int rockchip_spi_claim_bus(struct udevice *dev) ctrlr0 = OMOD_MASTER << OMOD_SHIFT;
/* Data Frame Size */ - ctrlr0 |= spi_dfs << DFS_SHIFT; + ctrlr0 |= DFS_8BIT << DFS_SHIFT;
/* set SPI mode 0..3 */ if (priv->mode & SPI_CPOL) @@ -330,7 +307,7 @@ static int rockchip_spi_claim_bus(struct udevice *dev) ctrlr0 |= FBM_MSB << FBM_SHIFT;
/* Byte and Halfword Transform */ - ctrlr0 |= spi_tf << HALF_WORD_TX_SHIFT; + ctrlr0 |= HALF_WORD_OFF << HALF_WORD_TX_SHIFT;
/* Rxd Sample Delay */ ctrlr0 |= 0 << RXDSD_SHIFT; @@ -339,7 +316,7 @@ static int rockchip_spi_claim_bus(struct udevice *dev) ctrlr0 |= FRF_SPI << FRF_SHIFT;
/* Tx and Rx mode */ - ctrlr0 |= (priv->tmode & TMOD_MASK) << TMOD_SHIFT; + ctrlr0 |= TMOD_TR << TMOD_SHIFT;
writel(ctrlr0, ®s->ctrlr0);

Even though the priv-structure and the claim-bus function contain logic for 16bit frames and for unidirectional transfer modes, neither of these is used anywhere in the driver.
This removes the unused (as in "has no effect") logic and fields.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com
drivers/spi/rk_spi.c | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-)
Applied to u-boot-rockchip, thanks!

The maximum transfer length (in a single transaction) for the Rockchip SPI controller is 64Kframes (i.e. 0x10000 frames) of 8bit or 16bit frames and is encoded as (num_frames - 1) in CTRLR1. The existing code subtracted the "minus 1" twice for a maximum transfer length of 0xffff (64K - 1) frames.
While this is not strictly an error (the existing code is correct, but leads to a bit of head-scrating), fix this off-by-one situation.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com ---
drivers/spi/rk_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index 1df497d..e7b2df8 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -356,7 +356,7 @@ static int rockchip_spi_xfer(struct udevice *dev, unsigned int bitlen, spi_cs_activate(dev, slave_plat->cs);
while (len > 0) { - int todo = min(len, 0xffff); + int todo = min(len, 0x10000);
rkspi_enable_chip(regs, false); writel(todo - 1, ®s->ctrlr1);

The maximum transfer length (in a single transaction) for the Rockchip SPI controller is 64Kframes (i.e. 0x10000 frames) of 8bit or 16bit frames and is encoded as (num_frames - 1) in CTRLR1. The existing code subtracted the "minus 1" twice for a maximum transfer length of 0xffff (64K - 1) frames.
While this is not strictly an error (the existing code is correct, but leads to a bit of head-scrating), fix this off-by-one situation.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com
drivers/spi/rk_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
Applied to u-boot-rockchip, thanks!

While rkspi_enable_chip is called with true/false everywhere else in the file, one call site uses '0' to denot 'false'. This change this one parameter to 'false' and effects consistency.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com ---
drivers/spi/rk_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index e7b2df8..aaf244d 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -277,7 +277,7 @@ static int rockchip_spi_claim_bus(struct udevice *dev) uint ctrlr0;
/* Disable the SPI hardware */ - rkspi_enable_chip(regs, 0); + rkspi_enable_chip(regs, false);
if (priv->speed_hz != priv->last_speed_hz) rkspi_set_clk(priv, priv->speed_hz);

While rkspi_enable_chip is called with true/false everywhere else in the file, one call site uses '0' to denot 'false'. This change this one parameter to 'false' and effects consistency.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com
drivers/spi/rk_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
Applied to u-boot-rockchip, thanks!

The logic in the main transmit loop took a bit of reading the TRM to fully understand (due to silent assumptions based in internal logic): the "wait until idle" at the end of each iteration through the loop is required for the transmit-path as each clearing of the ENA register (to update run-length in the CTRLR1 register) will implicitly flush the FIFOs... transmisson can therefore not overlap loop iterations.
This change adds a comment to clarify the reason/need for waiting until the controller becomes idle and wraps the entire check into an 'if (out)' to make it clear that this is required for transfers with a transmit-component only (for transfers having a receive-component, completion of the transmit-side is trivially ensured by having received the correct number of bytes).
The change does not increase execution time measurably in any of my tests.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com ---
drivers/spi/rk_spi.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index aaf244d..c807d78 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -379,9 +379,18 @@ static int rockchip_spi_xfer(struct udevice *dev, unsigned int bitlen, toread--; } } - ret = rkspi_wait_till_not_busy(regs); - if (ret) - break; + + /* + * In case that there's a transmit-component, we need to wait + * until the control goes idle before we can disable the SPI + * control logic (as this will implictly flush the FIFOs). + */ + if (out) { + ret = rkspi_wait_till_not_busy(regs); + if (ret) + break; + } + len -= todo; }

The logic in the main transmit loop took a bit of reading the TRM to fully understand (due to silent assumptions based in internal logic): the "wait until idle" at the end of each iteration through the loop is required for the transmit-path as each clearing of the ENA register (to update run-length in the CTRLR1 register) will implicitly flush the FIFOs... transmisson can therefore not overlap loop iterations.
This change adds a comment to clarify the reason/need for waiting until the controller becomes idle and wraps the entire check into an 'if (out)' to make it clear that this is required for transfers with a transmit-component only (for transfers having a receive-component, completion of the transmit-side is trivially ensured by having received the correct number of bytes).
The change does not increase execution time measurably in any of my tests.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com
drivers/spi/rk_spi.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-)
Applied to u-boot-rockchip, thanks!

For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI).
Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation.
To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter.
Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes).
With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com Reported-by: Klaus Goger klaus.goger@theobroma-systems.com
Series-Cc: Klaus Goger klaus.goger@theobroma-systems.com Series-Cc: Christoph Muellner christoph.muellner@theobroma-systems.com
---
drivers/spi/rk_spi.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index c807d78..fec41a4 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -2,6 +2,8 @@ /* * spi driver for rockchip * + * (C) 2019 Theobroma Systems Design und Consulting GmbH + * * (C) Copyright 2015 Google, Inc * * (C) Copyright 2008-2013 Rockchip Electronics @@ -333,6 +335,81 @@ static int rockchip_spi_release_bus(struct udevice *dev) return 0; }
+static inline int rockchip_spi_16bit_reader(struct udevice *dev, + u8 **din, int *len) +{ + struct udevice *bus = dev->parent; + const struct rockchip_spi_params * const data = + (void *)dev_get_driver_data(bus); + struct rockchip_spi_priv *priv = dev_get_priv(bus); + struct rockchip_spi *regs = priv->regs; + const u32 saved_ctrlr0 = readl(®s->ctrlr0); +#if defined(DEBUG) + u32 statistics_rxlevels[33] = { }; +#endif + u32 frames = *len / 2; + u16 *in16 = (u16 *)(*din); + u32 max_chunk_size = SPI_FIFO_DEPTH; + + if (!frames) + return 0; + + /* + * If the destination buffer is unaligned, we'd run into a problem + * on ARMv8. Given that this doesn't seem to be a real issue, we + * just chicken out and fall back to the unoptimised implementation. + */ + if ((uintptr_t)*din & 1) { + debug("%s: unaligned buffer, din = %p\n", __func__, *din); + return 0; + } + + // rockchip_spi_configure(dev, mode, size) + rkspi_enable_chip(regs, false); + clrsetbits_le32(®s->ctrlr0, + TMOD_MASK << TMOD_SHIFT, + TMOD_RO << TMOD_SHIFT); + /* 16bit data frame size */ + clrsetbits_le32(®s->ctrlr0, DFS_MASK, DFS_16BIT); + + /* Update caller's context */ + const u32 bytes_to_process = 2 * frames; + *din += bytes_to_process; + *len -= bytes_to_process; + + /* Process our frames */ + while (frames) { + u32 chunk_size = min(frames, max_chunk_size); + + frames -= chunk_size; + + writew(chunk_size - 1, ®s->ctrlr1); + rkspi_enable_chip(regs, true); + + do { + u32 rx_level = readw(®s->rxflr); +#if defined(DEBUG) + statistics_rxlevels[rx_level]++; +#endif + chunk_size -= rx_level; + while (rx_level--) + *in16++ = readw(regs->rxdr); + } while (chunk_size); + + rkspi_enable_chip(regs, false); + } + +#if defined(DEBUG) + debug("%s: observed rx_level during processing:\n", __func__); + for (int i = 0; i <= 32; ++i) + if (statistics_rxlevels[i]) + debug("\t%2d: %d\n", i, statistics_rxlevels[i]); +#endif + /* Restore the original transfer setup and return error-free. */ + writel(saved_ctrlr0, ®s->ctrlr0); + return 0; +} + static int rockchip_spi_xfer(struct udevice *dev, unsigned int bitlen, const void *dout, void *din, unsigned long flags) { @@ -344,7 +421,7 @@ static int rockchip_spi_xfer(struct udevice *dev, unsigned int bitlen, const u8 *out = dout; u8 *in = din; int toread, towrite; - int ret; + int ret = 0;
debug("%s: dout=%p, din=%p, len=%x, flags=%lx\n", __func__, dout, din, len, flags); @@ -355,6 +432,16 @@ static int rockchip_spi_xfer(struct udevice *dev, unsigned int bitlen, if (flags & SPI_XFER_BEGIN) spi_cs_activate(dev, slave_plat->cs);
+ /* + * To ensure fast loading of firmware images (e.g. full U-Boot + * stage, ATF, Linux kernel) from SPI flash, we optimise the + * case of read-only transfers by using the full 16bits of each + * FIFO element. + */ + if (!out) + ret = rockchip_spi_16bit_reader(dev, &in, &len); + + /* This is the original 8bit reader/writer code */ while (len > 0) { int todo = min(len, 0x10000);

For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI).
Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation.
To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter.
Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes).
With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com Reported-by: Klaus Goger klaus.goger@theobroma-systems.com
Series-Cc: Klaus Goger klaus.goger@theobroma-systems.com Series-Cc: Christoph Muellner christoph.muellner@theobroma-systems.com
drivers/spi/rk_spi.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-)
Applied to u-boot-rockchip, thanks!

The SPI controller's documentation (I only had access to the RK3399, RK3368 and PX30 TRMs) specifies that, when operating in master-mode, the controller will stop the SCLK to avoid RXFIFO overruns and TXFIFO underruns. Looks like my worries that we'd need to support DMA-330 (aka PL330) to make any further progress were unfounded.
This adds a driver-data structure to capture hardware-specific settings of individual controller instances (after all, we don't know if all versions are well-behaved) and adds a 'master_manages_fifo' flag to it. The first use of said flag is in the optimised receive-only transfer-handler, which can now request 64Kframe (i.e. 128KByte) bursts of data on each reprogramming of CTRLR1 (i.e. every time through the loop).
This improves throughput to 46.85MBit/s (a 94.65% bus-utilisation).
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com ---
drivers/spi/rk_spi.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index fec41a4..7b39d1c 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -26,6 +26,11 @@ /* Change to 1 to output registers at the start of each transaction */ #define DEBUG_RK_SPI 0
+struct rockchip_spi_params { + /* RXFIFO overruns and TXFIFO underruns stop the master clock */ + bool master_manages_fifo; +}; + struct rockchip_spi_platdata { #if CONFIG_IS_ENABLED(OF_PLATDATA) struct dtd_rockchip_rk3288_spi of_plat; @@ -364,6 +369,15 @@ static inline int rockchip_spi_16bit_reader(struct udevice *dev, return 0; }
+ /* + * If we know that the hardware will manage RXFIFO overruns + * (i.e. stop the SPI clock until there's space in the FIFO), + * we the allow largest possible chunk size that can be + * represented in CTRLR1. + */ + if (data && data->master_manages_fifo) + max_chunk_size = 0x10000; + // rockchip_spi_configure(dev, mode, size) rkspi_enable_chip(regs, false); clrsetbits_le32(®s->ctrlr0, @@ -524,10 +538,16 @@ static const struct dm_spi_ops rockchip_spi_ops = { */ };
+const struct rockchip_spi_params rk3399_spi_params = { + .master_manages_fifo = true, +}; + static const struct udevice_id rockchip_spi_ids[] = { { .compatible = "rockchip,rk3288-spi" }, - { .compatible = "rockchip,rk3368-spi" }, - { .compatible = "rockchip,rk3399-spi" }, + { .compatible = "rockchip,rk3368-spi", + .data = (ulong)&rk3399_spi_params }, + { .compatible = "rockchip,rk3399-spi", + .data = (ulong)&rk3399_spi_params }, { } };

The SPI controller's documentation (I only had access to the RK3399, RK3368 and PX30 TRMs) specifies that, when operating in master-mode, the controller will stop the SCLK to avoid RXFIFO overruns and TXFIFO underruns. Looks like my worries that we'd need to support DMA-330 (aka PL330) to make any further progress were unfounded.
This adds a driver-data structure to capture hardware-specific settings of individual controller instances (after all, we don't know if all versions are well-behaved) and adds a 'master_manages_fifo' flag to it. The first use of said flag is in the optimised receive-only transfer-handler, which can now request 64Kframe (i.e. 128KByte) bursts of data on each reprogramming of CTRLR1 (i.e. every time through the loop).
This improves throughput to 46.85MBit/s (a 94.65% bus-utilisation).
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com
drivers/spi/rk_spi.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-)
Applied to u-boot-rockchip, thanks!

To support unaligned output buffers (i.e. 'in' in the terminology of the SPI framework), this change splits each 16bit FIFO element after reading and writes them to memory in two 8bit transactions. With this change, we can now always use the optimised mode for receive-only transcations independent on the alignment of the target buffer.
Given that we'll run with caches on, the impact should be negligible: as expected, this has no adverse impact on throughput if running with a 960MHz LPLL configuration.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com ---
drivers/spi/rk_spi.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-)
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c index 7b39d1c..dceced9 100644 --- a/drivers/spi/rk_spi.c +++ b/drivers/spi/rk_spi.c @@ -353,23 +353,13 @@ static inline int rockchip_spi_16bit_reader(struct udevice *dev, u32 statistics_rxlevels[33] = { }; #endif u32 frames = *len / 2; - u16 *in16 = (u16 *)(*din); + u8 *in = (u8 *)(*din); u32 max_chunk_size = SPI_FIFO_DEPTH;
if (!frames) return 0;
/* - * If the destination buffer is unaligned, we'd run into a problem - * on ARMv8. Given that this doesn't seem to be a real issue, we - * just chicken out and fall back to the unoptimised implementation. - */ - if ((uintptr_t)*din & 1) { - debug("%s: unaligned buffer, din = %p\n", __func__, *din); - return 0; - } - - /* * If we know that the hardware will manage RXFIFO overruns * (i.e. stop the SPI clock until there's space in the FIFO), * we the allow largest possible chunk size that can be @@ -406,8 +396,11 @@ static inline int rockchip_spi_16bit_reader(struct udevice *dev, statistics_rxlevels[rx_level]++; #endif chunk_size -= rx_level; - while (rx_level--) - *in16++ = readw(regs->rxdr); + while (rx_level--) { + u16 val = readw(regs->rxdr); + *in++ = val & 0xff; + *in++ = val >> 8; + } } while (chunk_size);
rkspi_enable_chip(regs, false);

To support unaligned output buffers (i.e. 'in' in the terminology of the SPI framework), this change splits each 16bit FIFO element after reading and writes them to memory in two 8bit transactions. With this change, we can now always use the optimised mode for receive-only transcations independent on the alignment of the target buffer.
Given that we'll run with caches on, the impact should be negligible: as expected, this has no adverse impact on throughput if running with a 960MHz LPLL configuration.
Signed-off-by: Philipp Tomsich philipp.tomsich@theobroma-systems.com
drivers/spi/rk_spi.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-)
Applied to u-boot-rockchip, thanks!
participants (1)
-
Philipp Tomsich