[PATCH v3 0/3] dwc3: gadget: properly fix cache operations

We experience huge problems with cache handling on Qualcomm systems, and it appears the dcache handling in the DWC3 gadget code is quite wrong and causes operational issues.
This serie fixes the dcache operations on unaligned data, and properly invalidate buffers when reading back data from hardware.
Signed-off-by: Neil Armstrong neil.armstrong@linaro.org --- Changes in v3: - Cast addresses to (unsigned long) when calling invalidate_dcache_range() - Drop unused CACHELINE_SIZE - Fix warning by casting ctrl to uintptr_r when calling dwc3_invalidate_cache() - Link to v2: https://lore.kernel.org/r/20240724-u-boot-dwc3-gadget-dcache-fixup-v2-0-6583...
Changes in v2: - Fix typo in drivers/usb/dwc3/core.h and rewrite patch 1 commit message - Link to v1: https://lore.kernel.org/r/20240719-u-boot-dwc3-gadget-dcache-fixup-v1-0-58a5...
--- Neil Armstrong (3): usb: dwc3: allocate setup_buf with dma_alloc_coherent() usb: dwc3: fix dcache flush range calculation usb: dwc3: invalidate dcache on buffer used in interrupt handling
drivers/usb/dwc3/core.h | 2 ++ drivers/usb/dwc3/ep0.c | 6 ++++-- drivers/usb/dwc3/gadget.c | 10 ++++++---- drivers/usb/dwc3/io.h | 14 ++++++++++++-- 4 files changed, 24 insertions(+), 8 deletions(-) --- base-commit: ddbcafeb53e7093c58488596bfce6d8823777c3a change-id: 20240719-u-boot-dwc3-gadget-dcache-fixup-ea1e92758663
Best regards,

Since setup_buf is also consumed by hardware DMA, aligns it's allocation like other hardware buffers by introduce setup_buf_addr populated by dma_alloc_coherent(), and use it to pass the physical address of the buffer to the hardware.
Reviewed-by: Mattijs Korpershoek mkorpershoek@baylibre.com Signed-off-by: Neil Armstrong neil.armstrong@linaro.org --- drivers/usb/dwc3/core.h | 2 ++ drivers/usb/dwc3/ep0.c | 4 ++-- drivers/usb/dwc3/gadget.c | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 7374ce950da..b572ea340c8 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -670,6 +670,7 @@ struct dwc3_scratchpad_array { * @ep0_trb: dma address of ep0_trb * @ep0_usb_req: dummy req used while handling STD USB requests * @ep0_bounce_addr: dma address of ep0_bounce + * @setup_buf_addr: dma address of setup_buf * @scratch_addr: dma address of scratchbuf * @lock: for synchronizing * @dev: pointer to our struct device @@ -757,6 +758,7 @@ struct dwc3 { dma_addr_t ep0_trb_addr; dma_addr_t ep0_bounce_addr; dma_addr_t scratch_addr; + dma_addr_t setup_buf_addr; struct dwc3_request ep0_usb_req;
/* device lock */ diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 24f516a131b..8ba5fcd5312 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -380,7 +380,7 @@ static int dwc3_ep0_handle_status(struct dwc3 *dwc, dep = dwc->eps[0]; dwc->ep0_usb_req.dep = dep; dwc->ep0_usb_req.request.length = sizeof(*response_pkt); - dwc->ep0_usb_req.request.buf = dwc->setup_buf; + dwc->ep0_usb_req.request.buf = (void *)dwc->setup_buf_addr; dwc->ep0_usb_req.request.complete = dwc3_ep0_status_cmpl;
return __dwc3_gadget_ep0_queue(dep, &dwc->ep0_usb_req); @@ -662,7 +662,7 @@ static int dwc3_ep0_set_sel(struct dwc3 *dwc, struct usb_ctrlrequest *ctrl) dep = dwc->eps[0]; dwc->ep0_usb_req.dep = dep; dwc->ep0_usb_req.request.length = dep->endpoint.maxpacket; - dwc->ep0_usb_req.request.buf = dwc->setup_buf; + dwc->ep0_usb_req.request.buf = (void *)dwc->setup_buf_addr; dwc->ep0_usb_req.request.complete = dwc3_ep0_set_sel_cmpl;
return __dwc3_gadget_ep0_queue(dep, &dwc->ep0_usb_req); diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index fe33e307d3e..19c3a5f5e58 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2653,8 +2653,8 @@ int dwc3_gadget_init(struct dwc3 *dwc) goto err1; }
- dwc->setup_buf = memalign(CONFIG_SYS_CACHELINE_SIZE, - DWC3_EP0_BOUNCE_SIZE); + dwc->setup_buf = dma_alloc_coherent(DWC3_EP0_BOUNCE_SIZE, + (unsigned long *)&dwc->setup_buf_addr); if (!dwc->setup_buf) { ret = -ENOMEM; goto err2; @@ -2701,7 +2701,7 @@ err4: dma_free_coherent(dwc->ep0_bounce);
err3: - kfree(dwc->setup_buf); + dma_free_coherent(dwc->setup_buf);
err2: dma_free_coherent(dwc->ep0_trb); @@ -2723,7 +2723,7 @@ void dwc3_gadget_exit(struct dwc3 *dwc)
dma_free_coherent(dwc->ep0_bounce);
- kfree(dwc->setup_buf); + dma_free_coherent(dwc->setup_buf);
dma_free_coherent(dwc->ep0_trb);

On 10/2/24 4:39 PM, Neil Armstrong wrote:
Since setup_buf is also consumed by hardware DMA, aligns it's allocation like other hardware buffers by introduce setup_buf_addr populated by dma_alloc_coherent(), and use it to pass the physical address of the buffer to the hardware.
Reviewed-by: Mattijs Korpershoek mkorpershoek@baylibre.com Signed-off-by: Neil Armstrong neil.armstrong@linaro.org
Reviewed-by: Marek Vasut marex@denx.de

The current flush operation will omit doing a flush/invalidate on the first and last bytes if the base address and size are not aligned with DMA_MINALIGN.
This causes operation failures Qualcomm platforms.
Take in account the alignment and size of the buffer and also flush the previous and last cacheline.
Remove CACHELINE_SIZE which was the same as DMA_MINALIGN.
Reviewed-by: Mattijs Korpershoek mkorpershoek@baylibre.com Signed-off-by: Neil Armstrong neil.armstrong@linaro.org --- drivers/usb/dwc3/io.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/dwc3/io.h b/drivers/usb/dwc3/io.h index 04791d4c9be..a6c2bb0f47d 100644 --- a/drivers/usb/dwc3/io.h +++ b/drivers/usb/dwc3/io.h @@ -20,7 +20,6 @@ #include <cpu_func.h> #include <asm/io.h>
-#define CACHELINE_SIZE CONFIG_SYS_CACHELINE_SIZE static inline u32 dwc3_readl(void __iomem *base, u32 offset) { unsigned long offs = offset - DWC3_GLOBALS_REGS_START; @@ -50,6 +49,9 @@ static inline void dwc3_writel(void __iomem *base, u32 offset, u32 value)
static inline void dwc3_flush_cache(uintptr_t addr, int length) { - flush_dcache_range(addr, addr + ROUND(length, CACHELINE_SIZE)); + uintptr_t start_addr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1); + uintptr_t end_addr = ALIGN((uintptr_t)addr + length, ARCH_DMA_MINALIGN); + + flush_dcache_range((unsigned long)start_addr, (unsigned long)end_addr); } #endif /* __DRIVERS_USB_DWC3_IO_H */

On 10/2/24 4:39 PM, Neil Armstrong wrote:
The current flush operation will omit doing a flush/invalidate on the first and last bytes if the base address and size are not aligned with DMA_MINALIGN.
This causes operation failures Qualcomm platforms.
Take in account the alignment and size of the buffer and also flush the previous and last cacheline.
Remove CACHELINE_SIZE which was the same as DMA_MINALIGN.
It isn't the same, CACHELINE_SIZE was set to CONFIG_SYS_CACHELINE_SIZE (CPU L1 cache cacheline length) while ARCH_DMA_MINALIGN is DMA engine alignment requirement (from times where there used to be one DMA engine on most devices). You likely want a max(CONFIG_SYS_CACHELINE_SIZE, dwc3-buffer-alignment-requirement) to really correctly align the buffer.
Reviewed-by: Mattijs Korpershoek mkorpershoek@baylibre.com Signed-off-by: Neil Armstrong neil.armstrong@linaro.org
drivers/usb/dwc3/io.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/dwc3/io.h b/drivers/usb/dwc3/io.h index 04791d4c9be..a6c2bb0f47d 100644 --- a/drivers/usb/dwc3/io.h +++ b/drivers/usb/dwc3/io.h @@ -20,7 +20,6 @@ #include <cpu_func.h> #include <asm/io.h>
-#define CACHELINE_SIZE CONFIG_SYS_CACHELINE_SIZE static inline u32 dwc3_readl(void __iomem *base, u32 offset) { unsigned long offs = offset - DWC3_GLOBALS_REGS_START; @@ -50,6 +49,9 @@ static inline void dwc3_writel(void __iomem *base, u32 offset, u32 value)
static inline void dwc3_flush_cache(uintptr_t addr, int length) {
- flush_dcache_range(addr, addr + ROUND(length, CACHELINE_SIZE));
- uintptr_t start_addr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1);
- uintptr_t end_addr = ALIGN((uintptr_t)addr + length, ARCH_DMA_MINALIGN);
- flush_dcache_range((unsigned long)start_addr, (unsigned long)end_addr); }

On 02/10/2024 16:55, Marek Vasut wrote:
On 10/2/24 4:39 PM, Neil Armstrong wrote:
The current flush operation will omit doing a flush/invalidate on the first and last bytes if the base address and size are not aligned with DMA_MINALIGN.
This causes operation failures Qualcomm platforms.
Take in account the alignment and size of the buffer and also flush the previous and last cacheline.
Remove CACHELINE_SIZE which was the same as DMA_MINALIGN.
It isn't the same, CACHELINE_SIZE was set to CONFIG_SYS_CACHELINE_SIZE (CPU L1 cache cacheline length) while ARCH_DMA_MINALIGN is DMA engine alignment requirement (from times where there used to be one DMA engine on most devices). You likely want a max(CONFIG_SYS_CACHELINE_SIZE, dwc3-buffer-alignment-requirement) to really correctly align the buffer.
It is definitely true for platforms declaring dma_alloc_coherent() (arm, riscv, x86) except nios2 but there's 0 chance dwc3 appears on a nios2 platform.
Neil
Reviewed-by: Mattijs Korpershoek mkorpershoek@baylibre.com Signed-off-by: Neil Armstrong neil.armstrong@linaro.org
drivers/usb/dwc3/io.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/dwc3/io.h b/drivers/usb/dwc3/io.h index 04791d4c9be..a6c2bb0f47d 100644 --- a/drivers/usb/dwc3/io.h +++ b/drivers/usb/dwc3/io.h @@ -20,7 +20,6 @@ #include <cpu_func.h> #include <asm/io.h> -#define CACHELINE_SIZE CONFIG_SYS_CACHELINE_SIZE static inline u32 dwc3_readl(void __iomem *base, u32 offset) { unsigned long offs = offset - DWC3_GLOBALS_REGS_START; @@ -50,6 +49,9 @@ static inline void dwc3_writel(void __iomem *base, u32 offset, u32 value) static inline void dwc3_flush_cache(uintptr_t addr, int length) { - flush_dcache_range(addr, addr + ROUND(length, CACHELINE_SIZE)); + uintptr_t start_addr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1); + uintptr_t end_addr = ALIGN((uintptr_t)addr + length, ARCH_DMA_MINALIGN);
+ flush_dcache_range((unsigned long)start_addr, (unsigned long)end_addr); }

On 10/3/24 2:49 PM, Neil Armstrong wrote:
On 02/10/2024 16:55, Marek Vasut wrote:
On 10/2/24 4:39 PM, Neil Armstrong wrote:
The current flush operation will omit doing a flush/invalidate on the first and last bytes if the base address and size are not aligned with DMA_MINALIGN.
This causes operation failures Qualcomm platforms.
Take in account the alignment and size of the buffer and also flush the previous and last cacheline.
Remove CACHELINE_SIZE which was the same as DMA_MINALIGN.
It isn't the same, CACHELINE_SIZE was set to CONFIG_SYS_CACHELINE_SIZE (CPU L1 cache cacheline length) while ARCH_DMA_MINALIGN is DMA engine alignment requirement (from times where there used to be one DMA engine on most devices). You likely want a max(CONFIG_SYS_CACHELINE_SIZE, dwc3-buffer-alignment-requirement) to really correctly align the buffer.
It is definitely true for platforms declaring dma_alloc_coherent() (arm, riscv, x86) except nios2 but there's 0 chance dwc3 appears on a nios2 platform.
There is real chance of that, because on modern SoCFPGA platforms (Agilex) you can have the FPGA content access the SoC peripherals, and one of the SoC peripherals is DWC3 controller. If anyone would actually synthesize it is another question ... but it is an FPGA, so that option exists.

On 03/10/2024 15:19, Marek Vasut wrote:
On 10/3/24 2:49 PM, Neil Armstrong wrote:
On 02/10/2024 16:55, Marek Vasut wrote:
On 10/2/24 4:39 PM, Neil Armstrong wrote:
The current flush operation will omit doing a flush/invalidate on the first and last bytes if the base address and size are not aligned with DMA_MINALIGN.
This causes operation failures Qualcomm platforms.
Take in account the alignment and size of the buffer and also flush the previous and last cacheline.
Remove CACHELINE_SIZE which was the same as DMA_MINALIGN.
It isn't the same, CACHELINE_SIZE was set to CONFIG_SYS_CACHELINE_SIZE (CPU L1 cache cacheline length) while ARCH_DMA_MINALIGN is DMA engine alignment requirement (from times where there used to be one DMA engine on most devices). You likely want a max(CONFIG_SYS_CACHELINE_SIZE, dwc3-buffer-alignment-requirement) to really correctly align the buffer.
It is definitely true for platforms declaring dma_alloc_coherent() (arm, riscv, x86) except nios2 but there's 0 chance dwc3 appears on a nios2 platform.
There is real chance of that, because on modern SoCFPGA platforms (Agilex) you can have the FPGA content access the SoC peripherals, and one of the SoC peripherals is DWC3 controller. If anyone would actually synthesize it is another question ... but it is an FPGA, so that option exists.
Guess I'll switch to CACHELINE_SIZE instead of DMA_MINALIGN for nios2.
Neil

On 10/4/24 9:16 AM, neil.armstrong@linaro.org wrote:
On 03/10/2024 15:19, Marek Vasut wrote:
On 10/3/24 2:49 PM, Neil Armstrong wrote:
On 02/10/2024 16:55, Marek Vasut wrote:
On 10/2/24 4:39 PM, Neil Armstrong wrote:
The current flush operation will omit doing a flush/invalidate on the first and last bytes if the base address and size are not aligned with DMA_MINALIGN.
This causes operation failures Qualcomm platforms.
Take in account the alignment and size of the buffer and also flush the previous and last cacheline.
Remove CACHELINE_SIZE which was the same as DMA_MINALIGN.
It isn't the same, CACHELINE_SIZE was set to CONFIG_SYS_CACHELINE_SIZE (CPU L1 cache cacheline length) while ARCH_DMA_MINALIGN is DMA engine alignment requirement (from times where there used to be one DMA engine on most devices). You likely want a max(CONFIG_SYS_CACHELINE_SIZE, dwc3-buffer-alignment- requirement) to really correctly align the buffer.
It is definitely true for platforms declaring dma_alloc_coherent() (arm, riscv, x86) except nios2 but there's 0 chance dwc3 appears on a nios2 platform.
There is real chance of that, because on modern SoCFPGA platforms (Agilex) you can have the FPGA content access the SoC peripherals, and one of the SoC peripherals is DWC3 controller. If anyone would actually synthesize it is another question ... but it is an FPGA, so that option exists.
Guess I'll switch to CACHELINE_SIZE instead of DMA_MINALIGN for nios2.
I think max(CONFIG_SYS_CACHELINE_SIZE, ARCH_DMA_MINALIGN) should cover all the cases ?

On Qualcomm systems, the setup buffer and even buffers are in a bad state at interrupt handling, so invalidate the dcache lines for the setup_buf and event buffer to make sure we read correct data written by the hardware.
This fixes the following error: dwc3-generic-peripheral usb@a600000: UNKNOWN IRQ type -1 dwc3-generic-peripheral usb@a600000: UNKNOWN IRQ type 4673109
and invalid situation in dwc3_gadget_giveback() because setup_buf content is read at 0s and leads to fatal crash fixed by [1].
[1] https://lore.kernel.org/all/20240528-topic-sm8x50-dwc3-gadget-crash-fix-v1-1...
Reviewed-by: Mattijs Korpershoek mkorpershoek@baylibre.com Signed-off-by: Neil Armstrong neil.armstrong@linaro.org --- drivers/usb/dwc3/ep0.c | 2 ++ drivers/usb/dwc3/gadget.c | 2 ++ drivers/usb/dwc3/io.h | 8 ++++++++ 3 files changed, 12 insertions(+)
diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index 8ba5fcd5312..531f0b522af 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -742,6 +742,8 @@ static void dwc3_ep0_inspect_setup(struct dwc3 *dwc, if (!dwc->gadget_driver) goto out;
+ dwc3_invalidate_cache((uintptr_t)ctrl, sizeof(*ctrl)); + len = le16_to_cpu(ctrl->wLength); if (!len) { dwc->three_stage_setup = false; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 19c3a5f5e58..e5a383407a2 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2534,6 +2534,8 @@ static irqreturn_t dwc3_process_event_buf(struct dwc3 *dwc, u32 buf) while (left > 0) { union dwc3_event event;
+ dwc3_invalidate_cache((uintptr_t)evt->buf, evt->length); + event.raw = *(u32 *) (evt->buf + evt->lpos);
dwc3_process_event_entry(dwc, &event); diff --git a/drivers/usb/dwc3/io.h b/drivers/usb/dwc3/io.h index a6c2bb0f47d..b347a7f1fc1 100644 --- a/drivers/usb/dwc3/io.h +++ b/drivers/usb/dwc3/io.h @@ -54,4 +54,12 @@ static inline void dwc3_flush_cache(uintptr_t addr, int length)
flush_dcache_range((unsigned long)start_addr, (unsigned long)end_addr); } + +static inline void dwc3_invalidate_cache(uintptr_t addr, int length) +{ + uintptr_t start_addr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1); + uintptr_t end_addr = ALIGN((uintptr_t)addr + length, ARCH_DMA_MINALIGN); + + invalidate_dcache_range((unsigned long)start_addr, (unsigned long)end_addr); +} #endif /* __DRIVERS_USB_DWC3_IO_H */
participants (3)
-
Marek Vasut
-
Neil Armstrong
-
neil.armstrong@linaro.org