[PATCH 0/4] ufs: properly fix cache operations

We are seeing serious cache-handling problems on Qualcomm systems, and it appears the dcache handling in the UFS core is quite wrong and is the cause of those issues.
This series fixes the dcache operations, and also fixes a big data write corruption caused by a wrong data end address calculation.
Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
---
Neil Armstrong (4):
      ufs: allocate descriptors with size aligned with DMA_MINALIGN
      ufs: fix dcache flush and invalidate range calculation
      ufs: split flush and invalidate to only invalidate when required
      ufs: use dcache helpers for scsi_cmd data and only invalidate if necessary

 drivers/ufs/ufs.c | 74 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 44 insertions(+), 30 deletions(-)
---
base-commit: 3f772959501c99fbe5aa0b22a36efe3478d1ae1c
change-id: 20240719-u-boot-ufs-dcache-fixup-d9a980a97835
Best regards,

[PATCH 1/4] ufs: allocate descriptors with size aligned with DMA_MINALIGN

Align the allocation size with ARCH_DMA_MINALIGN to make sure we do not flush/invalidate data belonging to the following allocations.

Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
---
 drivers/ufs/ufs.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/ufs/ufs.c b/drivers/ufs/ufs.c
index e4400f319a7..bc86903245c 100644
--- a/drivers/ufs/ufs.c
+++ b/drivers/ufs/ufs.c
@@ -634,7 +634,9 @@ static int ufshcd_memory_alloc(struct ufs_hba *hba)
 	/* Allocate one Transfer Request Descriptor
 	 * Should be aligned to 1k boundary.
 	 */
-	hba->utrdl = memalign(1024, sizeof(struct utp_transfer_req_desc));
+	hba->utrdl = memalign(1024,
+			      ALIGN(sizeof(struct utp_transfer_req_desc),
+				    ARCH_DMA_MINALIGN));
 	if (!hba->utrdl) {
 		dev_err(hba->dev, "Transfer Descriptor memory allocation failed\n");
 		return -ENOMEM;
@@ -643,7 +645,9 @@ static int ufshcd_memory_alloc(struct ufs_hba *hba)
 	/* Allocate one Command Descriptor
 	 * Should be aligned to 1k boundary.
 	 */
-	hba->ucdl = memalign(1024, sizeof(struct utp_transfer_cmd_desc));
+	hba->ucdl = memalign(1024,
+			     ALIGN(sizeof(struct utp_transfer_cmd_desc),
+				   ARCH_DMA_MINALIGN));
 	if (!hba->ucdl) {
 		dev_err(hba->dev, "Command descriptor memory allocation failed\n");
 		return -ENOMEM;
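As an illustration (not part of the patch), a minimal sketch with hypothetical sizes and a 64-byte stand-in for ARCH_DMA_MINALIGN: dcache maintenance works on whole cachelines, so if the descriptor does not own its last line, flushing/invalidating it also hits whatever the heap placed next.

	/* Hypothetical illustration only, not driver code */
	#include <malloc.h>			/* memalign()/malloc() */

	#define EXAMPLE_CACHELINE	64	/* stand-in for ARCH_DMA_MINALIGN */

	static void alignment_hazard_sketch(void)
	{
		void *desc = memalign(1024, 100);	/* last cacheline only partially owned */
		void *next = malloc(32);		/* may be placed in desc's last cacheline */

		/*
		 * A flush/invalidate of the cacheline-rounded range
		 * [desc, desc + 128) also covers the line that may hold
		 * 'next', possibly discarding its dirty data. Allocating
		 * ALIGN(100, EXAMPLE_CACHELINE) = 128 bytes instead gives
		 * 'desc' sole ownership of that last line.
		 */
		(void)desc; (void)next;
	}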

[PATCH 2/4] ufs: fix dcache flush and invalidate range calculation

The current calculation omits the last cacheline from the flush/invalidate when the base address is not aligned to ARCH_DMA_MINALIGN.
This causes command failures and write corruption on Qualcomm platforms.

Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
---
 drivers/ufs/ufs.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/ufs/ufs.c b/drivers/ufs/ufs.c
index bc86903245c..db487d1ff9a 100644
--- a/drivers/ufs/ufs.c
+++ b/drivers/ufs/ufs.c
@@ -704,11 +704,11 @@ static inline u8 ufshcd_get_upmcrs(struct ufs_hba *hba)
  */
 static void ufshcd_cache_flush_and_invalidate(void *addr, unsigned long size)
 {
-	uintptr_t aaddr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1);
-	unsigned long asize = ALIGN(size, ARCH_DMA_MINALIGN);
+	uintptr_t start_addr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1);
+	uintptr_t end_addr = ALIGN((uintptr_t)addr + size, ARCH_DMA_MINALIGN);
 
-	flush_dcache_range(aaddr, aaddr + asize);
-	invalidate_dcache_range(aaddr, aaddr + asize);
+	flush_dcache_range(start_addr, end_addr);
+	invalidate_dcache_range(start_addr, end_addr);
 }
 
 /**
@@ -1467,13 +1467,13 @@ static void prepare_prdt_table(struct ufs_hba *hba, struct scsi_cmd *pccb)
 	}
 
 	if (pccb->dma_dir == DMA_TO_DEVICE) {	/* Write to device */
-		flush_dcache_range(aaddr, aaddr +
-				   ALIGN(datalen, ARCH_DMA_MINALIGN));
+		flush_dcache_range(aaddr,
+				   ALIGN((uintptr_t)pccb->pdata + datalen, ARCH_DMA_MINALIGN));
 	}
 
 	/* In any case, invalidate cache to avoid stale data in it. */
-	invalidate_dcache_range(aaddr, aaddr +
-				ALIGN(datalen, ARCH_DMA_MINALIGN));
+	invalidate_dcache_range(aaddr,
+				ALIGN((uintptr_t)pccb->pdata + datalen, ARCH_DMA_MINALIGN));
 
 	table_length = DIV_ROUND_UP(pccb->datalen, MAX_PRDT_ENTRY);
 	buf = pccb->pdata;
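A worked example of the fix, with hypothetical addresses and 64-byte cachelines:

	/*
	 * addr = 0x1010, size = 0x100 -> data spans 0x1010..0x1110
	 *
	 * Old calculation:
	 *   aaddr = 0x1010 & ~63     = 0x1000
	 *   asize = ALIGN(0x100, 64) = 0x100
	 *   range = 0x1000..0x1100   -> bytes 0x1100..0x110f are missed
	 *
	 * New calculation:
	 *   start_addr = 0x1000
	 *   end_addr   = ALIGN(0x1010 + 0x100, 64) = 0x1140
	 *   range      = 0x1000..0x1140 -> covers the whole buffer
	 */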

[PATCH 3/4] ufs: split flush and invalidate to only invalidate when required

There is no need to flush and invalidate all data updated by the driver, mainly because on ARM platforms a flush also invalidates the affected cachelines.
Split the function in two, and add the appropriate cacheline invalidates after the UFS DMA operation finishes to make sure we read the updated data from memory.
Flushing then invalidating cacheline-unaligned data causes data corruption on Qualcomm platforms, and is largely unnecessary anyway, so let's clean up the cache operations.

Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
---
 drivers/ufs/ufs.c | 45 ++++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)
diff --git a/drivers/ufs/ufs.c b/drivers/ufs/ufs.c
index db487d1ff9a..76cda83ce81 100644
--- a/drivers/ufs/ufs.c
+++ b/drivers/ufs/ufs.c
@@ -697,17 +697,28 @@ static inline u8 ufshcd_get_upmcrs(struct ufs_hba *hba)
 }
 
 /**
- * ufshcd_cache_flush_and_invalidate - Flush and invalidate cache
+ * ufshcd_cache_flush - Flush cache
  *
- * Flush and invalidate cache in aligned address..address+size range.
- * The invalidation is in place to avoid stale data in cache.
+ * Flush cache in aligned address..address+size range.
  */
-static void ufshcd_cache_flush_and_invalidate(void *addr, unsigned long size)
+static void ufshcd_cache_flush(void *addr, unsigned long size)
 {
 	uintptr_t start_addr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1);
 	uintptr_t end_addr = ALIGN((uintptr_t)addr + size, ARCH_DMA_MINALIGN);
 
 	flush_dcache_range(start_addr, end_addr);
+}
+
+/**
+ * ufshcd_cache_invalidate - Invalidate cache
+ *
+ * Invalidate cache in aligned address..address+size range.
+ */
+static void ufshcd_cache_invalidate(void *addr, unsigned long size)
+{
+	uintptr_t start_addr = (uintptr_t)addr & ~(ARCH_DMA_MINALIGN - 1);
+	uintptr_t end_addr = ALIGN((uintptr_t)addr + size, ARCH_DMA_MINALIGN);
+
 	invalidate_dcache_range(start_addr, end_addr);
 }
 
@@ -755,7 +766,7 @@ static void ufshcd_prepare_req_desc_hdr(struct ufs_hba *hba,
 
 	req_desc->prd_table_length = 0;
 
-	ufshcd_cache_flush_and_invalidate(req_desc, sizeof(*req_desc));
+	ufshcd_cache_flush(req_desc, sizeof(*req_desc));
 }
 
 static void ufshcd_prepare_utp_query_req_upiu(struct ufs_hba *hba,
@@ -786,13 +797,13 @@ static void ufshcd_prepare_utp_query_req_upiu(struct ufs_hba *hba,
 	/* Copy the Descriptor */
 	if (query->request.upiu_req.opcode == UPIU_QUERY_OPCODE_WRITE_DESC) {
 		memcpy(ucd_req_ptr + 1, query->descriptor, len);
-		ufshcd_cache_flush_and_invalidate(ucd_req_ptr, 2 * sizeof(*ucd_req_ptr));
+		ufshcd_cache_flush(ucd_req_ptr, 2 * sizeof(*ucd_req_ptr));
 	} else {
-		ufshcd_cache_flush_and_invalidate(ucd_req_ptr, sizeof(*ucd_req_ptr));
+		ufshcd_cache_flush(ucd_req_ptr, sizeof(*ucd_req_ptr));
 	}
 
 	memset(hba->ucd_rsp_ptr, 0, sizeof(struct utp_upiu_rsp));
-	ufshcd_cache_flush_and_invalidate(hba->ucd_rsp_ptr, sizeof(*hba->ucd_rsp_ptr));
+	ufshcd_cache_flush(hba->ucd_rsp_ptr, sizeof(*hba->ucd_rsp_ptr));
 }
 
 static inline void ufshcd_prepare_utp_nop_upiu(struct ufs_hba *hba)
@@ -810,8 +821,8 @@ static inline void ufshcd_prepare_utp_nop_upiu(struct ufs_hba *hba)
 
 	memset(hba->ucd_rsp_ptr, 0, sizeof(struct utp_upiu_rsp));
 
-	ufshcd_cache_flush_and_invalidate(ucd_req_ptr, sizeof(*ucd_req_ptr));
-	ufshcd_cache_flush_and_invalidate(hba->ucd_rsp_ptr, sizeof(*hba->ucd_rsp_ptr));
+	ufshcd_cache_flush(ucd_req_ptr, sizeof(*ucd_req_ptr));
+	ufshcd_cache_flush(hba->ucd_rsp_ptr, sizeof(*hba->ucd_rsp_ptr));
 }
 
 /**
@@ -878,6 +889,8 @@ static int ufshcd_send_command(struct ufs_hba *hba, unsigned int task_tag)
  */
 static inline int ufshcd_get_req_rsp(struct utp_upiu_rsp *ucd_rsp_ptr)
 {
+	ufshcd_cache_invalidate(ucd_rsp_ptr, sizeof(*ucd_rsp_ptr));
+
 	return be32_to_cpu(ucd_rsp_ptr->header.dword_0) >> 24;
 }
 
@@ -889,6 +902,8 @@ static inline int ufshcd_get_tr_ocs(struct ufs_hba *hba)
 {
 	struct utp_transfer_req_desc *req_desc = hba->utrdl;
 
+	ufshcd_cache_invalidate(req_desc, sizeof(*req_desc));
+
 	return le32_to_cpu(req_desc->header.dword_2) & MASK_OCS;
 }
 
@@ -1438,8 +1453,8 @@ void ufshcd_prepare_utp_scsi_cmd_upiu(struct ufs_hba *hba,
 	memcpy(ucd_req_ptr->sc.cdb, pccb->cmd, cdb_len);
 
 	memset(hba->ucd_rsp_ptr, 0, sizeof(struct utp_upiu_rsp));
-	ufshcd_cache_flush_and_invalidate(ucd_req_ptr, sizeof(*ucd_req_ptr));
-	ufshcd_cache_flush_and_invalidate(hba->ucd_rsp_ptr, sizeof(*hba->ucd_rsp_ptr));
+	ufshcd_cache_flush(ucd_req_ptr, sizeof(*ucd_req_ptr));
+	ufshcd_cache_flush(hba->ucd_rsp_ptr, sizeof(*hba->ucd_rsp_ptr));
 }
 
 static inline void prepare_prdt_desc(struct ufshcd_sg_entry *entry,
@@ -1462,7 +1477,7 @@ static void prepare_prdt_table(struct ufs_hba *hba, struct scsi_cmd *pccb)
 
 	if (!datalen) {
 		req_desc->prd_table_length = 0;
-		ufshcd_cache_flush_and_invalidate(req_desc, sizeof(*req_desc));
+		ufshcd_cache_flush(req_desc, sizeof(*req_desc));
 		return;
 	}
 
@@ -1488,8 +1503,8 @@ static void prepare_prdt_table(struct ufs_hba *hba, struct scsi_cmd *pccb)
 	prepare_prdt_desc(&prd_table[table_length - i - 1], buf, datalen - 1);
 
 	req_desc->prd_table_length = table_length;
-	ufshcd_cache_flush_and_invalidate(prd_table, sizeof(*prd_table) * table_length);
-	ufshcd_cache_flush_and_invalidate(req_desc, sizeof(*req_desc));
+	ufshcd_cache_flush(prd_table, sizeof(*prd_table) * table_length);
+	ufshcd_cache_flush(req_desc, sizeof(*req_desc));
 }
 
 static int ufs_scsi_exec(struct udevice *scsi_dev, struct scsi_cmd *pccb)
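The resulting coherency pattern, sketched with the helpers introduced above (the flow restates the hunks; 'resp' is a hypothetical local used only for illustration):

	/* CPU-filled descriptor: clean it out to RAM before ringing the doorbell */
	req_desc->prd_table_length = 0;
	ufshcd_cache_flush(req_desc, sizeof(*req_desc));

	/* ... the UFS controller DMAs its response into ucd_rsp_ptr ... */

	/* discard stale cachelines before the CPU reads the response */
	ufshcd_cache_invalidate(ucd_rsp_ptr, sizeof(*ucd_rsp_ptr));
	resp = be32_to_cpu(ucd_rsp_ptr->header.dword_0) >> 24;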

[PATCH 4/4] ufs: use dcache helpers for scsi_cmd data and only invalidate if necessary

Now that we have proper flush and invalidate helpers, we can use them directly on the scsi_cmd data.
Likewise, we do not need to flush and then invalidate: just flush _or_ invalidate depending on the data direction.

Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
---
 drivers/ufs/ufs.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)
diff --git a/drivers/ufs/ufs.c b/drivers/ufs/ufs.c
index 76cda83ce81..646d4856e9c 100644
--- a/drivers/ufs/ufs.c
+++ b/drivers/ufs/ufs.c
@@ -1469,7 +1469,6 @@ static void prepare_prdt_table(struct ufs_hba *hba, struct scsi_cmd *pccb)
 {
 	struct utp_transfer_req_desc *req_desc = hba->utrdl;
 	struct ufshcd_sg_entry *prd_table = hba->ucd_prdt_ptr;
-	uintptr_t aaddr = (uintptr_t)(pccb->pdata) & ~(ARCH_DMA_MINALIGN - 1);
 	ulong datalen = pccb->datalen;
 	int table_length;
 	u8 *buf;
@@ -1481,14 +1480,10 @@ static void prepare_prdt_table(struct ufs_hba *hba, struct scsi_cmd *pccb)
 		return;
 	}
 
-	if (pccb->dma_dir == DMA_TO_DEVICE) {	/* Write to device */
-		flush_dcache_range(aaddr,
-				   ALIGN((uintptr_t)pccb->pdata + datalen, ARCH_DMA_MINALIGN));
-	}
-
-	/* In any case, invalidate cache to avoid stale data in it. */
-	invalidate_dcache_range(aaddr,
-				ALIGN((uintptr_t)pccb->pdata + datalen, ARCH_DMA_MINALIGN));
+	if (pccb->dma_dir == DMA_TO_DEVICE)
+		ufshcd_cache_flush(pccb->pdata, datalen);
+	else
+		ufshcd_cache_invalidate(pccb->pdata, datalen);
 
 	table_length = DIV_ROUND_UP(pccb->datalen, MAX_PRDT_ENTRY);
 	buf = pccb->pdata;
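For clarity, the direction rule above follows the usual DMA convention; this restates the hunk with the reasoning spelled out in comments (not extra driver code):

	/*
	 * DMA_TO_DEVICE: the CPU wrote pccb->pdata and the device will read
	 * it, so clean (flush) the cachelines out to RAM.
	 * Otherwise: the device will write pccb->pdata and the CPU will read
	 * it afterwards, so invalidate to avoid returning stale cachelines.
	 * Invalidating on the write path too, as the old code did, could
	 * discard dirty data sharing the buffer's first or last cacheline.
	 */
	if (pccb->dma_dir == DMA_TO_DEVICE)
		ufshcd_cache_flush(pccb->pdata, datalen);
	else
		ufshcd_cache_invalidate(pccb->pdata, datalen);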

Hi,
On Fri, 19 Jul 2024 13:00:09 +0200, Neil Armstrong wrote:
> We are seeing serious cache-handling problems on Qualcomm systems, and it
> appears the dcache handling in the UFS core is quite wrong and is the cause
> of those issues.
> This series fixes the dcache operations, and also fixes a big data write
> corruption caused by a wrong data end address calculation.
[...]
Thanks,
Applied to https://source.denx.de/u-boot/custodians/u-boot-ufs (u-boot-ufs-next)

[1/4] ufs: allocate descriptors with size aligned with DMA_MINALIGN
      https://source.denx.de/u-boot/custodians/u-boot-ufs/-/commit/9c223d8d8b8fbad...
[2/4] ufs: fix dcache flush and invalidate range calculation
      https://source.denx.de/u-boot/custodians/u-boot-ufs/-/commit/c64d22b57d8a9fe...
[3/4] ufs: split flush and invalidate to only invalidate when required
      https://source.denx.de/u-boot/custodians/u-boot-ufs/-/commit/4139e5680bcf301...
[4/4] ufs: use dcache helpers for scsi_cmd data and only invalidate if necessary
      https://source.denx.de/u-boot/custodians/u-boot-ufs/-/commit/589a7bf0febbe4d...