--- /dev/null
+From 8c4e0f212398cdd1eb4310a5981d06a723cdd24f Mon Sep 17 00:00:00 2001
+From: Bodo Stroesser <bstroesser@ts.fujitsu.com>
+Date: Thu, 28 May 2020 21:31:08 +0200
+Subject: scsi: target: tcmu: Fix size in calls to tcmu_flush_dcache_range
+
+From: Bodo Stroesser <bstroesser@ts.fujitsu.com>
+
+commit 8c4e0f212398cdd1eb4310a5981d06a723cdd24f upstream.
+
+1) If remaining ring space before the end of the ring is smaller than the
+ next cmd to write, tcmu writes a padding entry which fills the remaining
+ space at the end of the ring.
+
+ Then tcmu calls tcmu_flush_dcache_range() with the size of struct
+ tcmu_cmd_entry as data length to flush. If the space filled by the
+ padding was smaller than tcmu_cmd_entry, tcmu_flush_dcache_range() is
+ called for an address range reaching behind the end of the vmalloc'ed
+ ring.
+
+ tcmu_flush_dcache_range() in a loop calls
+ flush_dcache_page(virt_to_page(start)); for every page being part of the
+ range. On x86 the line is optimized out by the compiler, as
+ flush_dcache_page() is empty on x86.
+
+ But I assume the above can cause trouble on other architectures that
+ really have a flush_dcache_page(). For paddings only the header part of
+ an entry is relevant; due to alignment rules the header always fits in
+ the remaining space, if padding is needed. So tcmu_flush_dcache_range()
+ can safely be called with sizeof(entry->hdr) as the length here.
+
+2) After it has written a command to cmd ring, tcmu calls
+ tcmu_flush_dcache_range() using the size of a struct tcmu_cmd_entry as
+ data length to flush. But if a command needs many iovecs, the real size
+ of the command may be bigger than tcmu_cmd_entry, so a part of the
+ written command is not flushed then.
+
+Link: https://lore.kernel.org/r/20200528193108.9085-1-bstroesser@ts.fujitsu.com
+Acked-by: Mike Christie <michael.christie@oracle.com>
+Signed-off-by: Bodo Stroesser <bstroesser@ts.fujitsu.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/target/target_core_user.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/target/target_core_user.c
++++ b/drivers/target/target_core_user.c
+@@ -1018,7 +1018,7 @@ static int queue_cmd_ring(struct tcmu_cm
+ entry->hdr.cmd_id = 0; /* not used for PAD */
+ entry->hdr.kflags = 0;
+ entry->hdr.uflags = 0;
+- tcmu_flush_dcache_range(entry, sizeof(*entry));
++ tcmu_flush_dcache_range(entry, sizeof(entry->hdr));
+
+ UPDATE_HEAD(mb->cmd_head, pad_size, udev->cmdr_size);
+ tcmu_flush_dcache_range(mb, sizeof(*mb));
+@@ -1083,7 +1083,7 @@ static int queue_cmd_ring(struct tcmu_cm
+ cdb_off = CMDR_OFF + cmd_head + base_command_size;
+ memcpy((void *) mb + cdb_off, se_cmd->t_task_cdb, scsi_command_size(se_cmd->t_task_cdb));
+ entry->req.cdb_off = cdb_off;
+- tcmu_flush_dcache_range(entry, sizeof(*entry));
++ tcmu_flush_dcache_range(entry, command_size);
+
+ UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size);
+ tcmu_flush_dcache_range(mb, sizeof(*mb));
--- /dev/null
+From 3c58f737231e2c8cbf543a09d84d8c8e80e05e43 Mon Sep 17 00:00:00 2001
+From: Bodo Stroesser <bstroesser@ts.fujitsu.com>
+Date: Thu, 18 Jun 2020 15:16:31 +0200
+Subject: scsi: target: tcmu: Optimize use of flush_dcache_page
+
+From: Bodo Stroesser <bstroesser@ts.fujitsu.com>
+
+commit 3c58f737231e2c8cbf543a09d84d8c8e80e05e43 upstream.
+
+(scatter|gather)_data_area() need to flush dcache after writing data to or
+before reading data from a page in uio data area. The two routines are
+able to handle data transfer to/from such a page in fragments and flush the
+cache after each fragment was copied by calling the wrapper
+tcmu_flush_dcache_range().
+
+That means:
+
+1) flush_dcache_page() can be called multiple times for the same page.
+
+2) Calling flush_dcache_page() indirectly using the wrapper does not make
+ sense, because each call of the wrapper is for one single page only and
+ the calling routine already has the correct page pointer.
+
+Change (scatter|gather)_data_area() such that, instead of calling
+tcmu_flush_dcache_range() before/after each memcpy, it now calls
+flush_dcache_page() before unmapping a page (when writing is complete for
+that page) or after mapping a page (when starting to read the page).
+
+After this change only calls to tcmu_flush_dcache_range() for addresses in
+vmalloc'ed command ring are left over.
+
+The patch was tested on ARM with kernels 4.19.118 and 5.7.2.
+
+Link: https://lore.kernel.org/r/20200618131632.32748-2-bstroesser@ts.fujitsu.com
+Tested-by: JiangYu <lnsyyj@hotmail.com>
+Tested-by: Daniel Meyerholt <dxm523@gmail.com>
+Acked-by: Mike Christie <michael.christie@oracle.com>
+Signed-off-by: Bodo Stroesser <bstroesser@ts.fujitsu.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/target/target_core_user.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/drivers/target/target_core_user.c
++++ b/drivers/target/target_core_user.c
+@@ -687,8 +687,10 @@ static void scatter_data_area(struct tcm
+ from = kmap_atomic(sg_page(sg)) + sg->offset;
+ while (sg_remaining > 0) {
+ if (block_remaining == 0) {
+- if (to)
++ if (to) {
++ flush_dcache_page(page);
+ kunmap_atomic(to);
++ }
+
+ block_remaining = DATA_BLOCK_SIZE;
+ dbi = tcmu_cmd_get_dbi(tcmu_cmd);
+@@ -733,7 +735,6 @@ static void scatter_data_area(struct tcm
+ memcpy(to + offset,
+ from + sg->length - sg_remaining,
+ copy_bytes);
+- tcmu_flush_dcache_range(to, copy_bytes);
+ }
+
+ sg_remaining -= copy_bytes;
+@@ -742,8 +743,10 @@ static void scatter_data_area(struct tcm
+ kunmap_atomic(from - sg->offset);
+ }
+
+- if (to)
++ if (to) {
++ flush_dcache_page(page);
+ kunmap_atomic(to);
++ }
+ }
+
+ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
+@@ -789,13 +792,13 @@ static void gather_data_area(struct tcmu
+ dbi = tcmu_cmd_get_dbi(cmd);
+ page = tcmu_get_block_page(udev, dbi);
+ from = kmap_atomic(page);
++ flush_dcache_page(page);
+ }
+ copy_bytes = min_t(size_t, sg_remaining,
+ block_remaining);
+ if (read_len < copy_bytes)
+ copy_bytes = read_len;
+ offset = DATA_BLOCK_SIZE - block_remaining;
+- tcmu_flush_dcache_range(from, copy_bytes);
+ memcpy(to + sg->length - sg_remaining, from + offset,
+ copy_bytes);
+