]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.10-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 3 Nov 2021 09:58:33 +0000 (10:58 +0100)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 3 Nov 2021 09:58:33 +0000 (10:58 +0100)
added patches:
media-firewire-firedtv-avc-fix-a-buffer-overflow-in-avc_ca_pmt.patch
mm-filemap-check-if-thp-has-hwpoisoned-subpage-for-pmd-page-fault.patch
mm-hwpoison-remove-the-unnecessary-thp-check.patch
net-ethernet-microchip-lan743x-fix-skb-allocation-failure.patch
sfc-fix-reading-non-legacy-supported-link-modes.patch
vrf-revert-reset-skb-conntrack-connection.patch

queue-5.10/media-firewire-firedtv-avc-fix-a-buffer-overflow-in-avc_ca_pmt.patch [new file with mode: 0644]
queue-5.10/mm-filemap-check-if-thp-has-hwpoisoned-subpage-for-pmd-page-fault.patch [new file with mode: 0644]
queue-5.10/mm-hwpoison-remove-the-unnecessary-thp-check.patch [new file with mode: 0644]
queue-5.10/net-ethernet-microchip-lan743x-fix-skb-allocation-failure.patch [new file with mode: 0644]
queue-5.10/series
queue-5.10/sfc-fix-reading-non-legacy-supported-link-modes.patch [new file with mode: 0644]
queue-5.10/vrf-revert-reset-skb-conntrack-connection.patch [new file with mode: 0644]

diff --git a/queue-5.10/media-firewire-firedtv-avc-fix-a-buffer-overflow-in-avc_ca_pmt.patch b/queue-5.10/media-firewire-firedtv-avc-fix-a-buffer-overflow-in-avc_ca_pmt.patch
new file mode 100644 (file)
index 0000000..d6e4d7c
--- /dev/null
@@ -0,0 +1,84 @@
+From 35d2969ea3c7d32aee78066b1f3cf61a0d935a4e Mon Sep 17 00:00:00 2001
+From: Dan Carpenter <dan.carpenter@oracle.com>
+Date: Mon, 7 Jun 2021 17:23:48 +0200
+Subject: media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
+
+From: Dan Carpenter <dan.carpenter@oracle.com>
+
+commit 35d2969ea3c7d32aee78066b1f3cf61a0d935a4e upstream.
+
+The bounds checking in avc_ca_pmt() is not strict enough.  It should
+be checking "read_pos + 4" because it's reading 5 bytes.  If the
+"es_info_length" is non-zero then it reads a 6th byte so there needs to
+be an additional check for that.
+
+I also added checks for the "write_pos".  I don't think these are
+required because "read_pos" and "write_pos" are tied together so
+checking one ought to be enough.  But they make the code easier to
+understand for me.  The check on write_pos is:
+
+       if (write_pos + 4 >= sizeof(c->operand) - 4) {
+
+The first "+ 4" is because we're writing 5 bytes and the last " - 4"
+is to leave space for the CRC.
+
+The other problem is that "length" can be invalid.  It comes from
+"data_length" in fdtv_ca_pmt().
+
+Cc: stable@vger.kernel.org
+Reported-by: Luo Likang <luolikang@nsfocus.com>
+Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
+Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/media/firewire/firedtv-avc.c |   14 +++++++++++---
+ drivers/media/firewire/firedtv-ci.c  |    2 ++
+ 2 files changed, 13 insertions(+), 3 deletions(-)
+
+--- a/drivers/media/firewire/firedtv-avc.c
++++ b/drivers/media/firewire/firedtv-avc.c
+@@ -1165,7 +1165,11 @@ int avc_ca_pmt(struct firedtv *fdtv, cha
+               read_pos += program_info_length;
+               write_pos += program_info_length;
+       }
+-      while (read_pos < length) {
++      while (read_pos + 4 < length) {
++              if (write_pos + 4 >= sizeof(c->operand) - 4) {
++                      ret = -EINVAL;
++                      goto out;
++              }
+               c->operand[write_pos++] = msg[read_pos++];
+               c->operand[write_pos++] = msg[read_pos++];
+               c->operand[write_pos++] = msg[read_pos++];
+@@ -1177,13 +1181,17 @@ int avc_ca_pmt(struct firedtv *fdtv, cha
+               c->operand[write_pos++] = es_info_length >> 8;
+               c->operand[write_pos++] = es_info_length & 0xff;
+               if (es_info_length > 0) {
++                      if (read_pos >= length) {
++                              ret = -EINVAL;
++                              goto out;
++                      }
+                       pmt_cmd_id = msg[read_pos++];
+                       if (pmt_cmd_id != 1 && pmt_cmd_id != 4)
+                               dev_err(fdtv->device, "invalid pmt_cmd_id %d at stream level\n",
+                                       pmt_cmd_id);
+-                      if (es_info_length > sizeof(c->operand) - 4 -
+-                                           write_pos) {
++                      if (es_info_length > sizeof(c->operand) - 4 - write_pos ||
++                          es_info_length > length - read_pos) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+--- a/drivers/media/firewire/firedtv-ci.c
++++ b/drivers/media/firewire/firedtv-ci.c
+@@ -134,6 +134,8 @@ static int fdtv_ca_pmt(struct firedtv *f
+       } else {
+               data_length = msg->msg[3];
+       }
++      if (data_length > sizeof(msg->msg) - data_pos)
++              return -EINVAL;
+       return avc_ca_pmt(fdtv, &msg->msg[data_pos], data_length);
+ }
diff --git a/queue-5.10/mm-filemap-check-if-thp-has-hwpoisoned-subpage-for-pmd-page-fault.patch b/queue-5.10/mm-filemap-check-if-thp-has-hwpoisoned-subpage-for-pmd-page-fault.patch
new file mode 100644 (file)
index 0000000..b6d18a8
--- /dev/null
@@ -0,0 +1,165 @@
+From eac96c3efdb593df1a57bb5b95dbe037bfa9a522 Mon Sep 17 00:00:00 2001
+From: Yang Shi <shy828301@gmail.com>
+Date: Thu, 28 Oct 2021 14:36:11 -0700
+Subject: mm: filemap: check if THP has hwpoisoned subpage for PMD page fault
+
+From: Yang Shi <shy828301@gmail.com>
+
+commit eac96c3efdb593df1a57bb5b95dbe037bfa9a522 upstream.
+
+When handling shmem page fault the THP with corrupted subpage could be
+PMD mapped if certain conditions are satisfied.  But kernel is supposed
+to send SIGBUS when trying to map hwpoisoned page.
+
+There are two paths which may do PMD map: fault around and regular
+fault.
+
+Before commit f9ce0be71d1f ("mm: Cleanup faultaround and finish_fault()
+codepaths") the thing was even worse in fault around path.  The THP
+could be PMD mapped as long as the VMA fits regardless of which subpage is
+accessed and corrupted.  After this commit as long as head page is not
+corrupted the THP could be PMD mapped.
+
+In the regular fault path the THP could be PMD mapped as long as the
+corrupted page is not accessed and the VMA fits.
+
+This loophole could be fixed by iterating every subpage to check if any
+of them is hwpoisoned or not, but it is somewhat costly in page fault
+path.
+
+So introduce a new page flag called HasHWPoisoned on the first tail
+page.  It indicates the THP has hwpoisoned subpage(s).  It is set if any
+subpage of THP is found hwpoisoned by memory failure and after the
+refcount is bumped successfully, then cleared when the THP is freed or
+split.
+
+The soft offline path doesn't need this since soft offline handler just
+marks a subpage hwpoisoned when the subpage is migrated successfully.
+But shmem THP didn't get split then migrated at all.
+
+Link: https://lkml.kernel.org/r/20211020210755.23964-3-shy828301@gmail.com
+Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
+Signed-off-by: Yang Shi <shy828301@gmail.com>
+Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Suggested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/page-flags.h |   23 +++++++++++++++++++++++
+ mm/huge_memory.c           |    2 ++
+ mm/memory-failure.c        |   14 ++++++++++++++
+ mm/memory.c                |    9 +++++++++
+ mm/page_alloc.c            |    4 +++-
+ 5 files changed, 51 insertions(+), 1 deletion(-)
+
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -169,6 +169,15 @@ enum pageflags {
+       /* Compound pages. Stored in first tail page's flags */
+       PG_double_map = PG_workingset,
++#ifdef CONFIG_MEMORY_FAILURE
++      /*
++       * Compound pages. Stored in first tail page's flags.
++       * Indicates that at least one subpage is hwpoisoned in the
++       * THP.
++       */
++      PG_has_hwpoisoned = PG_mappedtodisk,
++#endif
++
+       /* non-lru isolated movable page */
+       PG_isolated = PG_reclaim,
+@@ -667,6 +676,20 @@ static inline int PageTransCompoundMap(s
+              atomic_read(compound_mapcount_ptr(head));
+ }
++#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
++/*
++ * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
++ * compound page.
++ *
++ * This flag is set by hwpoison handler.  Cleared by THP split or free page.
++ */
++PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
++      TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
++#else
++PAGEFLAG_FALSE(HasHWPoisoned)
++      TESTSCFLAG_FALSE(HasHWPoisoned)
++#endif
++
+ /*
+  * PageTransTail returns true for both transparent huge pages
+  * and hugetlbfs pages, so it should only be called when it's known
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2464,6 +2464,8 @@ static void __split_huge_page(struct pag
+               xa_lock(&swap_cache->i_pages);
+       }
++      ClearPageHasHWPoisoned(head);
++
+       for (i = nr - 1; i >= 1; i--) {
+               __split_huge_page_tail(head, i, lruvec, list);
+               /* Some pages can be beyond i_size: drop them from page cache */
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1367,6 +1367,20 @@ int memory_failure(unsigned long pfn, in
+       }
+       if (PageTransHuge(hpage)) {
++              /*
++               * The flag must be set after the refcount is bumped
++               * otherwise it may race with THP split.
++               * And the flag can't be set in get_hwpoison_page() since
++               * it is called by soft offline too and it is just called
++               * for !MF_COUNT_INCREASE.  So here seems to be the best
++               * place.
++               *
++               * Don't need care about the above error handling paths for
++               * get_hwpoison_page() since they handle either free page
++               * or unhandlable page.  The refcount is bumped iff the
++               * page is a valid handlable page.
++               */
++              SetPageHasHWPoisoned(hpage);
+               if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+                       action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+                       return -EBUSY;
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3921,6 +3921,15 @@ vm_fault_t finish_fault(struct vm_fault
+               page = vmf->page;
+       /*
++       * Just backoff if any subpage of a THP is corrupted otherwise
++       * the corrupted page may mapped by PMD silently to escape the
++       * check.  This kind of THP just can be PTE mapped.  Access to
++       * the corrupted subpage should trigger SIGBUS as expected.
++       */
++      if (unlikely(PageHasHWPoisoned(page)))
++              return ret;
++
++      /*
+        * check even for read faults because we might have lost our CoWed
+        * page
+        */
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1232,8 +1232,10 @@ static __always_inline bool free_pages_p
+               VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+-              if (compound)
++              if (compound) {
+                       ClearPageDoubleMap(page);
++                      ClearPageHasHWPoisoned(page);
++              }
+               for (i = 1; i < (1 << order); i++) {
+                       if (compound)
+                               bad += free_tail_pages_check(page, page + i);
diff --git a/queue-5.10/mm-hwpoison-remove-the-unnecessary-thp-check.patch b/queue-5.10/mm-hwpoison-remove-the-unnecessary-thp-check.patch
new file mode 100644 (file)
index 0000000..361af16
--- /dev/null
@@ -0,0 +1,63 @@
+From c7cb42e94473aafe553c0f2a3d8ca904599399ed Mon Sep 17 00:00:00 2001
+From: Yang Shi <shy828301@gmail.com>
+Date: Thu, 28 Oct 2021 14:36:07 -0700
+Subject: mm: hwpoison: remove the unnecessary THP check
+
+From: Yang Shi <shy828301@gmail.com>
+
+commit c7cb42e94473aafe553c0f2a3d8ca904599399ed upstream.
+
+When handling a THP, hwpoison checked whether the THP was in the allocation
+or free stage, since hwpoison may mistreat it as a hugetlb page.  After commit
+415c64c1453a ("mm/memory-failure: split thp earlier in memory error
+handling") the problem has been fixed, so this check is no longer
+needed.  Remove it.  The side effect of the removal is hwpoison may
+report unsplit THP instead of unknown error for shmem THP.  It seems not
+like a big deal.
+
+The following patch "mm: filemap: check if THP has hwpoisoned subpage
+for PMD page fault" depends on this one; it fixes shmem THPs with
+hwpoisoned subpage(s) being wrongly PMD mapped.  So this patch needs to be
+backported to -stable as well.
+
+Link: https://lkml.kernel.org/r/20211020210755.23964-2-shy828301@gmail.com
+Signed-off-by: Yang Shi <shy828301@gmail.com>
+Suggested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c |   14 --------------
+ 1 file changed, 14 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -956,20 +956,6 @@ static int get_hwpoison_page(struct page
+ {
+       struct page *head = compound_head(page);
+-      if (!PageHuge(head) && PageTransHuge(head)) {
+-              /*
+-               * Non anonymous thp exists only in allocation/free time. We
+-               * can't handle such a case correctly, so let's give it up.
+-               * This should be better than triggering BUG_ON when kernel
+-               * tries to touch the "partially handled" page.
+-               */
+-              if (!PageAnon(head)) {
+-                      pr_err("Memory failure: %#lx: non anonymous thp\n",
+-                              page_to_pfn(page));
+-                      return 0;
+-              }
+-      }
+-
+       if (get_page_unless_zero(head)) {
+               if (head == compound_head(page))
+                       return 1;
diff --git a/queue-5.10/net-ethernet-microchip-lan743x-fix-skb-allocation-failure.patch b/queue-5.10/net-ethernet-microchip-lan743x-fix-skb-allocation-failure.patch
new file mode 100644 (file)
index 0000000..08dc381
--- /dev/null
@@ -0,0 +1,59 @@
+From e8684db191e4164f3f5f3ad7dec04a6734c25f1c Mon Sep 17 00:00:00 2001
+From: Yuiko Oshino <yuiko.oshino@microchip.com>
+Date: Wed, 27 Oct 2021 14:23:02 -0400
+Subject: net: ethernet: microchip: lan743x: Fix skb allocation failure
+
+From: Yuiko Oshino <yuiko.oshino@microchip.com>
+
+commit e8684db191e4164f3f5f3ad7dec04a6734c25f1c upstream.
+
+The driver allocates skbs during ndo_open with GFP_ATOMIC, which has a high chance of failure when there are multiple instances.
+GFP_KERNEL is sufficient during open; use GFP_ATOMIC only from interrupt context.
+
+Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver")
+Signed-off-by: Yuiko Oshino <yuiko.oshino@microchip.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/ethernet/microchip/lan743x_main.c |   10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/ethernet/microchip/lan743x_main.c
++++ b/drivers/net/ethernet/microchip/lan743x_main.c
+@@ -1963,13 +1963,13 @@ static int lan743x_rx_next_index(struct
+       return ((++index) % rx->ring_size);
+ }
+-static struct sk_buff *lan743x_rx_allocate_skb(struct lan743x_rx *rx)
++static struct sk_buff *lan743x_rx_allocate_skb(struct lan743x_rx *rx, gfp_t gfp)
+ {
+       int length = 0;
+       length = (LAN743X_MAX_FRAME_SIZE + ETH_HLEN + 4 + RX_HEAD_PADDING);
+       return __netdev_alloc_skb(rx->adapter->netdev,
+-                                length, GFP_ATOMIC | GFP_DMA);
++                                length, gfp);
+ }
+ static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index)
+@@ -2141,7 +2141,8 @@ static int lan743x_rx_process_packet(str
+                       struct sk_buff *new_skb = NULL;
+                       int packet_length;
+-                      new_skb = lan743x_rx_allocate_skb(rx);
++                      new_skb = lan743x_rx_allocate_skb(rx,
++                                                        GFP_ATOMIC | GFP_DMA);
+                       if (!new_skb) {
+                               /* failed to allocate next skb.
+                                * Memory is very low.
+@@ -2377,7 +2378,8 @@ static int lan743x_rx_ring_init(struct l
+       rx->last_head = 0;
+       for (index = 0; index < rx->ring_size; index++) {
+-              struct sk_buff *new_skb = lan743x_rx_allocate_skb(rx);
++              struct sk_buff *new_skb = lan743x_rx_allocate_skb(rx,
++                                                                 GFP_KERNEL);
+               ret = lan743x_rx_init_ring_element(rx, index, new_skb);
+               if (ret)
index 34866132c9e361629814655c7c081978c1769917..e1733dac28d08c9642b179316169d4840bcceb43 100644 (file)
@@ -1,2 +1,8 @@
 scsi-core-put-lld-module-refcnt-after-scsi-device-is-released.patch
 revert-io_uring-reinforce-cancel-on-flush-during-exit.patch
+sfc-fix-reading-non-legacy-supported-link-modes.patch
+vrf-revert-reset-skb-conntrack-connection.patch
+net-ethernet-microchip-lan743x-fix-skb-allocation-failure.patch
+mm-hwpoison-remove-the-unnecessary-thp-check.patch
+mm-filemap-check-if-thp-has-hwpoisoned-subpage-for-pmd-page-fault.patch
+media-firewire-firedtv-avc-fix-a-buffer-overflow-in-avc_ca_pmt.patch
diff --git a/queue-5.10/sfc-fix-reading-non-legacy-supported-link-modes.patch b/queue-5.10/sfc-fix-reading-non-legacy-supported-link-modes.patch
new file mode 100644 (file)
index 0000000..ff44194
--- /dev/null
@@ -0,0 +1,47 @@
+From 041c61488236a5a84789083e3d9f0a51139b6edf Mon Sep 17 00:00:00 2001
+From: Erik Ekman <erik@kryo.se>
+Date: Sun, 17 Oct 2021 19:16:57 +0200
+Subject: sfc: Fix reading non-legacy supported link modes
+
+From: Erik Ekman <erik@kryo.se>
+
+commit 041c61488236a5a84789083e3d9f0a51139b6edf upstream.
+
+Everything except the first 32 bits was lost when the pause flags were
+added. This makes the 50000baseCR2 mode flag (bit 34) not appear.
+
+I have tested this with a 10G card (SFN5122F-R7) by modifying it to
+return a non-legacy link mode (10000baseCR).
+
+Signed-off-by: Erik Ekman <erik@kryo.se>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/sfc/ethtool_common.c |   10 ++--------
+ 1 file changed, 2 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/ethernet/sfc/ethtool_common.c
++++ b/drivers/net/ethernet/sfc/ethtool_common.c
+@@ -563,20 +563,14 @@ int efx_ethtool_get_link_ksettings(struc
+ {
+       struct efx_nic *efx = netdev_priv(net_dev);
+       struct efx_link_state *link_state = &efx->link_state;
+-      u32 supported;
+       mutex_lock(&efx->mac_lock);
+       efx_mcdi_phy_get_link_ksettings(efx, cmd);
+       mutex_unlock(&efx->mac_lock);
+       /* Both MACs support pause frames (bidirectional and respond-only) */
+-      ethtool_convert_link_mode_to_legacy_u32(&supported,
+-                                              cmd->link_modes.supported);
+-
+-      supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+-
+-      ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
+-                                              supported);
++      ethtool_link_ksettings_add_link_mode(cmd, supported, Pause);
++      ethtool_link_ksettings_add_link_mode(cmd, supported, Asym_Pause);
+       if (LOOPBACK_INTERNAL(efx)) {
+               cmd->base.speed = link_state->speed;
diff --git a/queue-5.10/vrf-revert-reset-skb-conntrack-connection.patch b/queue-5.10/vrf-revert-reset-skb-conntrack-connection.patch
new file mode 100644 (file)
index 0000000..6f4f53b
--- /dev/null
@@ -0,0 +1,130 @@
+From 55161e67d44fdd23900be166a81e996abd6e3be9 Mon Sep 17 00:00:00 2001
+From: Eugene Crosser <crosser@average.org>
+Date: Mon, 18 Oct 2021 20:22:50 +0200
+Subject: vrf: Revert "Reset skb conntrack connection..."
+
+From: Eugene Crosser <crosser@average.org>
+
+commit 55161e67d44fdd23900be166a81e996abd6e3be9 upstream.
+
+This reverts commit 09e856d54bda5f288ef8437a90ab2b9b3eab83d1.
+
+When an interface is enslaved in a VRF, prerouting conntrack hook is
+called twice: once in the context of the original input interface, and
+once in the context of the VRF interface. If no special precautions are
+taken, this leads to creation of two conntrack entries instead of one,
+and breaks SNAT.
+
+Commit above was intended to avoid creation of extra conntrack entries
+when input interface is enslaved in a VRF. It did so by resetting
+conntrack related data associated with the skb when it enters VRF context.
+
+However it breaks netfilter operation. Imagine a use case when conntrack
+zone must be assigned based on the original input interface, rather than
+VRF interface (that would make original interfaces indistinguishable). One
+could create netfilter rules similar to these:
+
+        chain rawprerouting {
+                type filter hook prerouting priority raw;
+                iif realiface1 ct zone set 1 return
+                iif realiface2 ct zone set 2 return
+        }
+
+This works before the mentioned commit, but not after: zone assignment
+is "forgotten", and any subsequent NAT or filtering that is dependent
+on the conntrack zone does not work.
+
+Here is a reproducer script that demonstrates the difference in behaviour.
+
+==========
+#!/bin/sh
+
+# This script demonstrates unexpected change of nftables behaviour
+# caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack
+# connection on VRF rcv"
+# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=09e856d54bda5f288ef8437a90ab2b9b3eab83d1
+#
+# Before the commit, it was possible to assign conntrack zone to a
+# packet (or mark it for `notracking`) in the prerouting chain, raw
+# priority, based on the `iif` (interface from which the packet
+# arrived).
+# After the change, if the interface is enslaved in a VRF, such
+# assignment is lost. Instead, assignment based on the `iif` matching
+# the VRF master interface is honored. Thus it is impossible to
+# distinguish packets based on the original interface.
+#
+# This script demonstrates this change of behaviour: conntrack zone 1
+# or 2 is assigned depending on the match with the original interface
+# or the vrf master interface. It can be observed that conntrack entry
+# appears in different zone in the kernel versions before and after
+# the commit.
+
+IPIN=172.30.30.1
+IPOUT=172.30.30.2
+PFXL=30
+
+ip li sh vein >/dev/null 2>&1 && ip li del vein
+ip li sh tvrf >/dev/null 2>&1 && ip li del tvrf
+nft list table testct >/dev/null 2>&1 && nft delete table testct
+
+ip li add vein type veth peer veout
+ip li add tvrf type vrf table 9876
+ip li set veout master tvrf
+ip li set vein up
+ip li set veout up
+ip li set tvrf up
+/sbin/sysctl -w net.ipv4.conf.veout.accept_local=1
+/sbin/sysctl -w net.ipv4.conf.veout.rp_filter=0
+ip addr add $IPIN/$PFXL dev vein
+ip addr add $IPOUT/$PFXL dev veout
+
+nft -f - <<__END__
+table testct {
+       chain rawpre {
+               type filter hook prerouting priority raw;
+               iif { veout, tvrf } meta nftrace set 1
+               iif veout ct zone set 1 return
+               iif tvrf ct zone set 2 return
+               notrack
+       }
+       chain rawout {
+               type filter hook output priority raw;
+               notrack
+       }
+}
+__END__
+
+uname -rv
+conntrack -F
+ping -W 1 -c 1 -I vein $IPOUT
+conntrack -L
+
+Signed-off-by: Eugene Crosser <crosser@average.org>
+Acked-by: David Ahern <dsahern@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Cc: Florian Westphal <fw@strlen.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vrf.c |    4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/drivers/net/vrf.c
++++ b/drivers/net/vrf.c
+@@ -1313,8 +1313,6 @@ static struct sk_buff *vrf_ip6_rcv(struc
+       bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
+       bool is_ndisc = ipv6_ndisc_frame(skb);
+-      nf_reset_ct(skb);
+-
+       /* loopback, multicast & non-ND link-local traffic; do not push through
+        * packet taps again. Reset pkt_type for upper layers to process skb.
+        * For strict packets with a source LLA, determine the dst using the
+@@ -1371,8 +1369,6 @@ static struct sk_buff *vrf_ip_rcv(struct
+       skb->skb_iif = vrf_dev->ifindex;
+       IPCB(skb)->flags |= IPSKB_L3SLAVE;
+-      nf_reset_ct(skb);
+-
+       if (ipv4_is_multicast(ip_hdr(skb)->daddr))
+               goto out;