netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation
author Florian Westphal <fw@strlen.de>
Thu, 20 Nov 2025 16:17:06 +0000 (17:17 +0100)
committer Florian Westphal <fw@strlen.de>
Fri, 6 Feb 2026 12:34:55 +0000 (13:34 +0100)
Ulrich reports a regression with nfqueue:

If an application did not set the 'F_GSO' capability flag and a GSO
packet with an unconfirmed nf_conn entry is received, all packets are
now dropped instead of queued, because the check happens after
skb_gso_segment().  In that case we did have exclusive ownership of the
skb and its associated conntrack entry; the elevated use count is only
due to the skb_clone() done by skb_gso_segment().
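
For reference, this is roughly how a userspace program opts into GSO
handling with libnetfilter_queue so that the kernel queues the aggregated
packet instead of segmenting it first.  This is only an illustrative
sketch, not part of this patch; the helper name open_gso_queue and the
omitted error handling are for brevity:

  #include <stdint.h>
  #include <libnetfilter_queue/libnetfilter_queue.h>

  /* Open nfqueue 'num' and advertise the F_GSO capability so the
   * kernel does not software-segment GSO skbs before queueing them.
   */
  static struct nfq_q_handle *open_gso_queue(struct nfq_handle *h,
                                             uint16_t num,
                                             nfq_callback *cb, void *data)
  {
          struct nfq_q_handle *qh = nfq_create_queue(h, num, cb, data);

          if (!qh)
                  return NULL;

          nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);
          nfq_set_queue_flags(qh, NFQA_CFG_F_GSO, NFQA_CFG_F_GSO);

          return qh;
  }

An application that cannot set this flag keeps relying on the kernel-side
skb_gso_segment() path that this patch adjusts.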

Move the check so that it is performed against the aggregated packet.

Then annotate the individual segments, except the first one, so we can
do a second check at reinject time.

For the normal case, where userspace reinjects in order, this avoids
packet drops: the first reinjected segment continues traversal and
confirms the entry, and the remaining segments then observe the
confirmed entry.
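
A typical single-threaded verdict loop produces exactly such in-order
reinjects.  Again only a sketch, assuming the queue was set up as in the
snippet above; the names accept_cb and run are illustrative:

  #include <stdint.h>
  #include <sys/types.h>
  #include <sys/socket.h>                /* recv */
  #include <arpa/inet.h>                 /* ntohl */
  #include <linux/netfilter.h>           /* NF_ACCEPT */
  #include <libnetfilter_queue/libnetfilter_queue.h>

  /* Accept every queued packet; verdicts are issued in arrival order. */
  static int accept_cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
                       struct nfq_data *nfa, void *data)
  {
          struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);

          return nfq_set_verdict(qh, ntohl(ph->packet_id), NF_ACCEPT, 0, NULL);
  }

  static void run(struct nfq_handle *h)
  {
          char buf[65536];
          ssize_t n;

          /* nfq_handle_packet() invokes accept_cb for each queued packet,
           * so reinjects happen in the order the kernel queued them.
           */
          while ((n = recv(nfq_fd(h), buf, sizeof(buf), 0)) > 0)
                  nfq_handle_packet(h, buf, (int)n);
  }

Out-of-order or multithreaded verdicts fall into the rare cases that the
reinject-time check below handles.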

While at it, simplify nf_ct_drop_unconfirmed(): we only care about
unconfirmed entries with a refcnt > 1; there is no need to special-case
dying entries.

This only happens with UDP.  With TCP, the only unconfirmed packet will
be the TCP SYN, and SYNs aren't aggregated by GRO.

Next patch adds a udpgro test case to cover this scenario.

Reported-by: Ulrich Weber <ulrich.weber@gmail.com>
Fixes: 7d8dc1c7be8d ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks")
Signed-off-by: Florian Westphal <fw@strlen.de>
include/net/netfilter/nf_queue.h
net/netfilter/nfnetlink_queue.c

index e6803831d6af519797cdfb8fe015ffb49d4620af..45eb26b2e95b37ec855b964f06888d6233b95ef3 100644 (file)
@@ -21,6 +21,7 @@ struct nf_queue_entry {
        struct net_device       *physout;
 #endif
        struct nf_hook_state    state;
+       bool                    nf_ct_is_unconfirmed;
        u16                     size; /* sizeof(entry) + saved route keys */
        u16                     queue_num;
 
index 671b52c652ef6e615d1f9d34bac3fd2643a5c155..f1c8049861a6b7c91c2d1fd0aa1ddb2db9a31d8f 100644 (file)
@@ -435,6 +435,34 @@ next_hook:
        nf_queue_entry_free(entry);
 }
 
+/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
+ * exclusively.
+ */
+static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+       struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+
+       if (!ct || nf_ct_is_confirmed(ct))
+               return false;
+
+       if (is_unconfirmed)
+               *is_unconfirmed = true;
+
+       /* in some cases skb_clone() can occur after initial conntrack
+        * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+        * unconfirmed entries.
+        *
+        * This happens for br_netfilter and with ip multicast routing.
+        * This can't be solved with serialization here because one clone
+        * could have been queued for local delivery or could be transmitted
+        * in parallel on another CPU.
+        */
+       return refcount_read(&ct->ct_general.use) > 1;
+#endif
+       return false;
+}
+
 static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
        const struct nf_ct_hook *ct_hook;
@@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
                        break;
                }
        }
+
+       if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
+               /* If first queued segment was already reinjected then
+                * there is a good chance the ct entry is now confirmed.
+                *
+                * Handle the rare cases:
+                *  - out-of-order verdict
+                *  - threaded userspace reinjecting in parallel
+                *  - first segment was dropped
+                *
+                * In all of those cases we can't handle this packet
+                * because we can't be sure that another CPU won't modify
+                * nf_conn->ext in parallel which isn't allowed.
+                */
+               if (nf_ct_drop_unconfirmed(entry, NULL))
+                       verdict = NF_DROP;
+       }
+
        nf_reinject(entry, verdict);
 }
 
@@ -891,49 +937,6 @@ nlmsg_failure:
        return NULL;
 }
 
-static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
-{
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-       static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
-       struct nf_conn *ct = (void *)skb_nfct(entry->skb);
-       unsigned long status;
-       unsigned int use;
-
-       if (!ct)
-               return false;
-
-       status = READ_ONCE(ct->status);
-       if ((status & flags) == IPS_DYING)
-               return true;
-
-       if (status & IPS_CONFIRMED)
-               return false;
-
-       /* in some cases skb_clone() can occur after initial conntrack
-        * pickup, but conntrack assumes exclusive skb->_nfct ownership for
-        * unconfirmed entries.
-        *
-        * This happens for br_netfilter and with ip multicast routing.
-        * We can't be solved with serialization here because one clone could
-        * have been queued for local delivery.
-        */
-       use = refcount_read(&ct->ct_general.use);
-       if (likely(use == 1))
-               return false;
-
-       /* Can't decrement further? Exclusive ownership. */
-       if (!refcount_dec_not_one(&ct->ct_general.use))
-               return false;
-
-       skb_set_nfct(entry->skb, 0);
-       /* No nf_ct_put(): we already decremented .use and it cannot
-        * drop down to 0.
-        */
-       return true;
-#endif
-       return false;
-}
-
 static int
 __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
                        struct nf_queue_entry *entry)
@@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
        }
        spin_lock_bh(&queue->lock);
 
-       if (nf_ct_drop_unconfirmed(entry))
-               goto err_out_free_nskb;
-
        if (queue->queue_total >= queue->queue_maxlen)
                goto err_out_queue_drop;
 
@@ -995,7 +995,6 @@ err_out_queue_drop:
                else
                        net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
        }
-err_out_free_nskb:
        kfree_skb(nskb);
 err_out_unlock:
        spin_unlock_bh(&queue->lock);
@@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
 static int
 nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 {
-       unsigned int queued;
-       struct nfqnl_instance *queue;
        struct sk_buff *skb, *segs, *nskb;
+       bool ct_is_unconfirmed = false;
+       struct nfqnl_instance *queue;
+       unsigned int queued;
        int err = -ENOBUFS;
        struct net *net = entry->state.net;
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
                break;
        }
 
+       /* Check if someone already holds another reference to
+        * unconfirmed ct.  If so, we cannot queue the skb:
+        * concurrent modifications of nf_conn->ext are not
+        * allowed and we can't know if another CPU isn't
+        * processing the same nf_conn entry in parallel.
+        */
+       if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
+               return -EINVAL;
+
        if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
                return __nfqnl_enqueue_packet(net, queue, entry);
 
@@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
                goto out_err;
        queued = 0;
        err = 0;
+
        skb_list_walk_safe(segs, segs, nskb) {
+               if (ct_is_unconfirmed && queued > 0) {
+                       /* skb_gso_segment() increments the ct refcount.
+                        * This is a problem for unconfirmed (not in hash)
+                        * entries, those can race when reinjections happen
+                        * in parallel.
+                        *
+                        * Annotate this for all queued entries except the
+                        * first one.
+                        *
+                        * As long as the first one is reinjected first it
+                        * will do the confirmation for us.
+                        */
+                       entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
+               }
+
                if (err == 0)
                        err = __nfqnl_enqueue_packet_gso(net, queue,
                                                        segs, entry);