netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation
author Florian Westphal <fw@strlen.de>
Thu, 20 Nov 2025 16:17:06 +0000 (17:17 +0100)
committer Florian Westphal <fw@strlen.de>
Fri, 6 Feb 2026 12:34:55 +0000 (13:34 +0100)
Ulrich reports a regression with nfqueue:

If an application did not set the 'F_GSO' capability flag and a GSO
packet with an unconfirmed nf_conn entry is received, all packets are
now dropped instead of queued, because the check happens after
skb_gso_segment().  In that case we did have exclusive ownership of the
skb and its associated conntrack entry; the elevated use count is only
due to the skb_clone() done by skb_gso_segment().
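
For reference, this is roughly how a userspace program opts into GSO
handling with libnetfilter_queue so that the kernel queues the aggregated
packet instead of segmenting it first.  This is only an illustrative
sketch, not part of this patch; the helper name open_gso_queue and the
omitted error handling are for brevity:

  #include <stdint.h>
  #include <libnetfilter_queue/libnetfilter_queue.h>

  /* Open nfqueue 'num' and advertise the F_GSO capability so the
   * kernel does not software-segment GSO skbs before queueing them.
   */
  static struct nfq_q_handle *open_gso_queue(struct nfq_handle *h,
                                             uint16_t num,
                                             nfq_callback *cb, void *data)
  {
          struct nfq_q_handle *qh = nfq_create_queue(h, num, cb, data);

          if (!qh)
                  return NULL;

          nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);
          nfq_set_queue_flags(qh, NFQA_CFG_F_GSO, NFQA_CFG_F_GSO);

          return qh;
  }

An application that cannot set this flag keeps relying on the kernel-side
skb_gso_segment() path that this patch adjusts.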

Move the check so that it is performed against the aggregated packet.

Then annotate the individual segments, except the first one, so we can
do a second check at reinject time.

For the normal case, where userspace reinjects in order, this avoids
packet drops: the first reinjected segment continues traversal and
confirms the entry, and the remaining segments then observe the
confirmed entry.
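
A typical single-threaded verdict loop produces exactly such in-order
reinjects.  Again only a sketch, assuming the queue was set up as in the
snippet above; the names accept_cb and run are illustrative:

  #include <stdint.h>
  #include <sys/types.h>
  #include <sys/socket.h>                /* recv */
  #include <arpa/inet.h>                 /* ntohl */
  #include <linux/netfilter.h>           /* NF_ACCEPT */
  #include <libnetfilter_queue/libnetfilter_queue.h>

  /* Accept every queued packet; verdicts are issued in arrival order. */
  static int accept_cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
                       struct nfq_data *nfa, void *data)
  {
          struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);

          return nfq_set_verdict(qh, ntohl(ph->packet_id), NF_ACCEPT, 0, NULL);
  }

  static void run(struct nfq_handle *h)
  {
          char buf[65536];
          ssize_t n;

          /* nfq_handle_packet() invokes accept_cb for each queued packet,
           * so reinjects happen in the order the kernel queued them.
           */
          while ((n = recv(nfq_fd(h), buf, sizeof(buf), 0)) > 0)
                  nfq_handle_packet(h, buf, (int)n);
  }

Out-of-order or multithreaded verdicts fall into the rare cases that the
reinject-time check below handles.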

While at it, simplify nf_ct_drop_unconfirmed(): we only care about
unconfirmed entries with a refcnt > 1; there is no need to special-case
dying entries.

This only happens with UDP.  With TCP, the only unconfirmed packet will
be the TCP SYN, and SYNs aren't aggregated by GRO.

Next patch adds a udpgro test case to cover this scenario.

Reported-by: Ulrich Weber <ulrich.weber@gmail.com>
Fixes: 7d8dc1c7be8d ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks")
Signed-off-by: Florian Westphal <fw@strlen.de>
include/net/netfilter/nf_queue.h
net/netfilter/nfnetlink_queue.c

index e6803831d6af519797cdfb8fe015ffb49d4620af..45eb26b2e95b37ec855b964f06888d6233b95ef3 100644 (file)
@@ -21,6 +21,7 @@ struct nf_queue_entry {
        struct net_device       *physout;
 #endif
        struct nf_hook_state    state;
+       bool                    nf_ct_is_unconfirmed;
        u16                     size; /* sizeof(entry) + saved route keys */
        u16                     queue_num;
 
index 671b52c652ef6e615d1f9d34bac3fd2643a5c155..f1c8049861a6b7c91c2d1fd0aa1ddb2db9a31d8f 100644 (file)
@@ -435,6 +435,34 @@ next_hook:
        nf_queue_entry_free(entry);
 }
 
+/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
+ * exclusively.
+ */
+static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+       struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+
+       if (!ct || nf_ct_is_confirmed(ct))
+               return false;
+
+       if (is_unconfirmed)
+               *is_unconfirmed = true;
+
+       /* in some cases skb_clone() can occur after initial conntrack
+        * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+        * unconfirmed entries.
+        *
+        * This happens for br_netfilter and with ip multicast routing.
+        * This can't be solved with serialization here because one clone
+        * could have been queued for local delivery or could be transmitted
+        * in parallel on another CPU.
+        */
+       return refcount_read(&ct->ct_general.use) > 1;
+#endif
+       return false;
+}
+
 static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
        const struct nf_ct_hook *ct_hook;
@@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
                        break;
                }
        }
+
+       if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
+               /* If first queued segment was already reinjected then
+                * there is a good chance the ct entry is now confirmed.
+                *
+                * Handle the rare cases:
+                *  - out-of-order verdict
+                *  - threaded userspace reinjecting in parallel
+                *  - first segment was dropped
+                *
+                * In all of those cases we can't handle this packet
+                * because we can't be sure that another CPU won't modify
+                * nf_conn->ext in parallel which isn't allowed.
+                */
+               if (nf_ct_drop_unconfirmed(entry, NULL))
+                       verdict = NF_DROP;
+       }
+
        nf_reinject(entry, verdict);
 }
 
@@ -891,49 +937,6 @@ nlmsg_failure:
        return NULL;
 }
 
-static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
-{
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-       static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
-       struct nf_conn *ct = (void *)skb_nfct(entry->skb);
-       unsigned long status;
-       unsigned int use;
-
-       if (!ct)
-               return false;
-
-       status = READ_ONCE(ct->status);
-       if ((status & flags) == IPS_DYING)
-               return true;
-
-       if (status & IPS_CONFIRMED)
-               return false;
-
-       /* in some cases skb_clone() can occur after initial conntrack
-        * pickup, but conntrack assumes exclusive skb->_nfct ownership for
-        * unconfirmed entries.
-        *
-        * This happens for br_netfilter and with ip multicast routing.
-        * We can't be solved with serialization here because one clone could
-        * have been queued for local delivery.
-        */
-       use = refcount_read(&ct->ct_general.use);
-       if (likely(use == 1))
-               return false;
-
-       /* Can't decrement further? Exclusive ownership. */
-       if (!refcount_dec_not_one(&ct->ct_general.use))
-               return false;
-
-       skb_set_nfct(entry->skb, 0);
-       /* No nf_ct_put(): we already decremented .use and it cannot
-        * drop down to 0.
-        */
-       return true;
-#endif
-       return false;
-}
-
 static int
 __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
                        struct nf_queue_entry *entry)
@@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
        }
        spin_lock_bh(&queue->lock);
 
-       if (nf_ct_drop_unconfirmed(entry))
-               goto err_out_free_nskb;
-
        if (queue->queue_total >= queue->queue_maxlen)
                goto err_out_queue_drop;
 
@@ -995,7 +995,6 @@ err_out_queue_drop:
                else
                        net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
        }
-err_out_free_nskb:
        kfree_skb(nskb);
 err_out_unlock:
        spin_unlock_bh(&queue->lock);
@@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
 static int
 nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 {
-       unsigned int queued;
-       struct nfqnl_instance *queue;
        struct sk_buff *skb, *segs, *nskb;
+       bool ct_is_unconfirmed = false;
+       struct nfqnl_instance *queue;
+       unsigned int queued;
        int err = -ENOBUFS;
        struct net *net = entry->state.net;
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
                break;
        }
 
+       /* Check if someone already holds another reference to
+        * unconfirmed ct.  If so, we cannot queue the skb:
+        * concurrent modifications of nf_conn->ext are not
+        * allowed and we can't know if another CPU isn't
+        * processing the same nf_conn entry in parallel.
+        */
+       if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
+               return -EINVAL;
+
        if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
                return __nfqnl_enqueue_packet(net, queue, entry);
 
@@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
                goto out_err;
        queued = 0;
        err = 0;
+
        skb_list_walk_safe(segs, segs, nskb) {
+               if (ct_is_unconfirmed && queued > 0) {
+                       /* skb_gso_segment() increments the ct refcount.
+                        * This is a problem for unconfirmed (not in hash)
+                        * entries, those can race when reinjections happen
+                        * in parallel.
+                        *
+                        * Annotate this for all queued entries except the
+                        * first one.
+                        *
+                        * As long as the first one is reinjected first it
+                        * will do the confirmation for us.
+                        */
+                       entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
+               }
+
                if (err == 0)
                        err = __nfqnl_enqueue_packet_gso(net, queue,
                                                        segs, entry);