bpf, sockmap: Fix FIONREAD for sockmap
author     Jiayuan Chen <jiayuan.chen@linux.dev>
           Sat, 24 Jan 2026 11:32:44 +0000 (19:32 +0800)
committer  Alexei Starovoitov <ast@kernel.org>
           Tue, 27 Jan 2026 17:11:30 +0000 (09:11 -0800)
A socket using sockmap has its own independent receive queue: ingress_msg.
This queue may contain data from its own protocol stack or from other
sockets.

Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to
calculate FIONREAD is not enough.
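
For reference, plain TCP answers FIONREAD from those two sequence counters
alone, roughly as in the simplified sketch below (modelled on tcp_inq(); it
is an illustration, not code from this patch):

  #include <net/tcp.h>

  /* Simplified view of how plain TCP computes FIONREAD/SIOCINQ: bytes
   * received by the stack (rcv_nxt) minus bytes already copied to
   * userspace (copied_seq). Data parked in the psock's ingress_msg list
   * is invisible to this calculation.
   */
  static int tcp_inq_sketch(struct sock *sk)
  {
          const struct tcp_sock *tp = tcp_sk(sk);

          if (sk->sk_state == TCP_LISTEN)
                  return -EINVAL;

          return READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
  }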

This patch adds a new msg_tot_len field to the psock structure to track the
total number of bytes queued in ingress_msg. Additionally, we implement new
ioctl handlers for TCP and UDP to intercept FIONREAD operations.
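
From userspace nothing changes in how the count is queried; the usual
FIONREAD ioctl simply starts reporting the bytes queued in ingress_msg.
Illustrative only (the helper name below is made up):

  #include <stdio.h>
  #include <sys/ioctl.h>

  /* Illustrative helper, not part of this patch: ask the kernel how many
   * bytes are ready to be read on a socket, whether or not the socket is
   * currently part of a sockmap.
   */
  static int readable_bytes(int fd)
  {
          int avail = 0;

          if (ioctl(fd, FIONREAD, &avail) < 0) {
                  perror("ioctl(FIONREAD)");
                  return -1;
          }
          return avail;
  }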

Note that we intentionally do not include sk_receive_queue data in the
FIONREAD result. Data in sk_receive_queue has not yet been processed by
the BPF verdict program, and may be redirected to other sockets or
dropped. Including it would create semantic ambiguity since this data
may never be readable by the user.

Unix and VSOCK sockets have similar issues, but fixing them is outside
the scope of this patch as it would require more intrusive changes.

Previous work by John Fastabend made progress towards FIONREAD support in
commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq").
Although the current patch builds on that work, it is acceptable for our
Fixes tag to point to the same commit.

                                                      FD1:read()
                                                      --  FD1->copied_seq++
                                                          |  [read data]
                                                          |
                                   [enqueue data]         v
                  [sockmap]     -> ingress to self ->  ingress_msg queue
FD1 native stack  ------>                                 ^
-- FD1->rcv_nxt++               -> redirect to other      | [enqueue data]
                                       |                  |
                                       |             ingress to FD1
                                       v                  ^
                                      ...                 |  [sockmap]
                                                     FD2 native stack
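
For context, the kind of sk_skb verdict program the diagram assumes could
look like the sketch below (hypothetical map layout and key choice; not
part of this patch):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical sockmap holding FD1 and FD2 from the diagram. */
  struct {
          __uint(type, BPF_MAP_TYPE_SOCKMAP);
          __uint(max_entries, 2);
          __type(key, __u32);
          __type(value, __u64);
  } sock_map SEC(".maps");

  /* Sketch of a stream verdict program: data arriving on a socket's native
   * stack is redirected to the ingress_msg queue of the socket stored at
   * slot 0 (possibly the same socket, possibly another one).
   */
  SEC("sk_skb/stream_verdict")
  int prog_stream_verdict(struct __sk_buff *skb)
  {
          __u32 key = 0; /* hypothetical: slot chosen by the redirect policy */

          return bpf_sk_redirect_map(skb, &sock_map, key, BPF_F_INGRESS);
  }

  char _license[] SEC("license") = "GPL";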

Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20260124113314.113584-3-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
include/linux/skmsg.h
net/core/skmsg.c
net/ipv4/tcp_bpf.c
net/ipv4/udp_bpf.c

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index dfdc158ab88c8e646dc707270e36c9611648ba3b..829b281d6c9c27733caec554f1cb2e3ad0fe7fe5 100644
@@ -97,6 +97,8 @@ struct sk_psock {
        struct sk_buff_head             ingress_skb;
        struct list_head                ingress_msg;
        spinlock_t                      ingress_lock;
+       /** @msg_tot_len: Total bytes queued in ingress_msg list. */
+       u32                             msg_tot_len;
        unsigned long                   state;
        struct list_head                link;
        spinlock_t                      link_lock;
@@ -321,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
        kfree_skb(skb);
 }
 
+static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock)
+{
+       /* Used by ioctl to read msg_tot_len only; lock-free for performance */
+       return READ_ONCE(psock->msg_tot_len);
+}
+
+static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff)
+{
+       /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock().
+        * ingress_lock should be held to prevent concurrent updates to msg_tot_len
+        */
+       WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff);
+}
+
+static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff)
+{
+       spin_lock_bh(&psock->ingress_lock);
+       sk_psock_msg_len_add_locked(psock, diff);
+       spin_unlock_bh(&psock->ingress_lock);
+}
+
 static inline bool sk_psock_queue_msg(struct sk_psock *psock,
                                      struct sk_msg *msg)
 {
@@ -329,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
        spin_lock_bh(&psock->ingress_lock);
        if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
                list_add_tail(&msg->list, &psock->ingress_msg);
+               sk_psock_msg_len_add_locked(psock, msg->sg.size);
                ret = true;
        } else {
                sk_msg_free(psock->sk, msg);
@@ -345,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
 
        spin_lock_bh(&psock->ingress_lock);
        msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
-       if (msg)
+       if (msg) {
                list_del(&msg->list);
+               sk_psock_msg_len_add_locked(psock, -msg->sg.size);
+       }
        spin_unlock_bh(&psock->ingress_lock);
        return msg;
 }
 
+static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock)
+{
+       return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+}
+
 static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
 {
        struct sk_msg *msg;
 
        spin_lock_bh(&psock->ingress_lock);
-       msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+       msg = sk_psock_peek_msg_locked(psock);
        spin_unlock_bh(&psock->ingress_lock);
        return msg;
 }
@@ -523,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
        return !!psock->saved_data_ready;
 }
 
+/* for tcp only, sk is locked */
+static inline ssize_t sk_psock_msg_inq(struct sock *sk)
+{
+       struct sk_psock *psock;
+       ssize_t inq = 0;
+
+       psock = sk_psock_get(sk);
+       if (likely(psock)) {
+               inq = sk_psock_get_msg_len_nolock(psock);
+               sk_psock_put(sk, psock);
+       }
+       return inq;
+}
+
+/* for udp only, sk is not locked */
+static inline ssize_t sk_msg_first_len(struct sock *sk)
+{
+       struct sk_psock *psock;
+       struct sk_msg *msg;
+       ssize_t inq = 0;
+
+       psock = sk_psock_get(sk);
+       if (likely(psock)) {
+               spin_lock_bh(&psock->ingress_lock);
+               msg = sk_psock_peek_msg_locked(psock);
+               if (msg)
+                       inq = msg->sg.size;
+               spin_unlock_bh(&psock->ingress_lock);
+               sk_psock_put(sk, psock);
+       }
+       return inq;
+}
+
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER        (1UL << 1)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d402da5caadd6566bc0638a35b4033feb16d967f..ddde93dd8bc6d4e7941daff8b18229376f0d459d 100644
@@ -458,6 +458,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg
                                        atomic_sub(copy, &sk->sk_rmem_alloc);
                                }
                                msg_rx->sg.size -= copy;
+                               sk_psock_msg_len_add(psock, -copy);
 
                                if (!sge->length) {
                                        sk_msg_iter_var_next(i);
@@ -821,9 +822,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
                list_del(&msg->list);
                if (!msg->skb)
                        atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
+               sk_psock_msg_len_add(psock, -msg->sg.size);
                sk_msg_free(psock->sk, msg);
                kfree(msg);
        }
+       WARN_ON_ONCE(psock->msg_tot_len);
 }
 
 static void __sk_psock_zap_ingress(struct sk_psock *psock)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 5c698fd7fbf8159b7515e6b26b46a0269f05bcd0..ca8a5cb8e569d78d7b1df03eb904ac5cc115cf47 100644
@@ -10,6 +10,7 @@
 
 #include <net/inet_common.h>
 #include <net/tls.h>
+#include <asm/ioctls.h>
 
 void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
@@ -332,6 +333,24 @@ unlock:
        return copied;
 }
 
+static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
+{
+       bool slow;
+
+       if (cmd != SIOCINQ)
+               return tcp_ioctl(sk, cmd, karg);
+
+       /* works similarly to tcp_ioctl() */
+       if (sk->sk_state == TCP_LISTEN)
+               return -EINVAL;
+
+       slow = lock_sock_fast(sk);
+       *karg = sk_psock_msg_inq(sk);
+       unlock_sock_fast(sk, slow);
+
+       return 0;
+}
+
 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                           int flags, int *addr_len)
 {
@@ -610,6 +629,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
        prot[TCP_BPF_BASE].close                = sock_map_close;
        prot[TCP_BPF_BASE].recvmsg              = tcp_bpf_recvmsg;
        prot[TCP_BPF_BASE].sock_is_readable     = sk_msg_is_readable;
+       prot[TCP_BPF_BASE].ioctl                = tcp_bpf_ioctl;
 
        prot[TCP_BPF_TX]                        = prot[TCP_BPF_BASE];
        prot[TCP_BPF_TX].sendmsg                = tcp_bpf_sendmsg;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 0735d820e413f3002bb83fb011d998a9cad1b05c..91233e37cd97a2b0f0c891723b20b1235cc628d1 100644
@@ -5,6 +5,7 @@
 #include <net/sock.h>
 #include <net/udp.h>
 #include <net/inet_common.h>
+#include <asm/ioctls.h>
 
 #include "udp_impl.h"
 
@@ -111,12 +112,26 @@ enum {
 static DEFINE_SPINLOCK(udpv6_prot_lock);
 static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
 
+static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
+{
+       if (cmd != SIOCINQ)
+               return udp_ioctl(sk, cmd, karg);
+
+       /* Since we don't hold the lock, sk_receive_queue may contain data
+        * that the BPF verdict program is still processing. We only care
+        * about the data in ingress_msg here.
+        */
+       *karg = sk_msg_first_len(sk);
+       return 0;
+}
+
 static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
 {
-       *prot        = *base;
-       prot->close  = sock_map_close;
-       prot->recvmsg = udp_bpf_recvmsg;
-       prot->sock_is_readable = sk_msg_is_readable;
+       *prot                   = *base;
+       prot->close             = sock_map_close;
+       prot->recvmsg           = udp_bpf_recvmsg;
+       prot->sock_is_readable  = sk_msg_is_readable;
+       prot->ioctl             = udp_bpf_ioctl;
 }
 
 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)