vsock_poll() reads vsk->peer_shutdown before taking the socket lock
to set EPOLLHUP and EPOLLRDHUP, then reads it again after taking
the lock to report EOF readability. A shutdown packet can update
peer_shutdown while poll is waiting for the lock, so one poll invocation
can report EOF readability without the corresponding HUP/RDHUP bits.
For connectible sockets, take one peer_shutdown snapshot after
lock_sock() and use it for all peer-shutdown-derived poll bits. For
datagram sockets, which do not take lock_sock() in poll(), take one
lockless READ_ONCE() snapshot and pair it with WRITE_ONCE() on the
writer side.
This keeps the peer-shutdown-derived bits internally consistent for each
poll pass.
Fixes: d021c344051a ("VSOCK: Introduce VM Sockets")
Signed-off-by: Ziyu Zhang <ziyuzhang201@gmail.com>
Link: https://patch.msgid.link/20260519165636.62542-1-ziyuzhang201@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
*/
sock_reset_flag(sk, SOCK_DONE);
sk->sk_state = TCP_CLOSE;
- vsk->peer_shutdown = 0;
+ WRITE_ONCE(vsk->peer_shutdown, 0);
}
if (sk->sk_type == SOCK_SEQPACKET) {
vsk->rejected = false;
vsk->sent_request = false;
vsk->ignore_connecting_rst = false;
- vsk->peer_shutdown = 0;
+ WRITE_ONCE(vsk->peer_shutdown, 0);
INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);
return err;
}
+static __poll_t vsock_poll_shutdown(struct sock *sk, u32 peer_shutdown)
+{
+ __poll_t mask = 0;
+
+ /* INET sockets treat local write shutdown and peer write shutdown as a
+ * case of EPOLLHUP set.
+ */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ ((sk->sk_shutdown & SEND_SHUTDOWN) &&
+ (peer_shutdown & SEND_SHUTDOWN)))
+ mask |= EPOLLHUP;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ peer_shutdown & SEND_SHUTDOWN)
+ mask |= EPOLLRDHUP;
+
+ return mask;
+}
+
static __poll_t vsock_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
/* Signify that there has been an error on this socket. */
mask |= EPOLLERR;
- /* INET sockets treat local write shutdown and peer write shutdown as a
- * case of EPOLLHUP set.
- */
- if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
- ((sk->sk_shutdown & SEND_SHUTDOWN) &&
- (vsk->peer_shutdown & SEND_SHUTDOWN))) {
- mask |= EPOLLHUP;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN ||
- vsk->peer_shutdown & SEND_SHUTDOWN) {
- mask |= EPOLLRDHUP;
- }
-
if (sk_is_readable(sk))
mask |= EPOLLIN | EPOLLRDNORM;
if (sock->type == SOCK_DGRAM) {
+ u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown);
+
+ /* DGRAM sockets do not take lock_sock() in poll(), so use one
+ * lockless snapshot for all shutdown-derived mask bits.
+ */
+ mask |= vsock_poll_shutdown(sk, peer_shutdown);
+
/* For datagram sockets we can read if there is something in
* the queue and write as long as the socket isn't shutdown for
* sending.
} else if (sock_type_connectible(sk->sk_type)) {
const struct vsock_transport *transport;
+ u32 peer_shutdown;
lock_sock(sk);
* terminated should also be considered read, and we check the
* shutdown flag for that.
*/
+ peer_shutdown = READ_ONCE(vsk->peer_shutdown);
+ mask |= vsock_poll_shutdown(sk, peer_shutdown);
if (sk->sk_shutdown & RCV_SHUTDOWN ||
- vsk->peer_shutdown & SEND_SHUTDOWN) {
+ peer_shutdown & SEND_SHUTDOWN) {
mask |= EPOLLIN | EPOLLRDNORM;
}
struct sock *sk = sk_vsock(vsk);
sock_set_flag(sk, SOCK_DONE);
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
if (vsock_stream_has_data(vsk) <= 0)
sk->sk_state = TCP_CLOSING;
sk->sk_state_change(sk);
return -EIO;
if (payload_len == 0)
- hvs->vsk->peer_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(hvs->vsk->peer_shutdown,
+ READ_ONCE(hvs->vsk->peer_shutdown) |
+ SEND_SHUTDOWN);
hvs->recv_data_len = payload_len;
hvs->recv_data_off = 0;
return ret;
return hvs->recv_data_len;
case 0:
- vsk->peer_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(vsk->peer_shutdown,
+ READ_ONCE(vsk->peer_shutdown) | SEND_SHUTDOWN);
ret = 0;
break;
default: /* -1 */
struct sock *sk = sk_vsock(vsk);
sock_set_flag(sk, SOCK_DONE);
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
if (vsock_stream_has_data(vsk) <= 0)
sk->sk_state = TCP_CLOSING;
sk->sk_state_change(sk);
case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
sk->sk_write_space(sk);
break;
- case VIRTIO_VSOCK_OP_SHUTDOWN:
+ case VIRTIO_VSOCK_OP_SHUTDOWN: {
+ u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown);
+
if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
- vsk->peer_shutdown |= RCV_SHUTDOWN;
+ peer_shutdown |= RCV_SHUTDOWN;
if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
- vsk->peer_shutdown |= SEND_SHUTDOWN;
- if (vsk->peer_shutdown == SHUTDOWN_MASK) {
+ peer_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(vsk->peer_shutdown, peer_shutdown);
+ if (peer_shutdown == SHUTDOWN_MASK) {
if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) {
(void)virtio_transport_reset(vsk, NULL);
virtio_transport_do_close(vsk, true);
if (le32_to_cpu(virtio_vsock_hdr(skb)->flags))
sk->sk_state_change(sk);
break;
+ }
case VIRTIO_VSOCK_OP_RST:
virtio_transport_do_close(vsk, true);
break;
/* On a detach the peer will not be sending or receiving
* anymore.
*/
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
/* We should not be sending anymore since the peer won't be
* there to receive, but we can still receive if there is data
if (pkt->u.mode) {
vsk = vsock_sk(sk);
- vsk->peer_shutdown |= pkt->u.mode;
+ WRITE_ONCE(vsk->peer_shutdown,
+ READ_ONCE(vsk->peer_shutdown) |
+ pkt->u.mode);
sk->sk_state_change(sk);
}
break;
* a clean shutdown.
*/
sock_set_flag(sk, SOCK_DONE);
- vsk->peer_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK);
if (vsock_stream_has_data(vsk) <= 0)
sk->sk_state = TCP_CLOSING;