socket.7: select()/poll()/epoll honor SO_RCVLOWAT since Linux 2.6.28

[thirdparty/man-pages.git] / man7 / socket.7
diff --git a/man7/socket.7 b/man7/socket.7

index 33376e6f4b87f1191f1dfdefc025207e8312eecd..30b699a071ecb7ac9793fea46c024cb1151ce6cc 100644 (file)
--- a/man7/socket.7
+++ b/man7/socket.7
@@ -61,12 +61,12 @@
  .\"    commit ea02f9411d9faa3553ed09ce0ec9f00ceae9885e
  .\"    Author: Michal Sekletar <msekleta@redhat.com>
  .\"
-.TH SOCKET 7 2017-05-03 Linux "Linux Programmer's Manual"
+.TH SOCKET 7 2019-03-06 Linux "Linux Programmer's Manual"
  .SH NAME
  socket \- Linux socket interface
  .SH SYNOPSIS
  .B #include <sys/socket.h>
-.sp
+.PP
  .IB sockfd " = socket(int " socket_family ", int " socket_type ", int " protocol );
  .SH DESCRIPTION
  This manual page describes the Linux networking socket layer user
@@ -91,7 +91,7 @@ for more information on families and types.
  These functions are used by the user process to send or receive packets
  and to do other socket operations.
  For more information see their respective manual pages.
-
+.PP
  .BR socket (2)
  creates a socket,
  .BR connect (2)
@@ -252,7 +252,7 @@ the various system calls (e.g.,
  .BR getpeername (2)),
  which are generic to all socket domains,
  to determine the domain of a particular socket address.
-
+.PP
  To allow any type of socket address to be passed to
  interfaces in the sockets API,
  the type
@@ -262,7 +262,7 @@ The purpose of this type is purely to allow casting of
  domain-specific socket address types to a "generic" type,
  so as to avoid compiler warnings about type mismatches in
  calls to the sockets API.
-
+.PP
  In addition, the sockets API provides the data type
  .IR "struct sockaddr_storage".
  This type
@@ -272,13 +272,13 @@ address structures; it is large enough and is aligned properly.
  IPv6 socket addresses.)
  The structure includes the following field, which can be used to identify
  the type of socket address actually stored in the structure:
-
+.PP
  .in +4n
-.nf
+.EX
      sa_family_t ss_family;
-.fi
+.EE
  .in
-
+.PP
  The
  .I sockaddr_storage
  structure is useful in programs that must handle socket addresses
@@ -319,25 +319,25 @@ or an extended BPF
  program to the socket for use as a filter of incoming packets.
  A packet will be dropped if the filter program returns zero.
  If the filter program returns a
-non-zero value which is less than the packet's data length,
+nonzero value which is less than the packet's data length,
  the packet will be truncated to the length returned.
  If the value returned by the filter is greater than or equal to the
  packet's data length, the packet is allowed to proceed unmodified.
-
+.IP
  The argument for
  .BR SO_ATTACH_FILTER
  is a
  .I sock_fprog
  structure, defined in
  .IR <linux/filter.h> :
-.sp
+.IP
  .in +4n
-.nf
+.EX
  struct sock_fprog {
      unsigned short      len;
      struct sock_filter *filter;
  };
-.fi
+.EE
  .in
  .IP
  The argument for
@@ -345,14 +345,14 @@ The argument for
  is a file descriptor returned by the
  .BR bpf (2)
  system call and must refer to a program of type
-.BR BPF_PROG_TYPE_SOCKET_FILTER.
-
+.BR BPF_PROG_TYPE_SOCKET_FILTER .
+.IP
  These options may be set multiple times for a given socket,
  each time replacing the previous filter program.
  The classic and extended versions may be called on the same socket,
  but the previous filter will always be replaced such that a socket
  never has more than one filter defined.
-
+.IP
  Both classic and extended BPF are explained in the kernel source file
  .IR Documentation/networking/filter.txt .
  .TP
@@ -367,7 +367,7 @@ program which defines how packets are assigned to
  the sockets in the reuseport group (that is, all sockets which have
  .BR SO_REUSEPORT
  set and are using the same local address to receive packets).
-
+.IP
  The BPF program must return an index between 0 and N\-1 representing
  the socket which should receive the packet
  (where N is the number of sockets in the group).
@@ -375,7 +375,7 @@ If the BPF program returns an invalid index,
  socket selection will fall back to the plain
  .BR SO_REUSEPORT
  mechanism.
-
+.IP
  Sockets are numbered in the order in which they are added to the group
  (that is, the order of
  .BR bind (2)
@@ -387,18 +387,18 @@ When a socket is removed from a reuseport group (via
  .BR close (2)),
  the last socket in the group will be moved into the closed socket's
  position.
-
+.IP
  These options may be set repeatedly at any time on any socket in the group
  to replace the current BPF program used by all sockets in the group.
-
+.IP
  .BR SO_ATTACH_REUSEPORT_CBPF
  takes the same argument type as
  .BR SO_ATTACH_FILTER
  and
  .BR SO_ATTACH_REUSEPORT_EBPF
  takes the same argument type as
-.BR SO_ATTACH_BPF.
-
+.BR SO_ATTACH_BPF .
+.IP
  UDP support for this feature is available since Linux 4.5;
  TCP support is available since Linux 4.6.
  .TP
@@ -420,7 +420,7 @@ sockets.
  It is not supported for packet sockets (use normal
  .BR bind (2)
  there).
-
+.IP
  Before Linux 3.8,
  this socket option could be set, but could not be retrieved with
  .BR getsockopt (2).
@@ -429,7 +429,7 @@ The
  .I optlen
  argument should contain the buffer size available
  to receive the device name and is recommended to be
-.BR IFNAMSZ
+.BR IFNAMSIZ
  bytes.
  The real device name length is reported back in the
  .I optlen
@@ -495,15 +495,14 @@ Expects an integer boolean flag.
  .\" setsockopt 70da268b569d32a9fddeea85dc18043de9d89f89
  Sets or gets the CPU affinity of a socket.
  Expects an integer flag.
-
+.IP
  .in +4n
-.nf
+.EX
  int cpu = 1;
-socklen_t len = sizeof(cpu);
-setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
-.fi
+setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));
+.EE
  .in
-
+.IP
  Because all of the packets for a single stream
  (i.e., all packets for the same 4-tuple)
  arrive on the single RX queue that is associated with a particular CPU,
@@ -523,9 +522,9 @@ This provides optimal NUMA behavior and keeps CPU caches hot.
  .\" >
  .\" > Sorry -- I'm lost here. How does this comment relate to the proposed
  .\" > man page text above?
-.\"  
+.\"
  .\" Simply that :
-.\"  
+.\"
  .\" If an application uses both SO_INCOMING_CPU and SO_REUSEPORT, then
  .\" SO_REUSEPORT logic, selecting the socket to receive the packet, ignores
  .\" SO_INCOMING_CPU setting.
@@ -541,14 +540,14 @@ option.
  The argument is a
  .I linger
  structure.
-.sp
+.IP
  .in +4n
-.nf
+.EX
  struct linger {
      int l_onoff;    /* linger active */
      int l_linger;   /* how many seconds to linger for */
  };
-.fi
+.EE
  .in
  .IP
  When enabled, a
@@ -568,12 +567,12 @@ it always lingers in the background.
  When set, this option will prevent
  changing the filters associated with the socket.
  These filters include any set using the socket options
-.BR SO_ATTACH_FILTER,
-.BR SO_ATTACH_BPF,
-.BR SO_ATTACH_REUSEPORT_CBPF
+.BR SO_ATTACH_FILTER ,
+.BR SO_ATTACH_BPF ,
+.BR SO_ATTACH_REUSEPORT_CBPF ,
  and
-.BR SO_ATTACH_REUSEPORT_EPBF .
-
+.BR SO_ATTACH_REUSEPORT_EBPF .
+.IP
  The typical use case is for a privileged process to set up a raw socket
  (an operation that requires the
  .BR CAP_NET_RAW
@@ -582,7 +581,7 @@ capability), apply a restrictive filter, set the
  option,
  and then either drop its privileges or pass the socket file descriptor
  to an unprivileged process via a UNIX domain socket.
-
+.IP
  Once the
  .BR SO_LOCK_FILTER
  option has been enabled, attempts to change or remove the filter
@@ -624,8 +623,13 @@ Enable or disable the receiving of the
  control message.
  For more information see
  .BR unix (7).
-.\" FIXME Document SO_PASSSEC, added in 2.6.18; there is some info
-.\" in the 2.6.18 ChangeLog
+.TP
+.B SO_PASSSEC
+Enable or disable the receiving of the
+.B SCM_SECURITY
+control message.
+For more information see
+.BR unix (7).
  .TP
  .BR SO_PEEK_OFF " (since Linux 3.4)"
  .\" commit ef64a54f6e558155b4f149bb10666b9e914b6c54
@@ -636,7 +640,7 @@ sockets, sets the value of the "peek offset" for the
  system call when used with the
  .BR MSG_PEEK
  flag.
-
+.IP
  When this option is set to a negative value
  (it is set to \-1 for all new sockets),
  traditional behavior is provided:
@@ -644,14 +648,14 @@ traditional behavior is provided:
  with the
  .BR MSG_PEEK
  flag will peek data from the front of the queue.
-
+.IP
  When the option is set to a value greater than or equal to zero,
  then the next peek at data queued in the socket will occur at
  the byte offset specified by the option value.
  At the same time, the "peek offset" will be
  incremented by the number of bytes that were peeked from the queue,
  so that a subsequent peek will return the next data in the queue.
-
+.IP
  If data is removed from the front of the queue via a call to
  .BR recv (2)
  (or similar) without the
@@ -663,24 +667,24 @@ flag will cause the "peek offset" to be adjusted to maintain
  the correct relative position in the queued data,
  so that a subsequent peek will retrieve the data that would have been
  retrieved had the data not been removed.
-
+.IP
  For datagram sockets, if the "peek offset" points to the middle of a packet,
  the data returned will be marked with the
  .BR MSG_TRUNC
  flag.
-
+.IP
  The following example serves to illustrate the use of
  .BR SO_PEEK_OFF .
  Suppose a stream socket has the following queued input data:
-
+.IP
      aabbccddeeff
  .IP
  The following sequence of
  .BR recv (2)
  calls would have the effect noted in the comments:
-
+.IP
  .in +4n
-.nf
+.EX
  int ov = 4;                  // Set peek offset to 4
  setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &ov, sizeof(ov));
  
@@ -688,31 +692,13 @@ recv(fd, buf, 2, MSG_PEEK);  // Peeks "cc"; offset set to 6
  recv(fd, buf, 2, MSG_PEEK);  // Peeks "dd"; offset set to 8
  recv(fd, buf, 2, 0);         // Reads "aa"; offset set to 6
  recv(fd, buf, 2, MSG_PEEK);  // Peeks "ee"; offset set to 8
-.fi
+.EE
  .in
  .TP
  .B SO_PEERCRED
-Return the credentials of the foreign process connected to this socket.
-This is possible only for connected
-.B AF_UNIX
-stream sockets and
-.B AF_UNIX
-stream and datagram socket pairs created using
-.BR socketpair (2);
-see
+Return the credentials of the peer process connected to this socket.
+For further details, see
  .BR unix (7).
-The returned credentials are those that were in effect at the time
-of the call to
-.BR connect (2)
-or
-.BR socketpair (2).
-The argument is a
-.I ucred
-structure; define the
-.B _GNU_SOURCE
-feature test macro to obtain the definition of that structure from
-.IR <sys/socket.h> .
-This socket option is read-only.
  .TP
  .B SO_PRIORITY
  Set the protocol-defined priority for all packets to be sent on
@@ -778,15 +764,19 @@ fails with the error
  .B SO_RCVLOWAT
  is changeable
  only since Linux 2.4.
-The
-.BR select (2)
+.IP
+Before Linux 2.6.28
+.\" commit c7004482e8dcb7c3c72666395cfa98a216a4fb70
+.BR select (2),
+.BR poll (2),
  and
-.BR poll (2)
-system calls currently do not respect the
+.BR epoll (7)
+did not respect the
  .B SO_RCVLOWAT
  setting on Linux,
-and mark a socket readable when even a single byte of data is available.
-A subsequent read from the socket will block until
+and indicated a socket as readable when even a single byte of data
+was available.
+A subsequent read from the socket would then block until
  .B SO_RCVLOWAT
  bytes are available.
  .\" See http://marc.theaimsgroup.com/?l=linux-kernel&m=111049368106984&w=2
@@ -858,7 +848,7 @@ To prevent port hijacking,
  all of the processes binding to the same address must have the same
  effective UID.
  This option can be employed with both TCP and UDP sockets.
-
+.IP
  For TCP sockets, this option allows
  .BR accept (2)
  load distribution in a multi-threaded server to be improved by
@@ -870,7 +860,7 @@ thread that distributes connections,
  or having multiple threads that compete to
  .BR accept (2)
  from the same socket.
-
+.IP
  For UDP sockets,
  the use of this option can provide better distribution
  of incoming datagrams to multiple processes (or threads) as compared
@@ -881,8 +871,7 @@ compete to receive datagrams on the same socket.
  .\" commit 3b885787ea4112eaa80945999ea0901bf742707f
  Indicates that an unsigned 32-bit value ancillary message (cmsg)
  should be attached to received skbs indicating
-the number of packets dropped by the socket between
-the last received packet and this received packet.
+the number of packets dropped by the socket since its creation.
  .TP
  .B SO_SNDBUF
  Sets or gets the maximum socket send buffer in bytes.
@@ -938,7 +927,7 @@ Increasing this value requires
  The default for this option is controlled by the
  .I /proc/sys/net/core/busy_read
  file.
-
+.IP
  The value in the
  .I /proc/sys/net/core/busy_poll
  file determines how long
@@ -948,11 +937,11 @@ and
  will busy poll when they operate on sockets with
  .BR SO_BUSY_POLL
  set and no events to report are found.
-
+.IP
  In both cases,
  busy polling will only be done when the socket last received data
  from a network device that supports this option.
-
+.IP
  While busy polling may improve latency of some applications,
  care must be taken when using it since this will increase
  both CPU utilization and power usage.
@@ -1037,11 +1026,11 @@ per socket.
  .SS Ioctls
  These operations can be accessed using
  .BR ioctl (2):
-
+.PP
  .in +4n
-.nf
+.EX
  .IB error " = ioctl(" ip_socket ", " ioctl_type ", " &value_result ");"
-.fi
+.EE
  .in
  .TP
  .B SIOCGSTAMP
@@ -1140,7 +1129,7 @@ Linux assumes that half of the send/receive buffer is used for internal
  kernel structures; thus the values in the corresponding
  .I /proc
  files are twice what can be observed on the wire.
-
+.PP
  Linux will allow port reuse only with the
  .B SO_REUSEADDR
  option
@@ -1163,6 +1152,7 @@ program is designed to always set this option.
  .BR setsockopt (2),
  .BR socket (2),
  .BR pcap (3),
+.BR address_families (7),
  .BR capabilities (7),
  .BR ddp (7),
  .BR ip (7),