*/
#define TCP_RX_WINDOW_SCALE 9
+/** TCP selective acknowledgement permitted option */
+struct tcp_sack_permitted_option {
+ uint8_t kind;
+ uint8_t length;
+} __attribute__ (( packed ));
+
+/** Padded TCP selective acknowledgement permitted option (used for sending) */
+struct tcp_sack_permitted_padded_option {
+ uint8_t nop[2];
+ struct tcp_sack_permitted_option spopt;
+} __attribute__ (( packed ));
+
+/** Code for the TCP selective acknowledgement permitted option */
+#define TCP_OPTION_SACK_PERMITTED 4
+
+/** TCP selective acknowledgement option */
+struct tcp_sack_option {
+ uint8_t kind;
+ uint8_t length;
+} __attribute__ (( packed ));
+
+/** TCP selective acknowledgement block */
+struct tcp_sack_block {
+ uint32_t left;
+ uint32_t right;
+} __attribute__ (( packed ));
+
+/** Maximum number of selective acknowledgement blocks
+ *
+ * This allows for the presence of the TCP timestamp option.
+ */
+#define TCP_SACK_MAX 3
+
+/** Padded TCP selective acknowledgement option (used for sending) */
+struct tcp_sack_padded_option {
+ uint8_t nop[2];
+ struct tcp_sack_option sackopt;
+} __attribute__ (( packed ));
+
+/** Code for the TCP selective acknowledgement option */
+#define TCP_OPTION_SACK 5
+
/** TCP timestamp option */
struct tcp_timestamp_option {
uint8_t kind;
const struct tcp_mss_option *mssopt;
/** Window scale option, if present */
const struct tcp_window_scale_option *wsopt;
+ /** SACK permitted option, if present */
+ const struct tcp_sack_permitted_option *spopt;
/** Timestamp option, if present */
const struct tcp_timestamp_option *tsopt;
};
/** Maximum receive window */
uint32_t max_rcv_win;
+ /** Selective acknowledgement list (in host-endian order) */
+ struct tcp_sack_block sack[TCP_SACK_MAX];
+
/** Transmit queue */
struct list_head tx_queue;
/** Receive queue */
TCP_TS_ENABLED = 0x0002,
/** TCP acknowledgement is pending */
TCP_ACK_PENDING = 0x0004,
+ /** TCP selective acknowledgement is enabled */
+ TCP_SACK_ENABLED = 0x0008,
};
/** TCP internal header
* enqueued, and so excludes the SYN, if present.
*/
uint32_t seq;
+ /** Next SEQ value, in host-endian order */
+ uint32_t nxt;
/** Flags
*
* Only FIN is valid within this flags byte; all other flags
return tcp_xmit_win ( tcp );
}
+/**
+ * Find selective acknowledgement block
+ *
+ * @v tcp TCP connection
+ * @v seq SEQ value in SACK block (in host-endian order)
+ * @v sack SACK block to fill in (in host-endian order)
+ * @ret len Length of SACK block
+ */
+static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
+ struct tcp_sack_block *sack ) {
+ struct io_buffer *iobuf;
+ struct tcp_rx_queued_header *tcpqhdr;
+ uint32_t left = tcp->rcv_ack;
+ uint32_t right = left;
+
+ /* Find highest block which does not start after SEQ */
+ list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
+ tcpqhdr = iobuf->data;
+ if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
+ if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
+ break;
+ left = tcpqhdr->seq;
+ }
+ if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
+ right = tcpqhdr->nxt;
+ }
+
+ /* Fail if this block does not contain SEQ */
+ if ( tcp_cmp ( right, seq ) < 0 )
+ return 0;
+
+ /* Populate SACK block */
+ sack->left = left;
+ sack->right = right;
+ return ( right - left );
+}
+
+/**
+ * Update TCP selective acknowledgement list
+ *
+ * @v tcp TCP connection
+ * @v seq SEQ value in first SACK block (in host-endian order)
+ * @ret count Number of SACK blocks
+ */
+static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
+ struct tcp_sack_block sack[TCP_SACK_MAX];
+ unsigned int old = 0;
+ unsigned int new = 0;
+ unsigned int i;
+ uint32_t len;
+
+ /* Populate first new SACK block */
+ len = tcp_sack_block ( tcp, seq, &sack[0] );
+ if ( len )
+ new++;
+
+ /* Populate remaining new SACK blocks based on old SACK blocks */
+ for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {
+
+ /* Stop if we run out of space in the new list */
+ if ( new == TCP_SACK_MAX )
+ break;
+
+ /* Skip empty old SACK blocks */
+ if ( tcp->sack[old].left == tcp->sack[old].right )
+ continue;
+
+ /* Populate new SACK block */
+ len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
+ if ( len == 0 )
+ continue;
+
+ /* Eliminate duplicates */
+ for ( i = 0 ; i < new ; i++ ) {
+ if ( sack[i].left == sack[new].left ) {
+ new--;
+ break;
+ }
+ }
+ new++;
+ }
+
+ /* Update SACK list */
+ memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
+ memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
+ return new;
+}
+
/**
* Process TCP transmit queue
*
}
/**
- * Transmit any outstanding data
+ * Transmit any outstanding data (with selective acknowledgement)
*
* @v tcp TCP connection
+ * @v sack_seq SEQ for first selective acknowledgement (if any)
*
* Transmits any outstanding data on the connection.
*
* will have been started if necessary, and so the stack will
* eventually attempt to retransmit the failed packet.
*/
-static void tcp_xmit ( struct tcp_connection *tcp ) {
+static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
struct io_buffer *iobuf;
struct tcp_header *tcphdr;
struct tcp_mss_option *mssopt;
struct tcp_window_scale_padded_option *wsopt;
struct tcp_timestamp_padded_option *tsopt;
+ struct tcp_sack_permitted_padded_option *spopt;
+ struct tcp_sack_padded_option *sackopt;
+ struct tcp_sack_block *sack;
void *payload;
unsigned int flags;
+ unsigned int sack_count;
+ unsigned int i;
size_t len = 0;
+ size_t sack_len;
uint32_t seq_len;
uint32_t app_win;
uint32_t max_rcv_win;
wsopt->wsopt.kind = TCP_OPTION_WS;
wsopt->wsopt.length = sizeof ( wsopt->wsopt );
wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
+ spopt = iob_push ( iobuf, sizeof ( *spopt ) );
+ memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt ) );
+ spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
+ spopt->spopt.length = sizeof ( spopt->spopt );
}
if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
tsopt->tsopt.tsval = htonl ( currticks() );
tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
}
+ if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
+ ( ! list_empty ( &tcp->rx_queue ) ) &&
+ ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
+ sack_len = ( sack_count * sizeof ( *sack ) );
+ sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
+ memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
+ sackopt->sackopt.kind = TCP_OPTION_SACK;
+ sackopt->sackopt.length =
+ ( sizeof ( sackopt->sackopt ) + sack_len );
+ sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
+ for ( i = 0 ; i < sack_count ; i++, sack++ ) {
+ sack->left = htonl ( tcp->sack[i].left );
+ sack->right = htonl ( tcp->sack[i].right );
+ }
+ }
if ( len != 0 )
flags |= TCP_PSH;
tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
profile_stop ( &tcp_tx_profiler );
}
+/**
+ * Transmit any outstanding data
+ *
+ * @v tcp TCP connection
+ */
+static void tcp_xmit ( struct tcp_connection *tcp ) {
+
+ /* Transmit without an explicit first SACK */
+ tcp_xmit_sack ( tcp, tcp->rcv_ack );
+}
+
/** TCP process descriptor */
static struct process_descriptor tcp_process_desc =
PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
case TCP_OPTION_WS:
options->wsopt = data;
break;
+ case TCP_OPTION_SACK_PERMITTED:
+ options->spopt = data;
+ break;
+ case TCP_OPTION_SACK:
+ /* Ignore received SACKs */
+ break;
case TCP_OPTION_TS:
options->tsopt = data;
break;
* @v seq_len Sequence space length to consume
*/
static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
+ unsigned int sack;
/* Sanity check */
assert ( seq_len > 0 );
/* Update timestamp */
tcp->ts_recent = tcp->ts_val;
+ /* Update SACK list */
+ for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
+ if ( tcp->sack[sack].left == tcp->sack[sack].right )
+ continue;
+ if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
+ tcp->sack[sack].left = tcp->rcv_ack;
+ if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
+ tcp->sack[sack].right = tcp->rcv_ack;
+ }
+
/* Mark ACK as pending */
tcp->flags |= TCP_ACK_PENDING;
}
tcp->rcv_ack = seq;
if ( options->tsopt )
tcp->flags |= TCP_TS_ENABLED;
+ if ( options->spopt )
+ tcp->flags |= TCP_SACK_ENABLED;
if ( options->wsopt ) {
tcp->snd_win_scale = options->wsopt->scale;
tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
struct io_buffer *queued;
size_t len;
uint32_t seq_len;
+ uint32_t nxt;
/* Calculate remaining flags and sequence length. Note that
* SYN, if present, has already been processed by this point.
flags &= TCP_FIN;
len = iob_len ( iobuf );
seq_len = ( len + ( flags ? 1 : 0 ) );
+ nxt = ( seq + seq_len );
/* Discard immediately (to save memory) if:
*
*/
if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
- ( tcp_cmp ( seq + seq_len, tcp->rcv_ack ) < 0 ) ||
+ ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
( seq_len == 0 ) ) {
free_iob ( iobuf );
return;
/* Add internal header */
tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
tcpqhdr->seq = seq;
+ tcpqhdr->nxt = nxt;
tcpqhdr->flags = flags;
/* Add to RX queue */
if ( list_empty ( &tcp->rx_queue ) ) {
process_add ( &tcp->process );
} else {
- tcp_xmit ( tcp );
+ tcp_xmit_sack ( tcp, seq );
}
/* If this packet was the last we expect to receive, set up