]> git.ipfire.org Git - thirdparty/wireguard-go.git/commitdiff
device: distribute crypto work as slice of elements
author Jordan Whited <jordan@tailscale.com>
Mon, 2 Oct 2023 21:41:04 +0000 (14:41 -0700)
committer Jason A. Donenfeld <Jason@zx2c4.com>
Tue, 10 Oct 2023 13:07:36 +0000 (15:07 +0200)
After reducing UDP stack traversal overhead via GSO and GRO,
runtime.chanrecv() began to account for a high percentage (20% in one
environment) of perf samples during a throughput benchmark. The
per-packet channel operations to and from the crypto goroutines were the
primary contributor to this overhead.

Updating these channels to pass vectors, which the device package
already handles at its ends, reduced this overhead substantially, and
improved throughput.

The iperf3 results below demonstrate the effect of this commit between
two Linux computers with i5-12400 CPUs. There is roughly 13us of
round-trip latency between them.

The first result is with UDP GSO and GRO, and with single element
channels.

Starting Test: protocol: TCP, 1 streams, 131072 byte blocks
[ ID] Interval           Transfer     Bitrate         Retr  Cwnd
[  5]   0.00-10.00  sec  12.3 GBytes  10.6 Gbits/sec  232   3.15 MBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
Test Complete. Summary Results:
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  12.3 GBytes  10.6 Gbits/sec  232   sender
[  5]   0.00-10.04  sec  12.3 GBytes  10.6 Gbits/sec        receiver

The second result is with channels updated to pass a slice of
elements.

Starting Test: protocol: TCP, 1 streams, 131072 byte blocks
[ ID] Interval           Transfer     Bitrate         Retr  Cwnd
[  5]   0.00-10.00  sec  13.2 GBytes  11.3 Gbits/sec  182   3.15 MBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
Test Complete. Summary Results:
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  13.2 GBytes  11.3 Gbits/sec  182   sender
[  5]   0.00-10.04  sec  13.2 GBytes  11.3 Gbits/sec        receiver

Reviewed-by: Adrian Dewhurst <adrian@tailscale.com>
Signed-off-by: Jordan Whited <jordan@tailscale.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
device/channels.go
device/receive.go
device/send.go

index 039d8dfd0655e372e56d2cf1f2418f1848bd3a54..40ee5c9a51ae083fe7b4aba0ccd2050fe938557e 100644 (file)
@@ -19,13 +19,13 @@ import (
 // call wg.Done to remove the initial reference.
 // When the refcount hits 0, the queue's channel is closed.
 type outboundQueue struct {
-       c  chan *QueueOutboundElement
+       c  chan *[]*QueueOutboundElement
        wg sync.WaitGroup
 }
 
 func newOutboundQueue() *outboundQueue {
        q := &outboundQueue{
-               c: make(chan *QueueOutboundElement, QueueOutboundSize),
+               c: make(chan *[]*QueueOutboundElement, QueueOutboundSize),
        }
        q.wg.Add(1)
        go func() {
@@ -37,13 +37,13 @@ func newOutboundQueue() *outboundQueue {
 
 // A inboundQueue is similar to an outboundQueue; see those docs.
 type inboundQueue struct {
-       c  chan *QueueInboundElement
+       c  chan *[]*QueueInboundElement
        wg sync.WaitGroup
 }
 
 func newInboundQueue() *inboundQueue {
        q := &inboundQueue{
-               c: make(chan *QueueInboundElement, QueueInboundSize),
+               c: make(chan *[]*QueueInboundElement, QueueInboundSize),
        }
        q.wg.Add(1)
        go func() {
index e24d29f5b077a21672637de26d4ebea5fd98fd13..f0f37a14aa234f43bb98942fa35ddfb0d28ae0cb 100644 (file)
@@ -220,9 +220,7 @@ func (device *Device) RoutineReceiveIncoming(maxBatchSize int, recv conn.Receive
                for peer, elems := range elemsByPeer {
                        if peer.isRunning.Load() {
                                peer.queue.inbound.c <- elems
-                               for _, elem := range *elems {
-                                       device.queue.decryption.c <- elem
-                               }
+                               device.queue.decryption.c <- elems
                        } else {
                                for _, elem := range *elems {
                                        device.PutMessageBuffer(elem.buffer)
@@ -241,26 +239,28 @@ func (device *Device) RoutineDecryption(id int) {
        defer device.log.Verbosef("Routine: decryption worker %d - stopped", id)
        device.log.Verbosef("Routine: decryption worker %d - started", id)
 
-       for elem := range device.queue.decryption.c {
-               // split message into fields
-               counter := elem.packet[MessageTransportOffsetCounter:MessageTransportOffsetContent]
-               content := elem.packet[MessageTransportOffsetContent:]
-
-               // decrypt and release to consumer
-               var err error
-               elem.counter = binary.LittleEndian.Uint64(counter)
-               // copy counter to nonce
-               binary.LittleEndian.PutUint64(nonce[0x4:0xc], elem.counter)
-               elem.packet, err = elem.keypair.receive.Open(
-                       content[:0],
-                       nonce[:],
-                       content,
-                       nil,
-               )
-               if err != nil {
-                       elem.packet = nil
+       for elems := range device.queue.decryption.c {
+               for _, elem := range *elems {
+                       // split message into fields
+                       counter := elem.packet[MessageTransportOffsetCounter:MessageTransportOffsetContent]
+                       content := elem.packet[MessageTransportOffsetContent:]
+
+                       // decrypt and release to consumer
+                       var err error
+                       elem.counter = binary.LittleEndian.Uint64(counter)
+                       // copy counter to nonce
+                       binary.LittleEndian.PutUint64(nonce[0x4:0xc], elem.counter)
+                       elem.packet, err = elem.keypair.receive.Open(
+                               content[:0],
+                               nonce[:],
+                               content,
+                               nil,
+                       )
+                       if err != nil {
+                               elem.packet = nil
+                       }
+                       elem.Unlock()
                }
-               elem.Unlock()
        }
 }
 
index cd8a2a0ddf0bad8d8ae74e154c45ebdf7ac0dfb0..e838c4e847f52f28572815d277d238fa32f9bf9e 100644 (file)
@@ -385,9 +385,7 @@ top:
                        // add to parallel and sequential queue
                        if peer.isRunning.Load() {
                                peer.queue.outbound.c <- elems
-                               for _, elem := range *elems {
-                                       peer.device.queue.encryption.c <- elem
-                               }
+                               peer.device.queue.encryption.c <- elems
                        } else {
                                for _, elem := range *elems {
                                        peer.device.PutMessageBuffer(elem.buffer)
@@ -447,32 +445,34 @@ func (device *Device) RoutineEncryption(id int) {
        defer device.log.Verbosef("Routine: encryption worker %d - stopped", id)
        device.log.Verbosef("Routine: encryption worker %d - started", id)
 
-       for elem := range device.queue.encryption.c {
-               // populate header fields
-               header := elem.buffer[:MessageTransportHeaderSize]
-
-               fieldType := header[0:4]
-               fieldReceiver := header[4:8]
-               fieldNonce := header[8:16]
-
-               binary.LittleEndian.PutUint32(fieldType, MessageTransportType)
-               binary.LittleEndian.PutUint32(fieldReceiver, elem.keypair.remoteIndex)
-               binary.LittleEndian.PutUint64(fieldNonce, elem.nonce)
-
-               // pad content to multiple of 16
-               paddingSize := calculatePaddingSize(len(elem.packet), int(device.tun.mtu.Load()))
-               elem.packet = append(elem.packet, paddingZeros[:paddingSize]...)
-
-               // encrypt content and release to consumer
-
-               binary.LittleEndian.PutUint64(nonce[4:], elem.nonce)
-               elem.packet = elem.keypair.send.Seal(
-                       header,
-                       nonce[:],
-                       elem.packet,
-                       nil,
-               )
-               elem.Unlock()
+       for elems := range device.queue.encryption.c {
+               for _, elem := range *elems {
+                       // populate header fields
+                       header := elem.buffer[:MessageTransportHeaderSize]
+
+                       fieldType := header[0:4]
+                       fieldReceiver := header[4:8]
+                       fieldNonce := header[8:16]
+
+                       binary.LittleEndian.PutUint32(fieldType, MessageTransportType)
+                       binary.LittleEndian.PutUint32(fieldReceiver, elem.keypair.remoteIndex)
+                       binary.LittleEndian.PutUint64(fieldNonce, elem.nonce)
+
+                       // pad content to multiple of 16
+                       paddingSize := calculatePaddingSize(len(elem.packet), int(device.tun.mtu.Load()))
+                       elem.packet = append(elem.packet, paddingZeros[:paddingSize]...)
+
+                       // encrypt content and release to consumer
+
+                       binary.LittleEndian.PutUint64(nonce[4:], elem.nonce)
+                       elem.packet = elem.keypair.send.Seal(
+                               header,
+                               nonce[:],
+                               elem.packet,
+                               nil,
+                       )
+                       elem.Unlock()
+               }
        }
 }