]> git.ipfire.org Git - thirdparty/pdns.git/blob - pdns/xsk.hh
dnsdist: Fix DNS over plain HTTP broken by `reloadAllCertificates()`
[thirdparty/pdns.git] / pdns / xsk.hh
1 /*
2 * This file is part of PowerDNS or dnsdist.
3 * Copyright -- PowerDNS.COM B.V. and its contributors
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of version 2 of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * In addition, for the avoidance of any doubt, permission is granted to
10 * link this program with OpenSSL and to (re)distribute the binaries
11 * produced as the result of such linking.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 */
22
23 #pragma once
24 #include "config.h"
25
26 #ifdef HAVE_XSK
#include <array>
#include <bits/types/struct_timespec.h>
#include <boost/lockfree/spsc_queue.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index_container.hpp>
#include <boost/multi_index/member.hpp>
#include <cstdint>
#include <memory>
#include <optional>
#include <poll.h>
#include <queue>
#include <stdexcept>
#include <string>
#include <unistd.h>
#include <unordered_map>
#include <vector>
42
43 #include <xdp/xsk.h>
44
45 #include "iputils.hh"
46 #include "lock.hh"
47 #include "misc.hh"
48 #include "noinitvector.hh"
49
50 class XskPacket;
51 class XskWorker;
52 class XskSocket;
53
54 using MACAddr = std::array<uint8_t, 6>;
55
// We use an XskSocket to manage an AF_XDP Socket corresponding to a NIC queue.
// The XDP program running in the kernel redirects the data to the XskSocket in userspace.
// We allocate frames that are placed into the descriptors in the fill queue, allowing the kernel to put incoming packets into the frames and place descriptors into the rx queue.
// Once we have read the descriptors from the rx queue we release them, but we own the frames.
// After we are done with the frames, we place them into descriptors of either the fill queue (empty frames) or the tx queue (packets to be sent).
// Once the kernel is done, it places descriptors referencing these frames into the cq where we can recycle them (packets destined to the tx queue or empty frames to the fill queue).

// XskSocket routes packets to multiple worker threads registered on XskSocket via XskSocket::addWorker based on the destination port number of the packet.
// The kernel and the worker thread holding XskWorker will wake up the XskSocket through XskFd and the Eventfd corresponding to each worker thread, respectively.
65
66 class XskSocket
67 {
68 struct XskUmem
69 {
70 xsk_umem* umem{nullptr};
71 uint8_t* bufBase{nullptr};
72 size_t size{0};
73 void umemInit(size_t memSize, xsk_ring_cons* completionQueue, xsk_ring_prod* fillQueue, xsk_umem_config* config);
74 ~XskUmem();
75 XskUmem() = default;
76 };
77 using WorkerContainer = std::unordered_map<int, std::shared_ptr<XskWorker>>;
78 WorkerContainer d_workers;
79 using WorkerRoutesMap = std::unordered_map<ComboAddress, std::shared_ptr<XskWorker>, ComboAddress::addressPortOnlyHash>;
80 // it might be better to move to a StateHolder for performance
81 LockGuarded<WorkerRoutesMap> d_workerRoutes;
82 // number of frames to keep in sharedEmptyFrameOffset
83 static constexpr size_t holdThreshold = 256;
84 // number of frames to insert into the fill queue
85 static constexpr size_t fillThreshold = 128;
86 static constexpr size_t frameSize = 2048;
87 // number of entries (frames) in the umem
88 const size_t frameNum;
89 // responses that have been delayed
90 std::priority_queue<XskPacket> waitForDelay;
91 MACAddr source{};
92 const std::string ifName;
93 // AF_XDP socket then worker waker sockets
94 vector<pollfd> fds;
95 // list of frames, aka (indexes of) umem entries that can be reused to fill fq,
96 // collected from packets that we could not route (unknown destination),
97 // could not parse, were dropped during processing (!UPDATE), or
98 // simply recycled from cq after being processed by the kernel
99 vector<uint64_t> uniqueEmptyFrameOffset;
100 // completion ring: queue where sent packets are stored by the kernel
101 xsk_ring_cons cq{};
102 // rx ring: queue where the incoming packets are stored, read by XskRouter
103 xsk_ring_cons rx{};
104 // fill ring: queue where umem entries available to be filled (put into rx) are stored
105 xsk_ring_prod fq{};
106 // tx ring: queue where outgoing packets are stored
107 xsk_ring_prod tx{};
108 std::unique_ptr<xsk_socket, void (*)(xsk_socket*)> socket;
109 XskUmem umem;
110
111 static constexpr uint32_t fqCapacity = XSK_RING_PROD__DEFAULT_NUM_DESCS * 4;
112 static constexpr uint32_t cqCapacity = XSK_RING_CONS__DEFAULT_NUM_DESCS * 4;
113 static constexpr uint32_t rxCapacity = XSK_RING_CONS__DEFAULT_NUM_DESCS * 2;
114 static constexpr uint32_t txCapacity = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2;
115
116 constexpr static bool isPowOfTwo(uint32_t value) noexcept;
117 [[nodiscard]] static int timeDifference(const timespec& lhs, const timespec& rhs) noexcept;
118
119 [[nodiscard]] uint64_t frameOffset(const XskPacket& packet) const noexcept;
120 [[nodiscard]] int firstTimeout();
121 void getMACFromIfName();
122
123 public:
124 static void clearDestinationMap(const std::string& mapPath, bool isV6);
125 static void addDestinationAddress(const std::string& mapPath, const ComboAddress& destination);
126 static void removeDestinationAddress(const std::string& mapPath, const ComboAddress& destination);
127 static constexpr size_t getFrameSize()
128 {
129 return frameSize;
130 }
131 // list of free umem entries that can be reused
132 std::shared_ptr<LockGuarded<vector<uint64_t>>> sharedEmptyFrameOffset;
133 XskSocket(size_t frameNum, std::string ifName, uint32_t queue_id, const std::string& xskMapPath);
134 [[nodiscard]] int xskFd() const noexcept;
135 // wait until one event has occurred
136 [[nodiscard]] int wait(int timeout);
137 // add as many packets as possible to the rx queue for sending */
138 void send(std::vector<XskPacket>& packets);
139 // look at incoming packets in rx, return them if parsing succeeeded
140 [[nodiscard]] std::vector<XskPacket> recv(uint32_t recvSizeMax, uint32_t* failedCount);
141 void addWorker(std::shared_ptr<XskWorker> worker);
142 void addWorkerRoute(const std::shared_ptr<XskWorker>& worker, const ComboAddress& dest);
143 void removeWorkerRoute(const ComboAddress& dest);
144 [[nodiscard]] std::string getMetrics() const;
145 [[nodiscard]] std::string getXDPMode() const;
146 void markAsFree(const XskPacket& packet);
147 [[nodiscard]] const std::shared_ptr<XskWorker>& getWorkerByDescriptor(int desc) const
148 {
149 return d_workers.at(desc);
150 }
151 [[nodiscard]] std::shared_ptr<XskWorker> getWorkerByDestination(const ComboAddress& destination)
152 {
153 auto routes = d_workerRoutes.lock();
154 auto workerIt = routes->find(destination);
155 if (workerIt == routes->end()) {
156 return nullptr;
157 }
158 return workerIt->second;
159 }
160 [[nodiscard]] const std::vector<pollfd>& getDescriptors() const
161 {
162 return fds;
163 }
164 [[nodiscard]] MACAddr getSourceMACAddress() const
165 {
166 return source;
167 }
168 [[nodiscard]] const std::string& getInterfaceName() const
169 {
170 return ifName;
171 }
172 // pick ups available frames from uniqueEmptyFrameOffset
173 // insert entries from uniqueEmptyFrameOffset into fq
174 void fillFq(uint32_t fillSize = fillThreshold) noexcept;
175 // picks up entries that have been processed (sent) from cq and push them into uniqueEmptyFrameOffset
176 void recycle(size_t size) noexcept;
177 // look at delayed packets, and send the ones that are ready
178 void pickUpReadyPacket(std::vector<XskPacket>& packets);
179 void pushDelayed(XskPacket& packet)
180 {
181 waitForDelay.push(packet);
182 }
183 };
184
185 struct ethhdr;
186 struct iphdr;
187 struct ipv6hdr;
188 struct udphdr;
189
190 class XskPacket
191 {
192 public:
193 enum Flags : uint32_t
194 {
195 UPDATE = 1 << 0,
196 DELAY = 1 << 1,
197 REWRITE = 1 << 2
198 };
199
200 private:
201 ComboAddress from;
202 ComboAddress to;
203 timespec sendTime{};
204 uint8_t* frame{nullptr};
205 size_t frameLength{0};
206 size_t frameSize{0};
207 uint32_t flags{0};
208 bool v6{false};
209
210 // You must set ipHeader.check = 0 before calling this method
211 [[nodiscard]] static __be16 ipv4Checksum(const struct iphdr*) noexcept;
212 [[nodiscard]] static uint64_t ip_checksum_partial(const void* p, size_t len, uint64_t sum) noexcept;
213 [[nodiscard]] static __be16 ip_checksum_fold(uint64_t sum) noexcept;
214 [[nodiscard]] static uint64_t tcp_udp_v4_header_checksum_partial(__be32 src_ip, __be32 dst_ip, uint8_t protocol, uint16_t len) noexcept;
215 [[nodiscard]] static uint64_t tcp_udp_v6_header_checksum_partial(const struct in6_addr* src_ip, const struct in6_addr* dst_ip, uint8_t protocol, uint32_t len) noexcept;
216 static void rewriteIpv4Header(struct iphdr* ipv4header, size_t frameLen) noexcept;
217 static void rewriteIpv6Header(struct ipv6hdr* ipv6header, size_t frameLen) noexcept;
218
219 // You must set l4Header.check = 0 before calling this method
220 // ip options is not supported
221 [[nodiscard]] __be16 tcp_udp_v4_checksum(const struct iphdr*) const noexcept;
222 // You must set l4Header.check = 0 before calling this method
223 [[nodiscard]] __be16 tcp_udp_v6_checksum(const struct ipv6hdr*) const noexcept;
224 /* offset of the L4 (udphdr) header (after ethhdr and iphdr/ipv6hdr) */
225 [[nodiscard]] size_t getL4HeaderOffset() const noexcept;
226 /* offset of the data after the UDP header */
227 [[nodiscard]] size_t getDataOffset() const noexcept;
228 [[nodiscard]] size_t getDataSize() const noexcept;
229 [[nodiscard]] ethhdr getEthernetHeader() const noexcept;
230 void setEthernetHeader(const ethhdr& ethHeader) noexcept;
231 [[nodiscard]] iphdr getIPv4Header() const noexcept;
232 void setIPv4Header(const iphdr& ipv4Header) noexcept;
233 [[nodiscard]] ipv6hdr getIPv6Header() const noexcept;
234 void setIPv6Header(const ipv6hdr& ipv6Header) noexcept;
235 [[nodiscard]] udphdr getUDPHeader() const noexcept;
236 void setUDPHeader(const udphdr& udpHeader) noexcept;
237 void changeDirectAndUpdateChecksum() noexcept;
238
239 constexpr static uint8_t DefaultTTL = 64;
240
241 public:
242 [[nodiscard]] const ComboAddress& getFromAddr() const noexcept;
243 [[nodiscard]] const ComboAddress& getToAddr() const noexcept;
244 [[nodiscard]] const void* getPayloadData() const;
245 [[nodiscard]] bool isIPV6() const noexcept;
246 [[nodiscard]] size_t getCapacity() const noexcept;
247 [[nodiscard]] uint32_t getDataLen() const noexcept;
248 [[nodiscard]] uint32_t getFrameLen() const noexcept;
249 [[nodiscard]] PacketBuffer clonePacketBuffer() const;
250 [[nodiscard]] PacketBuffer cloneHeaderToPacketBuffer() const;
251 void setAddr(const ComboAddress& from_, MACAddr fromMAC, const ComboAddress& to_, MACAddr toMAC) noexcept;
252 bool setPayload(const PacketBuffer& buf);
253 void rewrite() noexcept;
254 void setHeader(PacketBuffer& buf);
255 XskPacket(uint8_t* frame, size_t dataSize, size_t frameSize);
256 void addDelay(int relativeMilliseconds) noexcept;
257 void updatePacket() noexcept;
258 // parse IP and UDP payloads
259 bool parse(bool fromSetHeader);
260 [[nodiscard]] uint32_t getFlags() const noexcept;
261 [[nodiscard]] timespec getSendTime() const noexcept
262 {
263 return sendTime;
264 }
265 [[nodiscard]] uint64_t getFrameOffsetFrom(const uint8_t* base) const noexcept
266 {
267 return frame - base;
268 }
269 };
270 bool operator<(const XskPacket& lhs, const XskPacket& rhs) noexcept;
271
/* g++ signals a ThreadSanitizer build by defining __SANITIZE_THREAD__,
   while clang++ offers __has_feature(thread_sanitizer) instead.
   Translate the latter into the former so that only one macro has to be tested.
   The two conditions are kept nested on purpose: __has_feature(...) must not be
   evaluated by a preprocessor that does not know it. */
#ifdef __has_feature
#if __has_feature(thread_sanitizer)
#define __SANITIZE_THREAD__ 1
#endif
#endif
280
// XskWorker obtains XskPackets of specific ports in the NIC from XskSocket through incomingPacketsQueue.
// After finishing processing the packet, XskWorker puts the packet into outgoingPacketsQueue so that XskSocket decides whether to send it through the network card according to XskPacket::flags.
// XskWorker wakes up XskSocket via xskSocketWaker after putting the packets in outgoingPacketsQueue.
284 class XskWorker
285 {
286 #if defined(__SANITIZE_THREAD__)
287 using XskPacketRing = LockGuarded<boost::lockfree::spsc_queue<XskPacket, boost::lockfree::capacity<XSK_RING_CONS__DEFAULT_NUM_DESCS * 2>>>;
288 #else
289 using XskPacketRing = boost::lockfree::spsc_queue<XskPacket, boost::lockfree::capacity<XSK_RING_CONS__DEFAULT_NUM_DESCS * 2>>;
290 #endif
291
292 public:
293 // queue of packets to be processed by this worker
294 XskPacketRing incomingPacketsQueue;
295 // queue of packets processed by this worker (to be sent, or discarded)
296 XskPacketRing outgoingPacketsQueue;
297
298 uint8_t* umemBufBase{nullptr};
299 // list of frames that are shared with the XskRouter
300 std::shared_ptr<LockGuarded<vector<uint64_t>>> sharedEmptyFrameOffset;
301 // list of frames that we own, used to generate new packets (health-check)
302 vector<uint64_t> uniqueEmptyFrameOffset;
303 const size_t frameSize{XskSocket::getFrameSize()};
304 FDWrapper workerWaker;
305 FDWrapper xskSocketWaker;
306
307 XskWorker();
308 static int createEventfd();
309 static void notify(int desc);
310 static std::shared_ptr<XskWorker> create();
311 void pushToProcessingQueue(XskPacket& packet);
312 void pushToSendQueue(XskPacket& packet);
313 void markAsFree(const XskPacket& packet);
314 // notify worker that at least one packet is available for processing
315 void notifyWorker() const;
316 // notify the router that packets are ready to be sent
317 void notifyXskSocket() const;
318 void waitForXskSocket() const noexcept;
319 void cleanWorkerNotification() const noexcept;
320 void cleanSocketNotification() const noexcept;
321 [[nodiscard]] uint64_t frameOffset(const XskPacket& packet) const noexcept;
322 // reap empty umem entry from sharedEmptyFrameOffset into uniqueEmptyFrameOffset
323 void fillUniqueEmptyOffset();
324 // look for an empty umem entry in uniqueEmptyFrameOffset
325 // then sharedEmptyFrameOffset if needed
326 std::optional<XskPacket> getEmptyFrame();
327 };
328 std::vector<pollfd> getPollFdsForWorker(XskWorker& info);
329 #else
// Empty stand-ins for the three XSK types, declared when HAVE_XSK is not defined.
class XskSocket {};
class XskPacket {};
class XskWorker {};
339
340 #endif /* HAVE_XSK */