]>
Commit | Line | Data |
---|---|---|
cc7ac21d | 1 | .\" Copyright (C) 2015 Alexei Starovoitov <ast@kernel.org> |
ce5db3fc | 2 | .\" and Copyright (C) 2015 Michael Kerrisk <mtk.manpages@gmail.com> |
cc7ac21d | 3 | .\" |
5fbde956 | 4 | .\" SPDX-License-Identifier: Linux-man-pages-copyleft |
cc7ac21d | 5 | .\" |
4c1c5274 | 6 | .TH bpf 2 (date) "Linux man-pages (unreleased)" |
cc7ac21d | 7 | .SH NAME |
99663603 | 8 | bpf \- perform a command on an extended BPF map or program |
cc7ac21d AS |
9 | .SH SYNOPSIS |
10 | .nf | |
11 | .B #include <linux/bpf.h> | |
eaa18d3c | 12 | .PP |
266791fb | 13 | .BI "int bpf(int " cmd ", union bpf_attr *" attr ", unsigned int " size ); |
c36ac88f | 14 | .fi |
cc7ac21d | 15 | .SH DESCRIPTION |
5988a659 | 16 | The |
16152abb | 17 | .BR bpf () |
842ee010 MK |
18 | system call performs a range of operations related to extended |
19 | Berkeley Packet Filters. | |
20 | Extended BPF (or eBPF) is similar to | |
54513c00 MK |
21 | the original ("classic") BPF (cBPF) used to filter network packets. |
22 | For both cBPF and eBPF programs, | |
842ee010 MK |
23 | the kernel statically analyzes the programs before loading them, |
24 | in order to ensure that they cannot harm the running system. | |
11ac5b51 | 25 | .PP |
cc42e9b8 | 26 | eBPF extends cBPF in multiple ways, including the ability to call |
f774ddf1 MK |
27 | a fixed set of in-kernel helper functions |
28 | .\" See 'enum bpf_func_id' in include/uapi/linux/bpf.h | |
29 | (via the | |
842ee010 MK |
30 | .B BPF_CALL |
31 | opcode extension provided by eBPF) | |
ce5db3fc | 32 | and access shared data structures such as eBPF maps. |
fcd1bee3 | 33 | .\" |
cc7ac21d | 34 | .SS Extended BPF Design/Architecture |
953d2673 | 35 | eBPF maps are a generic data structure for storage of different data types. |
9a818ddd | 36 | Data types are generally treated as binary blobs, so a user just specifies |
cd579c3f | 37 | the size of the key and the size of the value at map-creation time. |
9a818ddd | 38 | In other words, a key/value for a given map can have an arbitrary structure. |
f0271688 | 39 | .PP |
cc7ac21d | 40 | A user process can create multiple maps (with key/value-pairs being |
16152abb | 41 | opaque bytes of data) and access them via file descriptors. |
b87d8ba6 | 42 | Different eBPF programs can access the same maps in parallel. |
54513c00 | 43 | It's up to the user process and eBPF program to decide what they store |
cc7ac21d | 44 | inside maps. |
f0271688 | 45 | .PP |
cd579c3f MK |
46 | There's one special map type, called a program array. |
47 | This type of map stores file descriptors referring to other eBPF programs. | |
48 | When a lookup in the map is performed, the program flow is | |
49 | redirected in-place to the beginning of another eBPF program and does not | |
50 | return to the calling program. | |
aabe0499 MK |
51 | The level of nesting has a fixed limit of 32, |
52 | .\" Defined by the kernel constant MAX_TAIL_CALL_CNT in include/linux/bpf.h | |
53 | so that infinite loops cannot be crafted. | |
29c0586f | 54 | At run time, the program file descriptors stored in the map can be modified, |
9a818ddd | 55 | so program functionality can be altered based on specific requirements. |
cd579c3f MK |
56 | All programs referred to in a program-array map must |
57 | have been previously loaded into the kernel via | |
58 | .BR bpf (). | |
59 | If a map lookup fails, the current program continues its execution. | |
60 | See | |
61 | .B BPF_MAP_TYPE_PROG_ARRAY | |
62 | below for further details. | |
11ac5b51 | 63 | .PP |
9a818ddd | 64 | Generally, eBPF programs are loaded by the user process and automatically |
cd579c3f MK |
65 | unloaded when the process exits. |
66 | In some cases, for example, | |
28a4c58c | 67 | .BR tc\-bpf (8), |
9a818ddd | 68 | the program will continue to stay alive inside the kernel even after the |
a0d8ddd1 | 69 | process that loaded the program exits. |
cd579c3f MK |
70 | In that case, |
71 | the tc subsystem holds a reference to the eBPF program after the | |
72 | file descriptor has been closed by the user-space program. | |
9a818ddd DB |
73 | Thus, whether a specific program continues to live inside the kernel |
74 | depends on how it is further attached to a given kernel subsystem | |
75 | after it was loaded via | |
cd579c3f | 76 | .BR bpf (). |
f0271688 | 77 | .PP |
cd579c3f | 78 | Each eBPF program is a set of instructions that is safe to run until |
9a5215bf | 79 | its completion. |
54513c00 | 80 | An in-kernel verifier statically determines that the eBPF program |
9a5215bf | 81 | terminates and is safe to execute. |
896388c8 MK |
82 | During verification, the kernel increments reference counts for each of |
83 | the maps that the eBPF program uses, | |
953d2673 | 84 | so that the attached maps can't be removed until the program is unloaded. |
f0271688 | 85 | .PP |
54513c00 | 86 | eBPF programs can be attached to different events. |
9ab03361 | 87 | These events can be the arrival of network packets, tracing |
953d2673 MK |
88 | events, classification events by network queueing disciplines |
89 | (for eBPF programs attached to a | |
9ab03361 MK |
90 | .BR tc (8) |
91 | classifier), and other types that may be added in the future. | |
54513c00 | 92 | A new event triggers execution of the eBPF program, which |
f774ddf1 | 93 | may store information about the event in eBPF maps. |
54513c00 | 94 | Beyond storing data, eBPF programs may call a fixed set of |
896388c8 | 95 | in-kernel helper functions. |
f0271688 | 96 | .PP |
f774ddf1 | 97 | The same eBPF program can be attached to multiple events and different |
cc42e9b8 | 98 | eBPF programs can access the same map: |
f0271688 | 99 | .PP |
1148d934 | 100 | .in +4n |
f0271688 | 101 | .EX |
cd579c3f MK |
102 | tracing tracing tracing packet packet packet |
103 | event A event B event C on eth0 on eth1 on eth2 | |
9ca13180 | 104 | | | | | | \(ha |
cd579c3f | 105 | | | | | v | |
d064d41a | 106 | \-\-> tracing <\-\- tracing socket tc ingress tc egress |
cd579c3f MK |
107 | prog_1 prog_2 prog_3 classifier action |
108 | | | | | prog_4 prog_5 | |
d064d41a MK |
109 | |\-\-\- \-\-\-\-\-| |\-\-\-\-\-\-| map_3 | | |
110 | map_1 map_2 \-\-| map_4 |\-\- | |
f0271688 | 111 | .EE |
1148d934 | 112 | .in |
fcd1bee3 | 113 | .\" |
5988a659 | 114 | .SS Arguments |
842ee010 | 115 | The operation to be performed by the |
1148d934 | 116 | .BR bpf () |
842ee010 | 117 | system call is determined by the |
266791fb | 118 | .I cmd |
f774ddf1 MK |
119 | argument. |
120 | Each operation takes an accompanying argument, | |
121 | provided via | |
122 | .IR attr , | |
123 | which is a pointer to a union of type | |
266791fb | 124 | .I bpf_attr |
f774ddf1 | 125 | (see below). |
cad2ee71 | 126 | The unused fields and padding must be zeroed out before the call. |
f774ddf1 MK |
127 | The |
128 | .I size | |
129 | argument is the size of the union pointed to by | |
130 | .IR attr . | |
efeece04 | 131 | .PP |
f774ddf1 | 132 | The value provided in |
266791fb | 133 | .I cmd |
f774ddf1 | 134 | is one of the following: |
cc7ac21d AS |
135 | .TP |
136 | .B BPF_MAP_CREATE | |
953d2673 | 137 | Create a map and return a file descriptor that refers to the map. |
0f166ce1 MK |
138 | The close-on-exec file descriptor flag (see |
139 | .BR fcntl (2)) | |
140 | is automatically enabled for the new file descriptor. | |
cc7ac21d AS |
141 | .TP |
142 | .B BPF_MAP_LOOKUP_ELEM | |
842ee010 | 143 | Look up an element by key in a specified map and return its value. |
cc7ac21d AS |
144 | .TP |
145 | .B BPF_MAP_UPDATE_ELEM | |
842ee010 | 146 | Create or update an element (key/value pair) in a specified map. |
cc7ac21d AS |
147 | .TP |
148 | .B BPF_MAP_DELETE_ELEM | |
842ee010 | 149 | Look up and delete an element by key in a specified map. |
cc7ac21d AS |
150 | .TP |
151 | .B BPF_MAP_GET_NEXT_KEY | |
842ee010 MK |
152 | Look up an element by key in a specified map and return the key |
153 | of the next element. | |
cc7ac21d AS |
154 | .TP |
155 | .B BPF_PROG_LOAD | |
9ab03361 MK |
156 | Verify and load an eBPF program, |
157 | returning a new file descriptor associated with the program. | |
0f166ce1 MK |
158 | The close-on-exec file descriptor flag (see |
159 | .BR fcntl (2)) | |
160 | is automatically enabled for the new file descriptor. | |
f0271688 | 161 | .IP |
842ee010 MK |
162 | The |
163 | .I bpf_attr | |
164 | union consists of various anonymous structures that are used by different | |
165 | .BR bpf () | |
166 | commands: | |
b3b5781e | 167 | .PP |
842ee010 | 168 | .in +4n |
f0271688 | 169 | .EX |
cc7ac21d | 170 | union bpf_attr { |
842ee010 | 171 | struct { /* Used by BPF_MAP_CREATE */ |
115b4e0e AC |
172 | __u32 map_type; |
173 | __u32 key_size; /* size of key in bytes */ | |
174 | __u32 value_size; /* size of value in bytes */ | |
175 | __u32 max_entries; /* maximum number of entries | |
842ee010 | 176 | in a map */ |
cc7ac21d AS |
177 | }; |
178 | ||
f774ddf1 MK |
179 | struct { /* Used by BPF_MAP_*_ELEM and BPF_MAP_GET_NEXT_KEY |
180 | commands */ | |
115b4e0e | 181 | __u32 map_fd; |
842ee010 | 182 | __aligned_u64 key; |
cc7ac21d AS |
183 | union { |
184 | __aligned_u64 value; | |
185 | __aligned_u64 next_key; | |
186 | }; | |
115b4e0e | 187 | __u64 flags; |
cc7ac21d AS |
188 | }; |
189 | ||
842ee010 | 190 | struct { /* Used by BPF_PROG_LOAD */ |
115b4e0e AC |
191 | __u32 prog_type; |
192 | __u32 insn_cnt; | |
861d36ba MK |
193 | __aligned_u64 insns; /* \(aqconst struct bpf_insn *\(aq */ |
194 | __aligned_u64 license; /* \(aqconst char *\(aq */ | |
115b4e0e AC |
195 | __u32 log_level; /* verbosity level of verifier */ |
196 | __u32 log_size; /* size of user buffer */ | |
861d36ba | 197 | __aligned_u64 log_buf; /* user supplied \(aqchar *\(aq |
842ee010 | 198 | buffer */ |
115b4e0e | 199 | __u32 kern_version; |
9ab03361 MK |
200 | /* checked when prog_type=kprobe |
201 | (since Linux 4.1) */ | |
202 | .\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5 | |
cc7ac21d AS |
203 | }; |
204 | } __attribute__((aligned(8))); | |
f0271688 | 205 | .EE |
842ee010 | 206 | .in |
fcd1bee3 | 207 | .\" |
ce5db3fc | 208 | .SS eBPF maps |
8440f771 MK |
209 | Maps are a generic data structure for storage of different types of data. |
210 | They allow sharing of data between eBPF kernel programs, | |
211 | and also between kernel and user-space applications. | |
f0271688 | 212 | .PP |
16152abb | 213 | Each map type has the following attributes: |
22356d97 | 214 | .IP \(bu 3 |
16152abb | 215 | type |
22356d97 | 216 | .IP \(bu |
79e2beef | 217 | maximum number of elements |
22356d97 | 218 | .IP \(bu |
16152abb | 219 | key size in bytes |
22356d97 | 220 | .IP \(bu |
16152abb | 221 | value size in bytes |
16152abb | 222 | .PP |
842ee010 MK |
223 | The following wrapper functions demonstrate how various |
224 | .BR bpf () | |
225 | commands can be used to access the maps. | |
9a5215bf | 226 | The functions use the |
266791fb | 227 | .I cmd |
cc7ac21d | 228 | argument to invoke different operations. |
ce5db3fc | 229 | .TP |
842ee010 MK |
230 | .B BPF_MAP_CREATE |
231 | The | |
cc7ac21d | 232 | .B BPF_MAP_CREATE |
5415d504 MK |
233 | command creates a new map, |
234 | returning a new file descriptor that refers to the map. | |
f0271688 | 235 | .IP |
842ee010 | 236 | .in +4n |
f0271688 | 237 | .EX |
842ee010 | 238 | int |
953d2673 MK |
239 | bpf_create_map(enum bpf_map_type map_type, |
240 | unsigned int key_size, | |
241 | unsigned int value_size, | |
242 | unsigned int max_entries) | |
cc7ac21d AS |
243 | { |
244 | union bpf_attr attr = { | |
953d2673 MK |
245 | .map_type = map_type, |
246 | .key_size = key_size, | |
247 | .value_size = value_size, | |
cc7ac21d AS |
248 | .max_entries = max_entries |
249 | }; | |
250 | ||
251 | return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); | |
252 | } | |
f0271688 | 253 | .EE |
842ee010 | 254 | .in |
f0271688 | 255 | .IP |
842ee010 MK |
256 | The new map has the type specified by |
257 | .IR map_type , | |
258 | and attributes as specified in | |
1148d934 MK |
259 | .IR key_size , |
260 | .IR value_size , | |
842ee010 | 261 | and |
1148d934 | 262 | .IR max_entries . |
46a4949b | 263 | On success, this operation returns a file descriptor. |
9a5215bf | 264 | On error, \-1 is returned and |
cc7ac21d | 265 | .I errno |
1148d934 MK |
266 | is set to |
267 | .BR EINVAL , | |
268 | .BR EPERM , | |
269 | or | |
270 | .BR ENOMEM . | |
f0271688 | 271 | .IP |
953d2673 | 272 | The |
cc7ac21d AS |
273 | .I key_size |
274 | and | |
275 | .I value_size | |
953d2673 MK |
276 | attributes will be used by the verifier during program loading |
277 | to check that the program is calling | |
1148d934 MK |
278 | .BR bpf_map_*_elem () |
279 | helper functions with a correctly initialized | |
cc7ac21d | 280 | .I key |
f774ddf1 | 281 | and to check that the program doesn't access the map element |
cc7ac21d AS |
282 | .I value |
283 | beyond the specified | |
16152abb | 284 | .IR value_size . |
842ee010 | 285 | For example, when a map is created with a |
266791fb | 286 | .I key_size |
f774ddf1 | 287 | of 8 and the eBPF program calls |
f0271688 | 288 | .IP |
1148d934 | 289 | .in +4n |
f0271688 | 290 | .EX |
d064d41a | 291 | bpf_map_lookup_elem(map_fd, fp \- 4) |
f0271688 | 292 | .EE |
1148d934 | 293 | .in |
f0271688 | 294 | .IP |
cc7ac21d | 295 | the program will be rejected, |
1148d934 | 296 | since the in-kernel helper function |
f0271688 | 297 | .IP |
c6ba384b | 298 | .in +4n |
f0271688 | 299 | .EX |
c6ba384b | 300 | bpf_map_lookup_elem(map_fd, void *key) |
f0271688 | 301 | .EE |
c6ba384b | 302 | .in |
f0271688 | 303 | .IP |
46a4949b MK |
304 | expects to read 8 bytes from the location pointed to by |
305 | .IR key , | |
306 | but the | |
cd415e73 | 307 | .I fp\ \-\ 4 |
46a4949b MK |
308 | (where |
309 | .I fp | |
310 | is the top of the stack) | |
1148d934 | 311 | starting address will cause out-of-bounds stack access. |
f0271688 | 312 | .IP |
842ee010 MK |
313 | Similarly, when a map is created with a |
314 | .I value_size | |
f774ddf1 | 315 | of 1 and the eBPF program contains |
f0271688 | 316 | .IP |
1148d934 | 317 | .in +4n |
f0271688 | 318 | .EX |
cc7ac21d | 319 | value = bpf_map_lookup_elem(...); |
115b4e0e | 320 | *(u32 *) value = 1; |
f0271688 | 321 | .EE |
1148d934 | 322 | .in |
f0271688 | 323 | .IP |
cc7ac21d AS |
324 | the program will be rejected, since it accesses the |
325 | .I value | |
1148d934 MK |
326 | pointer beyond the specified 1 byte |
327 | .I value_size | |
328 | limit. | |
f0271688 | 329 | .IP |
f774ddf1 MK |
330 | Currently, the following values are supported for |
331 | .IR map_type : | |
f0271688 | 332 | .IP |
1148d934 | 333 | .in +4n |
f0271688 | 334 | .EX |
cc7ac21d | 335 | enum bpf_map_type { |
ce5db3fc | 336 | BPF_MAP_TYPE_UNSPEC, /* Reserve 0 as invalid map type */ |
842ee010 MK |
337 | BPF_MAP_TYPE_HASH, |
338 | BPF_MAP_TYPE_ARRAY, | |
5415d504 | 339 | BPF_MAP_TYPE_PROG_ARRAY, |
1b7adc7c NB |
340 | BPF_MAP_TYPE_PERF_EVENT_ARRAY, |
341 | BPF_MAP_TYPE_PERCPU_HASH, | |
342 | BPF_MAP_TYPE_PERCPU_ARRAY, | |
343 | BPF_MAP_TYPE_STACK_TRACE, | |
344 | BPF_MAP_TYPE_CGROUP_ARRAY, | |
345 | BPF_MAP_TYPE_LRU_HASH, | |
346 | BPF_MAP_TYPE_LRU_PERCPU_HASH, | |
347 | BPF_MAP_TYPE_LPM_TRIE, | |
348 | BPF_MAP_TYPE_ARRAY_OF_MAPS, | |
349 | BPF_MAP_TYPE_HASH_OF_MAPS, | |
350 | BPF_MAP_TYPE_DEVMAP, | |
351 | BPF_MAP_TYPE_SOCKMAP, | |
352 | BPF_MAP_TYPE_CPUMAP, | |
0e861952 PW |
353 | BPF_MAP_TYPE_XSKMAP, |
354 | BPF_MAP_TYPE_SOCKHASH, | |
355 | BPF_MAP_TYPE_CGROUP_STORAGE, | |
356 | BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, | |
357 | BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, | |
358 | BPF_MAP_TYPE_QUEUE, | |
359 | BPF_MAP_TYPE_STACK, | |
360 | /* See /usr/include/linux/bpf.h for the full list. */ | |
cc7ac21d | 361 | }; |
f0271688 | 362 | .EE |
1148d934 | 363 | .in |
f0271688 | 364 | .IP |
cc7ac21d | 365 | .I map_type |
9a5215bf | 366 | selects one of the available map implementations in the kernel. |
f774ddf1 | 367 | .\" FIXME We need an explanation of why one might choose each of |
b913d165 | 368 | .\" these map implementations |
16152abb | 369 | For all map types, |
f774ddf1 MK |
370 | eBPF programs access maps with the same |
371 | .BR bpf_map_lookup_elem () | |
372 | and | |
1148d934 | 373 | .BR bpf_map_update_elem () |
cc7ac21d | 374 | helper functions. |
ce5db3fc | 375 | Further details of the various map types are given below. |
cc7ac21d AS |
376 | .TP |
377 | .B BPF_MAP_LOOKUP_ELEM | |
842ee010 MK |
378 | The |
379 | .B BPF_MAP_LOOKUP_ELEM | |
380 | command looks up an element with a given | |
381 | .I key | |
382 | in the map referred to by the file descriptor | |
383 | .IR fd . | |
f0271688 | 384 | .IP |
842ee010 | 385 | .in +4n |
f0271688 | 386 | .EX |
842ee010 | 387 | int |
953d2673 | 388 | bpf_lookup_elem(int fd, const void *key, void *value) |
cc7ac21d AS |
389 | { |
390 | union bpf_attr attr = { | |
391 | .map_fd = fd, | |
953d2673 MK |
392 | .key = ptr_to_u64(key), |
393 | .value = ptr_to_u64(value), | |
cc7ac21d AS |
394 | }; |
395 | ||
396 | return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); | |
397 | } | |
f0271688 | 398 | .EE |
842ee010 | 399 | .in |
f0271688 | 400 | .IP |
842ee010 MK |
401 | If an element is found, |
402 | the operation returns zero and stores the element's value into | |
5415d504 MK |
403 | .IR value , |
404 | which must point to a buffer of | |
405 | .I value_size | |
406 | bytes. | |
f0271688 | 407 | .IP |
842ee010 | 408 | If no element is found, the operation returns \-1 and sets |
cc7ac21d | 409 | .I errno |
1148d934 MK |
410 | to |
411 | .BR ENOENT . | |
cc7ac21d AS |
412 | .TP |
413 | .B BPF_MAP_UPDATE_ELEM | |
842ee010 MK |
414 | The |
415 | .B BPF_MAP_UPDATE_ELEM | |
416 | command | |
417 | creates or updates an element with a given | |
418 | .I key/value | |
419 | in the map referred to by the file descriptor | |
420 | .IR fd . | |
f0271688 | 421 | .IP |
842ee010 | 422 | .in +4n |
f0271688 | 423 | .EX |
842ee010 | 424 | int |
953d2673 MK |
425 | bpf_update_elem(int fd, const void *key, const void *value, |
426 | uint64_t flags) | |
cc7ac21d AS |
427 | { |
428 | union bpf_attr attr = { | |
429 | .map_fd = fd, | |
953d2673 MK |
430 | .key = ptr_to_u64(key), |
431 | .value = ptr_to_u64(value), | |
432 | .flags = flags, | |
cc7ac21d AS |
433 | }; |
434 | ||
435 | return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); | |
436 | } | |
f0271688 | 437 | .EE |
842ee010 | 438 | .in |
f0271688 | 439 | .IP |
842ee010 | 440 | The |
cc7ac21d | 441 | .I flags |
842ee010 MK |
442 | argument should be specified as one of the following: |
443 | .RS | |
444 | .TP | |
445 | .B BPF_ANY | |
446 | Create a new element or update an existing element. | |
447 | .TP | |
448 | .B BPF_NOEXIST | |
449 | Create a new element only if it did not exist. | |
450 | .TP | |
451 | .B BPF_EXIST | |
452 | Update an existing element. | |
453 | .RE | |
454 | .IP | |
455 | On success, the operation returns zero. | |
cc7ac21d AS |
456 | On error, \-1 is returned and |
457 | .I errno | |
1148d934 MK |
458 | is set to |
459 | .BR EINVAL , | |
460 | .BR EPERM , | |
461 | .BR ENOMEM , | |
462 | or | |
463 | .BR E2BIG . | |
cc7ac21d | 464 | .B E2BIG |
842ee010 | 465 | indicates that the number of elements in the map reached the |
cc7ac21d AS |
466 | .I max_entries |
467 | limit specified at map creation time. | |
468 | .B EEXIST | |
842ee010 MK |
469 | will be returned if |
470 | .I flags | |
471 | specifies | |
472 | .B BPF_NOEXIST | |
473 | and the element with | |
1148d934 MK |
474 | .I key |
475 | already exists in the map. | |
cc7ac21d | 476 | .B ENOENT |
953d2673 | 477 | will be returned if |
842ee010 MK |
478 | .I flags |
479 | specifies | |
480 | .B BPF_EXIST | |
481 | and the element with | |
1148d934 MK |
482 | .I key |
483 | doesn't exist in the map. | |
cc7ac21d AS |
484 | .TP |
485 | .B BPF_MAP_DELETE_ELEM | |
842ee010 MK |
486 | The |
487 | .B BPF_MAP_DELETE_ELEM | |
488 | command | |
96ed2f3f | 489 | deletes the element whose key is |
842ee010 MK |
490 | .I key |
491 | from the map referred to by the file descriptor | |
492 | .IR fd . | |
f0271688 | 493 | .IP |
842ee010 | 494 | .in +4n |
f0271688 | 495 | .EX |
842ee010 | 496 | int |
953d2673 | 497 | bpf_delete_elem(int fd, const void *key) |
cc7ac21d AS |
498 | { |
499 | union bpf_attr attr = { | |
500 | .map_fd = fd, | |
953d2673 | 501 | .key = ptr_to_u64(key), |
cc7ac21d AS |
502 | }; |
503 | ||
504 | return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); | |
505 | } | |
f0271688 | 506 | .EE |
842ee010 | 507 | .in |
f0271688 | 508 | .IP |
842ee010 MK |
509 | On success, zero is returned. |
510 | If the element is not found, \-1 is returned and | |
cc7ac21d | 511 | .I errno |
842ee010 | 512 | is set to |
1148d934 | 513 | .BR ENOENT . |
cc7ac21d AS |
514 | .TP |
515 | .B BPF_MAP_GET_NEXT_KEY | |
842ee010 MK |
516 | The |
517 | .B BPF_MAP_GET_NEXT_KEY | |
518 | command looks up an element by | |
519 | .I key | |
520 | in the map referred to by the file descriptor | |
266791fb | 521 | .I fd |
842ee010 MK |
522 | and sets the |
523 | .I next_key | |
524 | pointer to the key of the next element. | |
f0271688 | 525 | .IP |
842ee010 | 526 | .in +4n |
f0271688 | 527 | .EX |
842ee010 | 528 | int |
953d2673 | 529 | bpf_get_next_key(int fd, const void *key, void *next_key) |
cc7ac21d AS |
530 | { |
531 | union bpf_attr attr = { | |
953d2673 MK |
532 | .map_fd = fd, |
533 | .key = ptr_to_u64(key), | |
cc7ac21d AS |
534 | .next_key = ptr_to_u64(next_key), |
535 | }; | |
536 | ||
537 | return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); | |
538 | } | |
f0271688 | 539 | .EE |
842ee010 | 540 | .in |
f0271688 | 541 | .IP |
5415d504 MK |
542 | If |
543 | .I key | |
544 | is found, the operation returns zero and sets the | |
545 | .I next_key | |
546 | pointer to the key of the next element. | |
cc7ac21d AS |
547 | If |
548 | .I key | |
842ee010 | 549 | is not found, the operation returns zero and sets the |
cc7ac21d AS |
550 | .I next_key |
551 | pointer to the key of the first element. | |
552 | If | |
553 | .I key | |
842ee010 | 554 | is the last element, \-1 is returned and |
cc7ac21d | 555 | .I errno |
842ee010 | 556 | is set to |
1148d934 | 557 | .BR ENOENT . |
9a5215bf | 558 | Other possible |
cc7ac21d | 559 | .I errno |
1148d934 MK |
560 | values are |
561 | .BR ENOMEM , | |
562 | .BR EFAULT , | |
563 | .BR EPERM , | |
564 | and | |
565 | .BR EINVAL . | |
cc7ac21d AS |
566 | This method can be used to iterate over all elements in the map. |
567 | .TP | |
568 | .B close(map_fd) | |
842ee010 | 569 | Delete the map referred to by the file descriptor |
1148d934 | 570 | .IR map_fd . |
842ee010 | 571 | When the user-space program that created a map exits, all maps will |
ce5db3fc MK |
572 | be deleted automatically (but see NOTES). |
573 | .\" | |
574 | .SS eBPF map types | |
575 | The following map types are supported: | |
576 | .TP | |
577 | .B BPF_MAP_TYPE_HASH | |
578 | .\" commit 0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475 | |
ce5db3fc MK |
579 | Hash-table maps have the following characteristics: |
580 | .RS | |
22356d97 | 581 | .IP \(bu 3 |
ce5db3fc MK |
582 | Maps are created and destroyed by user-space programs. |
583 | Both user-space and eBPF programs | |
46a4949b | 584 | can perform lookup, update, and delete operations. |
22356d97 | 585 | .IP \(bu |
ce5db3fc | 586 | The kernel takes care of allocating and freeing key/value pairs. |
22356d97 | 587 | .IP \(bu |
ce5db3fc MK |
588 | The |
589 | .BR map_update_elem () | |
998f951b | 590 | helper will fail to insert a new element when the |
ce5db3fc MK |
591 | .I max_entries |
592 | limit is reached. | |
593 | (This ensures that eBPF programs cannot exhaust memory.) | |
22356d97 | 594 | .IP \(bu |
ce5db3fc MK |
595 | .BR map_update_elem () |
596 | replaces existing elements atomically. | |
597 | .RE | |
598 | .IP | |
953d2673 | 599 | Hash-table maps are |
ce5db3fc MK |
600 | optimized for speed of lookup. |
601 | .TP | |
602 | .B BPF_MAP_TYPE_ARRAY | |
603 | .\" commit 28fbcfa08d8ed7c5a50d41a0433aad222835e8e3 | |
ce5db3fc MK |
604 | Array maps have the following characteristics: |
605 | .RS | |
22356d97 | 606 | .IP \(bu 3 |
ce5db3fc | 607 | Optimized for fastest possible lookup. |
46a4949b | 608 | In the future the verifier/JIT compiler |
ce5db3fc MK |
609 | may recognize lookup() operations that employ a constant key |
610 | and optimize them into a constant pointer. | |
611 | It is possible to optimize a non-constant | |
612 | key into direct pointer arithmetic as well, since pointers and | |
613 | .I value_size | |
614 | are constant for the life of the eBPF program. | |
615 | In other words, | |
616 | .BR array_map_lookup_elem () | |
617 | may be 'inlined' by the verifier/JIT compiler | |
618 | while preserving concurrent access to this map from user space. | |
22356d97 | 619 | .IP \(bu |
ce5db3fc | 620 | All array elements are pre-allocated and zero-initialized at init time. |
22356d97 | 621 | .IP \(bu |
ce5db3fc | 622 | The key is an array index, and must be exactly four bytes. |
22356d97 | 623 | .IP \(bu |
ce5db3fc MK |
624 | .BR map_delete_elem () |
625 | fails with the error | |
626 | .BR EINVAL , | |
627 | since elements cannot be deleted. | |
22356d97 | 628 | .IP \(bu |
ce5db3fc | 629 | .BR map_update_elem () |
953d2673 MK |
630 | replaces elements in a |
631 | .B nonatomic | |
632 | fashion; | |
cd579c3f MK |
633 | for atomic updates, a hash-table map should be used instead. |
634 | There is however one special case that can also be used with arrays: | |
635 | the atomic built-in | |
266791fb | 636 | .B __sync_fetch_and_add() |
cd579c3f MK |
637 | can be used on 32-bit and 64-bit atomic counters. |
638 | For example, it can be | |
9a818ddd DB |
639 | applied on the whole value itself if it represents a single counter, |
640 | or in case of a structure containing multiple counters, it could be | |
cd579c3f MK |
641 | used on individual counters. |
642 | This is quite often useful for aggregation and accounting of events. | |
ce5db3fc MK |
643 | .RE |
644 | .IP | |
645 | Among the uses for array maps are the following: | |
646 | .RS | |
22356d97 | 647 | .IP \(bu 3 |
ce5db3fc MK |
648 | As "global" eBPF variables: an array of 1 element whose key is (index) 0 |
649 | and where the value is a collection of 'global' variables which | |
650 | eBPF programs can use to keep state between events. | |
22356d97 | 651 | .IP \(bu |
ce5db3fc | 652 | Aggregation of tracing events into a fixed set of buckets. |
22356d97 | 653 | .IP \(bu |
9a818ddd DB |
654 | Accounting of networking events, for example, number of packets and packet |
655 | sizes. | |
ce5db3fc MK |
656 | .RE |
657 | .TP | |
658 | .BR BPF_MAP_TYPE_PROG_ARRAY " (since Linux 4.2)" | |
cd579c3f MK |
659 | A program array map is a special kind of array map whose map values |
660 | contain only file descriptors referring to other eBPF programs. | |
661 | Thus, both the | |
662 | .I key_size | |
663 | and | |
664 | .I value_size | |
665 | must be exactly four bytes. | |
9a818ddd | 666 | This map is used in conjunction with the |
cd579c3f | 667 | .BR bpf_tail_call () |
9a818ddd | 668 | helper. |
f0271688 | 669 | .IP |
9a818ddd DB |
670 | This means that an eBPF program with a program array map attached to it |
671 | can call from kernel side into | |
f0271688 | 672 | .IP |
9a818ddd | 673 | .in +4n |
f0271688 | 674 | .EX |
05f10213 MK |
675 | void bpf_tail_call(void *context, void *prog_map, |
676 | unsigned int index); | |
f0271688 | 677 | .EE |
9a818ddd | 678 | .in |
f0271688 | 679 | .IP |
9a818ddd | 680 | and therefore replace its own program flow with the one from the program |
cd579c3f MK |
681 | at the given program array slot, if present. |
682 | This can be regarded as a kind of jump table to a different eBPF program. | |
683 | The invoked program will then reuse the same stack. | |
684 | When a jump into the new program has been performed, | |
685 | it won't return to the old program anymore. | |
f0271688 | 686 | .IP |
aabe0499 MK |
687 | If no eBPF program is found at the given index of the program array |
688 | (because the map slot doesn't contain a valid program file descriptor, | |
689 | the specified lookup index/key is out of bounds, | |
690 | or the limit of 32 | |
691 | .\" MAX_TAIL_CALL_CNT | |
692 | nested calls has been exceeded), | |
9a818ddd DB |
693 | execution continues with the current eBPF program. |
694 | This can be used as a fall-through for default cases. | |
f0271688 | 695 | .IP |
9a818ddd | 696 | A program array map is useful, for example, in tracing or networking, to |
cd579c3f MK |
697 | handle individual system calls or protocols in their own subprograms and |
698 | use their identifiers as an individual map index. | |
699 | This approach may result in performance benefits, | |
700 | and also makes it possible to overcome the maximum | |
701 | instruction limit of a single eBPF program. | |
702 | In dynamic environments, | |
703 | a user-space daemon might atomically replace individual subprograms | |
704 | at run-time with newer versions to alter overall program behavior, | |
705 | for instance, if global policies change. | |
ce5db3fc MK |
706 | .\" |
707 | .SS eBPF programs | |
842ee010 MK |
708 | The |
709 | .B BPF_PROG_LOAD | |
54513c00 | 710 | command is used to load an eBPF program into the kernel. |
9ab03361 | 711 | The return value for this command is a new file descriptor associated |
ce5db3fc | 712 | with this eBPF program. |
f0271688 | 713 | .PP |
842ee010 | 714 | .in +4n |
f0271688 | 715 | .EX |
cc7ac21d AS |
716 | char bpf_log_buf[LOG_BUF_SIZE]; |
717 | ||
842ee010 | 718 | int |
953d2673 | 719 | bpf_prog_load(enum bpf_prog_type type, |
842ee010 MK |
720 | const struct bpf_insn *insns, int insn_cnt, |
721 | const char *license) | |
cc7ac21d AS |
722 | { |
723 | union bpf_attr attr = { | |
953d2673 MK |
724 | .prog_type = type, |
725 | .insns = ptr_to_u64(insns), | |
726 | .insn_cnt = insn_cnt, | |
727 | .license = ptr_to_u64(license), | |
728 | .log_buf = ptr_to_u64(bpf_log_buf), | |
729 | .log_size = LOG_BUF_SIZE, | |
cc7ac21d AS |
730 | .log_level = 1, |
731 | }; | |
732 | ||
733 | return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); | |
734 | } | |
f0271688 | 735 | .EE |
842ee010 | 736 | .in |
f0271688 | 737 | .PP |
1148d934 | 738 | .I prog_type |
cc7ac21d | 739 | is one of the available program types: |
f0271688 | 740 | .IP |
1148d934 | 741 | .in +4n |
f0271688 | 742 | .EX |
cc7ac21d | 743 | enum bpf_prog_type { |
f774ddf1 MK |
744 | BPF_PROG_TYPE_UNSPEC, /* Reserve 0 as invalid |
745 | program type */ | |
ce5db3fc MK |
746 | BPF_PROG_TYPE_SOCKET_FILTER, |
747 | BPF_PROG_TYPE_KPROBE, | |
748 | BPF_PROG_TYPE_SCHED_CLS, | |
749 | BPF_PROG_TYPE_SCHED_ACT, | |
0e861952 PW |
750 | BPF_PROG_TYPE_TRACEPOINT, |
751 | BPF_PROG_TYPE_XDP, | |
752 | BPF_PROG_TYPE_PERF_EVENT, | |
753 | BPF_PROG_TYPE_CGROUP_SKB, | |
754 | BPF_PROG_TYPE_CGROUP_SOCK, | |
755 | BPF_PROG_TYPE_LWT_IN, | |
756 | BPF_PROG_TYPE_LWT_OUT, | |
757 | BPF_PROG_TYPE_LWT_XMIT, | |
758 | BPF_PROG_TYPE_SOCK_OPS, | |
759 | BPF_PROG_TYPE_SK_SKB, | |
760 | BPF_PROG_TYPE_CGROUP_DEVICE, | |
761 | BPF_PROG_TYPE_SK_MSG, | |
762 | BPF_PROG_TYPE_RAW_TRACEPOINT, | |
763 | BPF_PROG_TYPE_CGROUP_SOCK_ADDR, | |
764 | BPF_PROG_TYPE_LWT_SEG6LOCAL, | |
765 | BPF_PROG_TYPE_LIRC_MODE2, | |
766 | BPF_PROG_TYPE_SK_REUSEPORT, | |
767 | BPF_PROG_TYPE_FLOW_DISSECTOR, | |
768 | /* See /usr/include/linux/bpf.h for the full list. */ | |
cc7ac21d | 769 | }; |
f0271688 | 770 | .EE |
1148d934 | 771 | .in |
f0271688 | 772 | .PP |
ce5db3fc | 773 | For further details of eBPF program types, see below. |
f0271688 | 774 | .PP |
ce5db3fc | 775 | The remaining fields of |
842ee010 MK |
776 | .I bpf_attr |
777 | are set as follows: | |
22356d97 | 778 | .IP \(bu 3 |
1148d934 | 779 | .I insns |
842ee010 | 780 | is an array of |
1148d934 MK |
781 | .I "struct bpf_insn" |
782 | instructions. | |
22356d97 | 783 | .IP \(bu |
1148d934 | 784 | .I insn_cnt |
842ee010 MK |
785 | is the number of instructions in the program referred to by |
786 | .IR insns . | |
22356d97 | 787 | .IP \(bu |
1148d934 | 788 | .I license |
842ee010 | 789 | is a license string, which must be GPL compatible to call helper functions |
1148d934 MK |
790 | marked |
791 | .IR gpl_only . | |
fcd1bee3 | 792 | (The licensing rules are the same as for kernel modules, |
9a818ddd | 793 | so dual licenses, such as "Dual BSD/GPL", may also be used.) |
22356d97 | 794 | .IP \(bu |
1148d934 | 795 | .I log_buf |
842ee010 MK |
796 | is a pointer to a caller-allocated buffer in which the in-kernel |
797 | verifier can store the verification log. | |
9a5215bf | 798 | This log is a multi-line string that can be checked by |
cc7ac21d | 799 | the program author in order to understand how the verifier came to |
953d2673 | 800 | the conclusion that the eBPF program is unsafe. |
cc7ac21d | 801 | The format of the output can change at any time as the verifier evolves. |
22356d97 | 802 | .IP \(bu |
1148d934 | 803 | .I log_size |
842ee010 | 804 | is the size of the buffer pointed to by |
029b613f | 805 | .IR log_buf . |
9a5215bf | 806 | If the size of the buffer is not large enough to store all |
cc7ac21d AS |
807 | verifier messages, \-1 is returned and |
808 | .I errno | |
1148d934 MK |
809 | is set to |
810 | .BR ENOSPC . | |
22356d97 | 811 | .IP \(bu |
1148d934 | 812 | .I log_level |
9a5215bf | 813 | is the verbosity level of the verifier. |
fcd1bee3 MK |
814 | A value of zero means that the verifier will not provide a log; |
815 | in this case, | |
816 | .I log_buf | |
817 | must be a NULL pointer, and | |
818 | .I log_size | |
819 | must be zero. | |
f0271688 | 820 | .PP |
ce5db3fc MK |
821 | Applying |
822 | .BR close (2) | |
823 | to the file descriptor returned by | |
824 | .B BPF_PROG_LOAD | |
825 | will unload the eBPF program (but see NOTES). | |
f0271688 | 826 | .PP |
54513c00 MK |
827 | Maps are accessible from eBPF programs and are used to exchange data between |
828 | eBPF programs and between eBPF programs and user-space programs. | |
5415d504 MK |
829 | For example, |
830 | eBPF programs can process various events (such as kprobes or packets) and | |
831 | store their data into a map, | |
832 | and user-space programs can then fetch data from the map. | |
833 | Conversely, user-space programs can use a map as a configuration mechanism, | |
834 | populating the map with values checked by the eBPF program, | |
835 | which then modifies its behavior on the fly according to those values. | |
953d2673 MK |
836 | .\" |
837 | .\" | |
ce5db3fc | 838 | .SS eBPF program types |
953d2673 MK |
839 | The eBPF program type |
840 | .RI ( prog_type ) | |
fcd1bee3 | 841 | determines the subset of kernel helper functions that the program |
953d2673 | 842 | may call. |
fcd1bee3 | 843 | The program type also determines the program input (context)\(emthe |
953d2673 MK |
844 | format of |
845 | .I "struct bpf_context" | |
ce5db3fc | 846 | (which is the data blob passed into the eBPF program as the first argument). |
0fc33df7 | 847 | .\" |
30ea59e7 | 848 | .\" FIXME |
24493e9b | 849 | .\" Somewhere in this page we need a general introduction to the |
0fc33df7 MK |
850 | .\" bpf_context. For example, how does a BPF program access the |
851 | .\" context? | |
f0271688 | 852 | .PP |
953d2673 MK |
853 | For example, a tracing program does not have the exact same |
854 | subset of helper functions as a socket filter program | |
855 | (though they may have some helpers in common). | |
856 | Similarly, | |
857 | the input (context) for a tracing program is a set of register values, | |
858 | while for a socket filter it is a network packet. | |
f0271688 | 859 | .PP |
ce5db3fc MK |
860 | The set of functions available to eBPF programs of a given type may increase |
861 | in the future. | |
f0271688 | 862 | .PP |
ce5db3fc MK |
863 | The following program types are supported: |
864 | .TP | |
865 | .BR BPF_PROG_TYPE_SOCKET_FILTER " (since Linux 3.19)" | |
866 | Currently, the set of functions for | |
867 | .B BPF_PROG_TYPE_SOCKET_FILTER | |
868 | is: | |
f0271688 | 869 | .IP |
1148d934 | 870 | .in +4n |
f0271688 | 871 | .EX |
ce5db3fc MK |
872 | bpf_map_lookup_elem(map_fd, void *key) |
873 | /* look up key in a map_fd */ | |
874 | bpf_map_update_elem(map_fd, void *key, void *value) | |
875 | /* update key/value */ | |
876 | bpf_map_delete_elem(map_fd, void *key) | |
877 | /* delete key in a map_fd */ | |
f0271688 | 878 | .EE |
1148d934 | 879 | .in |
f0271688 | 880 | .IP |
ce5db3fc MK |
881 | The |
882 | .I bpf_context | |
883 | argument is a pointer to a | |
b87d8ba6 | 884 | .IR "struct __sk_buff" . |
953d2673 | 885 | .\" FIXME: We need some text here to explain how the program |
b913d165 MK |
886 | .\" accesses __sk_buff. |
887 | .\" See 'struct __sk_buff' and commit 9bac3d6d548e5 | |
888 | .\" | |
b87d8ba6 | 889 | .\" Alexei commented: |
b913d165 MK |
890 | .\" Actually now in case of SOCKET_FILTER, SCHED_CLS, SCHED_ACT |
891 | .\" the program can now access skb fields. | |
ce5db3fc MK |
892 | .\" |
893 | .TP | |
266791fb | 894 | .BR BPF_PROG_TYPE_KPROBE " (since Linux 4.1)" |
ce5db3fc MK |
895 | .\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5 |
896 | [To be documented] | |
897 | .\" FIXME Document this program type | |
898 | .\" Describe allowed helper functions for this program type | |
899 | .\" Describe bpf_context for this program type | |
b913d165 | 900 | .\" |
ce5db3fc MK |
901 | .\" FIXME We need text here to describe 'kern_version' |
902 | .TP | |
266791fb | 903 | .BR BPF_PROG_TYPE_SCHED_CLS " (since Linux 4.1)" |
ce5db3fc MK |
904 | .\" commit 96be4325f443dbbfeb37d2a157675ac0736531a1 |
905 | .\" commit e2e9b6541dd4b31848079da80fe2253daaafb549 | |
906 | [To be documented] | |
907 | .\" FIXME Document this program type | |
908 | .\" Describe allowed helper functions for this program type | |
909 | .\" Describe bpf_context for this program type | |
910 | .TP | |
266791fb | 911 | .BR BPF_PROG_TYPE_SCHED_ACT " (since Linux 4.1)" |
ce5db3fc MK |
912 | .\" commit 94caee8c312d96522bcdae88791aaa9ebcd5f22c |
913 | .\" commit a8cb5f556b567974d75ea29c15181c445c541b1f | |
914 | [To be documented] | |
915 | .\" FIXME Document this program type | |
916 | .\" Describe allowed helper functions for this program type | |
917 | .\" Describe bpf_context for this program type | |
918 | .SS Events | |
919 | Once a program is loaded, it can be attached to an event. | |
920 | Various kernel subsystems have different ways to do so. | |
f0271688 | 921 | .PP |
ce5db3fc MK |
922 | Since Linux 3.19, |
923 | .\" commit 89aa075832b0da4402acebd698d0411dcc82d03e | |
924 | the following call will attach the program | |
cc7ac21d | 925 | .I prog_fd |
842ee010 MK |
926 | to the socket |
927 | .IR sockfd , | |
ce5db3fc MK |
928 | which was created by an earlier call to |
929 | .BR socket (2): | |
f0271688 | 930 | .PP |
1148d934 | 931 | .in +4n |
f0271688 | 932 | .EX |
ce5db3fc MK |
933 | setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_BPF, |
934 | &prog_fd, sizeof(prog_fd)); | |
f0271688 | 935 | .EE |
1148d934 | 936 | .in |
f0271688 | 937 | .PP |
ce5db3fc MK |
938 | Since Linux 4.1, |
939 | .\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5 | |
940 | the following call may be used to attach | |
941 | the eBPF program referred to by the file descriptor | |
cc7ac21d | 942 | .I prog_fd |
ce5db3fc MK |
943 | to a perf event file descriptor, |
944 | .IR event_fd , | |
945 | that was created by a previous call to | |
946 | .BR perf_event_open (2): | |
efeece04 | 947 | .PP |
ce5db3fc | 948 | .in +4n |
b76974c1 | 949 | .EX |
ce5db3fc | 950 | ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); |
b76974c1 | 951 | .EE |
ce5db3fc MK |
952 | .in |
953 | .\" | |
ce5db3fc | 954 | .\" |
cc7ac21d AS |
955 | .SH RETURN VALUE |
956 | For a successful call, the return value depends on the operation: | |
957 | .TP | |
958 | .B BPF_MAP_CREATE | |
ce5db3fc | 959 | The new file descriptor associated with the eBPF map. |
cc7ac21d AS |
960 | .TP |
961 | .B BPF_PROG_LOAD | |
54513c00 | 962 | The new file descriptor associated with the eBPF program. |
cc7ac21d AS |
963 | .TP |
964 | All other commands | |
965 | Zero. | |
966 | .PP | |
967 | On error, \-1 is returned, and | |
968 | .I errno | |
f6a4078b | 969 | is set to indicate the error. |
cc7ac21d AS |
970 | .SH ERRORS |
971 | .TP | |
266791fb | 972 | .B E2BIG |
6cedbd4c MK |
973 | The eBPF program is too large or a map reached the |
974 | .I max_entries | |
975 | limit (maximum number of elements). | |
cc7ac21d | 976 | .TP |
266791fb | 977 | .B EACCES |
6cedbd4c | 978 | For |
266791fb | 979 | .BR BPF_PROG_LOAD , |
6cedbd4c MK |
980 | even though all program instructions are valid, the program has been |
981 | rejected because it was deemed unsafe. | |
982 | This may be because it may have | |
983 | accessed a disallowed memory region or an uninitialized stack/register or | |
984 | because the function constraints don't match the actual types or because | |
985 | there was a misaligned memory access. | |
986 | In this case, it is recommended to call | |
987 | .BR bpf () | |
988 | again with | |
989 | .I log_level = 1 | |
990 | and examine | |
991 | .I log_buf | |
992 | for the specific reason provided by the verifier. | |
cc7ac21d AS |
993 | .TP |
994 | .B EBADF | |
995 | .I fd | |
7d6bfe72 | 996 | is not an open file descriptor. |
cc7ac21d AS |
997 | .TP |
998 | .B EFAULT | |
1148d934 MK |
999 | One of the pointers |
1000 | .RI ( key | |
cc7ac21d AS |
1001 | or |
1002 | .I value | |
1003 | or | |
1004 | .I log_buf | |
1005 | or | |
1148d934 MK |
1006 | .IR insns ) |
1007 | is outside the accessible address space. | |
cc7ac21d AS |
1008 | .TP |
1009 | .B EINVAL | |
1010 | The value specified in | |
1011 | .I cmd | |
1012 | is not recognized by this kernel. | |
1013 | .TP | |
1014 | .B EINVAL | |
1015 | For | |
1016 | .BR BPF_MAP_CREATE , | |
1017 | either | |
1018 | .I map_type | |
1019 | or attributes are invalid. | |
1020 | .TP | |
1021 | .B EINVAL | |
1022 | For | |
266791fb | 1023 | .B BPF_MAP_*_ELEM |
cc7ac21d | 1024 | commands, |
1148d934 MK |
1025 | some of the fields of |
1026 | .I "union bpf_attr" | |
1027 | that are not used by this command | |
cc7ac21d AS |
1028 | are not set to zero. |
1029 | .TP | |
1030 | .B EINVAL | |
1031 | For | |
266791fb | 1032 | .BR BPF_PROG_LOAD , |
9a5215bf | 1033 | indicates an attempt to load an invalid program. |
953d2673 MK |
1034 | eBPF programs can be deemed |
1035 | invalid due to unrecognized instructions, the use of reserved fields, jumps | |
cc7ac21d AS |
1036 | out of range, infinite loops or calls of unknown functions. |
1037 | .TP | |
266791fb | 1038 | .B ENOENT |
cc7ac21d AS |
1039 | For |
1040 | .B BPF_MAP_LOOKUP_ELEM | |
1041 | or | |
16152abb | 1042 | .BR BPF_MAP_DELETE_ELEM , |
cc7ac21d AS |
1043 | indicates that the element with the given |
1044 | .I key | |
1045 | was not found. | |
1046 | .TP | |
6cedbd4c MK |
1047 | .B ENOMEM |
1048 | Cannot allocate sufficient memory. | |
1049 | .TP | |
1050 | .B EPERM | |
1051 | The call was made without sufficient privilege | |
1052 | (without the | |
1053 | .B CAP_SYS_ADMIN | |
1054 | capability). | |
5f920e10 MK |
1055 | .SH VERSIONS |
1056 | The | |
1057 | .BR bpf () | |
1058 | system call first appeared in Linux 3.18. | |
3113c7f3 | 1059 | .SH STANDARDS |
8dbf8f2d MK |
1060 | The |
1061 | .BR bpf () | |
1062 | system call is Linux-specific. | |
cc7ac21d | 1063 | .SH NOTES |
821bf91c | 1064 | Prior to Linux 4.4, all |
842ee010 MK |
1065 | .BR bpf () |
1066 | commands required the caller to have the | |
cc7ac21d | 1067 | .B CAP_SYS_ADMIN |
35732aa7 MK |
1068 | capability. |
1069 | From Linux 4.4 onwards, | |
1070 | .\" commit 1be7f75d1668d6296b80bf35dcf6762393530afc | |
1071 | an unprivileged user may create limited programs of type | |
1ae6b2c7 | 1072 | .B BPF_PROG_TYPE_SOCKET_FILTER |
35732aa7 MK |
1073 | and associated maps. |
1074 | However, they may not store kernel pointers within | |
821bf91c | 1075 | the maps and are presently limited to the following helper functions: |
f7d706ba MK |
1076 | .\" [Linux 5.6] mtk: The list of available functions is, I think, governed |
1077 | .\" by the check in net/core/filter.c::bpf_base_func_proto(). | |
22356d97 | 1078 | .IP \(bu 3 |
821bf91c RP |
1079 | get_random |
1080 | .PD 0 | |
22356d97 | 1081 | .IP \(bu |
821bf91c | 1082 | get_smp_processor_id |
22356d97 | 1083 | .IP \(bu |
821bf91c | 1084 | tail_call |
22356d97 | 1085 | .IP \(bu |
821bf91c | 1086 | ktime_get_ns |
22356d97 | 1087 | .PD |
821bf91c | 1088 | .PP |
c53d4fc1 | 1089 | Unprivileged access may be blocked by writing the value 1 to the file |
821bf91c | 1090 | .IR /proc/sys/kernel/unprivileged_bpf_disabled . |
f0271688 | 1091 | .PP |
f774ddf1 MK |
1092 | eBPF objects (maps and programs) can be shared between processes. |
1093 | For example, after | |
1094 | .BR fork (2), | |
1095 | the child inherits file descriptors referring to the same eBPF objects. | |
1096 | In addition, file descriptors referring to eBPF objects can be | |
1097 | transferred over UNIX domain sockets. | |
1098 | File descriptors referring to eBPF objects can be duplicated | |
1099 | in the usual way, using | |
1100 | .BR dup (2) | |
1101 | and similar calls. | |
1102 | An eBPF object is deallocated only after all file descriptors | |
1103 | referring to the object have been closed. | |
f0271688 | 1104 | .PP |
4fba111e MK |
1105 | eBPF programs can be written in a restricted C that is compiled (using the |
1106 | .B clang | |
953d2673 MK |
1107 | compiler) into eBPF bytecode. |
1108 | Various features are omitted from this restricted C, such as loops, | |
f774ddf1 | 1109 | global variables, variadic functions, floating-point numbers, |
953d2673 | 1110 | and passing structures as function arguments. |
4fba111e MK |
1111 | Some examples can be found in the |
1112 | .I samples/bpf/*_kern.c | |
1113 | files in the kernel source tree. | |
ce5db3fc MK |
1114 | .\" There are also examples for the tc classifier, in the iproute2 |
1115 | .\" project, in examples/bpf | |
f0271688 | 1116 | .PP |
953d2673 MK |
1117 | The kernel contains a just-in-time (JIT) compiler that translates |
1118 | eBPF bytecode into native machine code for better performance. | |
5a29959a MK |
1119 | In kernels before Linux 4.15, |
1120 | the JIT compiler is disabled by default, | |
953d2673 MK |
1121 | but its operation can be controlled by writing one of the |
1122 | following integer strings to the file | |
1123 | .IR /proc/sys/net/core/bpf_jit_enable : | |
4279e42d AC |
1124 | .TP |
1125 | .B 0 | |
953d2673 | 1126 | Disable JIT compilation (default). |
4279e42d AC |
1127 | .TP |
1128 | .B 1 | |
953d2673 | 1129 | Normal compilation. |
4279e42d AC |
1130 | .TP |
1131 | .B 2 | |
953d2673 MK |
1132 | Debugging mode. |
1133 | The generated opcodes are dumped in hexadecimal into the kernel log. | |
1134 | These opcodes can then be disassembled using the program | |
266791fb | 1135 | .I tools/net/bpf_jit_disasm.c |
953d2673 | 1136 | provided in the kernel source tree. |
fcd1bee3 | 1137 | .PP |
5a29959a MK |
1138 | Since Linux 4.15, |
1139 | .\" commit 290af86629b25ffd1ed6232c4e9107da031705cb | |
1140 | the kernel may be configured with the | |
1141 | .B CONFIG_BPF_JIT_ALWAYS_ON | |
1142 | option. | |
1143 | In this case, the JIT compiler is always enabled, and the | |
1144 | .I bpf_jit_enable | |
1145 | file is initialized to 1 and is immutable. | |
1146 | (This kernel configuration option was provided as a mitigation for | |
1147 | one of the Spectre attacks against the BPF interpreter.) | |
1148 | .PP | |
2b623a23 | 1149 | The JIT compiler for eBPF is currently |
4167f63f | 1150 | .\" Last reviewed in Linux 4.18-rc by grepping for BPF_ALU64 in arch/ |
6d2ac026 MK |
1151 | .\" and by checking the documentation for bpf_jit_enable in |
1152 | .\" Documentation/sysctl/net.txt | |
2b623a23 | 1153 | available for the following architectures: |
22356d97 | 1154 | .IP \(bu 3 |
2ef9216b MK |
1155 | x86-64 (since Linux 3.18; cBPF since Linux 3.0); |
1156 | .\" commit 0a14842f5a3c0e88a1e59fac5c3025db39721f74 | |
2b623a23 | 1157 | .PD 0 |
22356d97 | 1158 | .IP \(bu |
2ef9216b MK |
1159 | ARM32 (since Linux 3.18; cBPF since Linux 3.4); |
1160 | .\" commit ddecdfcea0ae891f782ae853771c867ab51024c2 | |
22356d97 | 1161 | .IP \(bu |
2ef9216b MK |
1162 | SPARC 32 (since Linux 3.18; cBPF since Linux 3.5); |
1163 | .\" commit 2809a2087cc44b55e4377d7b9be3f7f5d2569091 | |
22356d97 | 1164 | .IP \(bu |
2ef9216b MK |
1165 | ARM-64 (since Linux 3.18); |
1166 | .\" commit e54bcde3d69d40023ae77727213d14f920eb264a | |
22356d97 | 1167 | .IP \(bu |
069be4fd MK |
1168 | s390 (since Linux 4.1; cBPF since Linux 3.7); |
1169 | .\" commit c10302efe569bfd646b4c22df29577a4595b4580 | |
22356d97 | 1170 | .IP \(bu |
2ef9216b MK |
1171 | PowerPC 64 (since Linux 4.8; cBPF since Linux 3.1); |
1172 | .\" commit 0ca87f05ba8bdc6791c14878464efc901ad71e99 | |
1173 | .\" commit 156d0e290e969caba25f1851c52417c14d141b24 | |
22356d97 | 1174 | .IP \(bu |
2b623a23 | 1175 | SPARC 64 (since Linux 4.12); |
2ef9216b | 1176 | .\" commit 7a12b5031c6b947cc13918237ae652b536243b76 |
22356d97 | 1177 | .IP \(bu |
2ef9216b MK |
1178 | x86-32 (since Linux 4.18); |
1179 | .\" commit 03f5781be2c7b7e728d724ac70ba10799cc710d7 | |
22356d97 | 1180 | .IP \(bu |
2ef9216b MK |
1181 | MIPS 64 (since Linux 4.18; cBPF since Linux 3.16); |
1182 | .\" commit c6610de353da5ca6eee5b8960e838a87a90ead0c | |
1183 | .\" commit f381bf6d82f032b7410185b35d000ea370ac706b | |
22356d97 | 1184 | .IP \(bu |
2ef9216b MK |
1185 | riscv (since Linux 5.1). |
1186 | .\" commit 2353ecc6f91fd15b893fa01bf85a1c7a823ee4f2 | |
2b623a23 | 1187 | .PD |
ce7ba00b | 1188 | .SH EXAMPLES |
33857069 | 1189 | .\" [[FIXME]] SRC BEGIN (bpf.c) |
ce7ba00b MK |
1190 | .EX |
1191 | /* bpf+sockets example: | |
1192 | * 1. create array map of 256 elements | |
1193 | * 2. load program that counts number of packets received | |
d064d41a | 1194 | * r0 = skb\->data[ETH_HLEN + offsetof(struct iphdr, protocol)] |
ce7ba00b MK |
1195 | * map[r0]++ |
1196 | * 3. attach prog_fd to raw socket via setsockopt() | |
1197 | * 4. print number of received TCP/UDP packets every second | |
1198 | */ | |
1199 | int | |
aa1f53cc | 1200 | main(int argc, char *argv[]) |
ce7ba00b MK |
1201 | { |
1202 | int sock, map_fd, prog_fd, key; | |
1203 | long long value = 0, tcp_cnt, udp_cnt; | |
1204 | ||
1205 | map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), | |
1206 | sizeof(value), 256); | |
1207 | if (map_fd < 0) { | |
1208 | printf("failed to create map \(aq%s\(aq\en", strerror(errno)); | |
1209 | /* likely not run as root */ | |
1210 | return 1; | |
1211 | } | |
1212 | ||
1213 | struct bpf_insn prog[] = { | |
1214 | BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */ | |
1215 | BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol)), | |
d064d41a MK |
1216 | /* r0 = ip\->proto */ |
1217 | BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, \-4), | |
115b4e0e | 1218 | /* *(u32 *)(fp \- 4) = r0 */ |
ce7ba00b | 1219 | BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */ |
d064d41a | 1220 | BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, \-4), /* r2 = r2 \- 4 */ |
ce7ba00b MK |
1221 | BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* r1 = map_fd */ |
1222 | BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), | |
1223 | /* r0 = map_lookup(r1, r2) */ | |
1224 | BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), | |
1225 | /* if (r0 == 0) goto pc+2 */ | |
1226 | BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ | |
1227 | BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), | |
115b4e0e | 1228 | /* lock *(u64 *) r0 += r1 */ |
ce7ba00b MK |
1229 | .\" == atomic64_add |
1230 | BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ | |
1231 | BPF_EXIT_INSN(), /* return r0 */ | |
1232 | }; | |
1233 | ||
1234 | prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, | |
1235 | sizeof(prog) / sizeof(prog[0]), "GPL"); | |
1236 | ||
1237 | sock = open_raw_sock("lo"); | |
1238 | ||
1239 | assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, | |
1240 | sizeof(prog_fd)) == 0); | |
1241 | ||
1242 | for (;;) { | |
1243 | key = IPPROTO_TCP; | |
1244 | assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0); | |
1245 | key = IPPROTO_UDP; | |
1246 | assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0); | |
1247 | printf("TCP %lld UDP %lld packets\en", tcp_cnt, udp_cnt); | |
1248 | sleep(1); | |
1249 | } | |
1250 | ||
1251 | return 0; | |
1252 | } | |
1253 | .EE | |
33857069 | 1254 | .\" SRC END |
ce7ba00b MK |
1255 | .PP |
1256 | Some complete working code can be found in the | |
1257 | .I samples/bpf | |
1258 | directory in the kernel source tree. | |
cc7ac21d | 1259 | .SH SEE ALSO |
842ee010 | 1260 | .BR seccomp (2), |
28a4c58c | 1261 | .BR bpf\-helpers (7), |
cc42e9b8 | 1262 | .BR socket (7), |
8440f771 | 1263 | .BR tc (8), |
28a4c58c | 1264 | .BR tc\-bpf (8) |
f0271688 | 1265 | .PP |
5988a659 | 1266 | Both classic and extended BPF are explained in the kernel source file |
1148d934 | 1267 | .IR Documentation/networking/filter.txt . |