]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/bpf.2
Many pages: Use \[aq] instead of \(aq
[thirdparty/man-pages.git] / man2 / bpf.2
CommitLineData
cc7ac21d 1.\" Copyright (C) 2015 Alexei Starovoitov <ast@kernel.org>
ce5db3fc 2.\" and Copyright (C) 2015 Michael Kerrisk <mtk.manpages@gmail.com>
cc7ac21d 3.\"
5fbde956 4.\" SPDX-License-Identifier: Linux-man-pages-copyleft
cc7ac21d 5.\"
4c1c5274 6.TH bpf 2 (date) "Linux man-pages (unreleased)"
cc7ac21d 7.SH NAME
99663603 8bpf \- perform a command on an extended BPF map or program
cc7ac21d
AS
9.SH SYNOPSIS
10.nf
11.B #include <linux/bpf.h>
eaa18d3c 12.PP
266791fb 13.BI "int bpf(int " cmd ", union bpf_attr *" attr ", unsigned int " size );
c36ac88f 14.fi
cc7ac21d 15.SH DESCRIPTION
5988a659 16The
16152abb 17.BR bpf ()
842ee010
MK
18system call performs a range of operations related to extended
19Berkeley Packet Filters.
20Extended BPF (or eBPF) is similar to
54513c00
MK
21the original ("classic") BPF (cBPF) used to filter network packets.
22For both cBPF and eBPF programs,
842ee010
MK
23the kernel statically analyzes the programs before loading them,
24in order to ensure that they cannot harm the running system.
11ac5b51 25.PP
cc42e9b8 26eBPF extends cBPF in multiple ways, including the ability to call
f774ddf1
MK
27a fixed set of in-kernel helper functions
28.\" See 'enum bpf_func_id' in include/uapi/linux/bpf.h
29(via the
842ee010
MK
30.B BPF_CALL
31opcode extension provided by eBPF)
ce5db3fc 32and access shared data structures such as eBPF maps.
fcd1bee3 33.\"
cc7ac21d 34.SS Extended BPF Design/Architecture
953d2673 35eBPF maps are a generic data structure for storage of different data types.
9a818ddd 36Data types are generally treated as binary blobs, so a user just specifies
cd579c3f 37the size of the key and the size of the value at map-creation time.
9a818ddd 38In other words, a key/value for a given map can have an arbitrary structure.
f0271688 39.PP
cc7ac21d 40A user process can create multiple maps (with key/value-pairs being
16152abb 41opaque bytes of data) and access them via file descriptors.
b87d8ba6 42Different eBPF programs can access the same maps in parallel.
54513c00 43It's up to the user process and eBPF program to decide what they store
cc7ac21d 44inside maps.
f0271688 45.PP
cd579c3f
MK
46There's one special map type, called a program array.
47This type of map stores file descriptors referring to other eBPF programs.
48When a lookup in the map is performed, the program flow is
49redirected in-place to the beginning of another eBPF program and does not
50return to the calling program.
aabe0499
MK
51The level of nesting has a fixed limit of 32,
52.\" Defined by the kernel constant MAX_TAIL_CALL_CNT in include/linux/bpf.h
53so that infinite loops cannot be crafted.
29c0586f 54At run time, the program file descriptors stored in the map can be modified,
9a818ddd 55so program functionality can be altered based on specific requirements.
cd579c3f
MK
56All programs referred to in a program-array map must
57have been previously loaded into the kernel via
58.BR bpf ().
59If a map lookup fails, the current program continues its execution.
60See
61.B BPF_MAP_TYPE_PROG_ARRAY
62below for further details.
11ac5b51 63.PP
9a818ddd 64Generally, eBPF programs are loaded by the user process and automatically
cd579c3f
MK
65unloaded when the process exits.
66In some cases, for example,
28a4c58c 67.BR tc\-bpf (8),
9a818ddd 68the program will continue to stay alive inside the kernel even after the
a0d8ddd1 69process that loaded the program exits.
cd579c3f
MK
70In that case,
71the tc subsystem holds a reference to the eBPF program after the
72file descriptor has been closed by the user-space program.
9a818ddd
DB
73Thus, whether a specific program continues to live inside the kernel
74depends on how it is further attached to a given kernel subsystem
75after it was loaded via
cd579c3f 76.BR bpf ().
f0271688 77.PP
cd579c3f 78Each eBPF program is a set of instructions that is safe to run until
9a5215bf 79its completion.
54513c00 80An in-kernel verifier statically determines that the eBPF program
9a5215bf 81terminates and is safe to execute.
896388c8
MK
82During verification, the kernel increments reference counts for each of
83the maps that the eBPF program uses,
953d2673 84so that the attached maps can't be removed until the program is unloaded.
f0271688 85.PP
54513c00 86eBPF programs can be attached to different events.
9ab03361 87These events can be the arrival of network packets, tracing
953d2673
MK
88events, classification events by network queueing disciplines
89(for eBPF programs attached to a
9ab03361
MK
90.BR tc (8)
91classifier), and other types that may be added in the future.
54513c00 92A new event triggers execution of the eBPF program, which
f774ddf1 93may store information about the event in eBPF maps.
54513c00 94Beyond storing data, eBPF programs may call a fixed set of
896388c8 95in-kernel helper functions.
f0271688 96.PP
f774ddf1 97The same eBPF program can be attached to multiple events and different
cc42e9b8 98eBPF programs can access the same map:
f0271688 99.PP
1148d934 100.in +4n
f0271688 101.EX
cd579c3f
MK
102tracing tracing tracing packet packet packet
103event A event B event C on eth0 on eth1 on eth2
9ca13180 104 | | | | | \(ha
cd579c3f 105 | | | | v |
d064d41a 106 \-\-> tracing <\-\- tracing socket tc ingress tc egress
cd579c3f
MK
107 prog_1 prog_2 prog_3 classifier action
108 | | | | prog_4 prog_5
d064d41a
MK
109 |\-\-\- \-\-\-\-\-| |\-\-\-\-\-\-| map_3 | |
110 map_1 map_2 \-\-| map_4 |\-\-
f0271688 111.EE
1148d934 112.in
fcd1bee3 113.\"
5988a659 114.SS Arguments
842ee010 115The operation to be performed by the
1148d934 116.BR bpf ()
842ee010 117system call is determined by the
266791fb 118.I cmd
f774ddf1
MK
119argument.
120Each operation takes an accompanying argument,
121provided via
122.IR attr ,
123which is a pointer to a union of type
266791fb 124.I bpf_attr
f774ddf1 125(see below).
cad2ee71 126The unused fields and padding must be zeroed out before the call.
f774ddf1
MK
127The
128.I size
129argument is the size of the union pointed to by
130.IR attr .
efeece04 131.PP
f774ddf1 132The value provided in
266791fb 133.I cmd
f774ddf1 134is one of the following:
cc7ac21d
AS
135.TP
136.B BPF_MAP_CREATE
953d2673 137Create a map and return a file descriptor that refers to the map.
0f166ce1
MK
138The close-on-exec file descriptor flag (see
139.BR fcntl (2))
140is automatically enabled for the new file descriptor.
cc7ac21d
AS
141.TP
142.B BPF_MAP_LOOKUP_ELEM
842ee010 143Look up an element by key in a specified map and return its value.
cc7ac21d
AS
144.TP
145.B BPF_MAP_UPDATE_ELEM
842ee010 146Create or update an element (key/value pair) in a specified map.
cc7ac21d
AS
147.TP
148.B BPF_MAP_DELETE_ELEM
842ee010 149Look up and delete an element by key in a specified map.
cc7ac21d
AS
150.TP
151.B BPF_MAP_GET_NEXT_KEY
842ee010
MK
152Look up an element by key in a specified map and return the key
153of the next element.
cc7ac21d
AS
154.TP
155.B BPF_PROG_LOAD
9ab03361
MK
156Verify and load an eBPF program,
157returning a new file descriptor associated with the program.
0f166ce1
MK
158The close-on-exec file descriptor flag (see
159.BR fcntl (2))
160is automatically enabled for the new file descriptor.
f0271688 161.IP
842ee010
MK
162The
163.I bpf_attr
164union consists of various anonymous structures that are used by different
165.BR bpf ()
166commands:
b3b5781e 167.PP
842ee010 168.in +4n
f0271688 169.EX
cc7ac21d 170union bpf_attr {
842ee010 171 struct { /* Used by BPF_MAP_CREATE */
115b4e0e
AC
172 __u32 map_type;
173 __u32 key_size; /* size of key in bytes */
174 __u32 value_size; /* size of value in bytes */
175 __u32 max_entries; /* maximum number of entries
842ee010 176 in a map */
cc7ac21d
AS
177 };
178
f774ddf1
MK
179 struct { /* Used by BPF_MAP_*_ELEM and BPF_MAP_GET_NEXT_KEY
180 commands */
115b4e0e 181 __u32 map_fd;
842ee010 182 __aligned_u64 key;
cc7ac21d
AS
183 union {
184 __aligned_u64 value;
185 __aligned_u64 next_key;
186 };
115b4e0e 187 __u64 flags;
cc7ac21d
AS
188 };
189
842ee010 190 struct { /* Used by BPF_PROG_LOAD */
115b4e0e
AC
191 __u32 prog_type;
192 __u32 insn_cnt;
b957f81f
AC
193 __aligned_u64 insns; /* \[aq]const struct bpf_insn *\[aq] */
194 __aligned_u64 license; /* \[aq]const char *\[aq] */
115b4e0e
AC
195 __u32 log_level; /* verbosity level of verifier */
196 __u32 log_size; /* size of user buffer */
b957f81f 197 __aligned_u64 log_buf; /* user supplied \[aq]char *\[aq]
842ee010 198 buffer */
115b4e0e 199 __u32 kern_version;
9ab03361
MK
200 /* checked when prog_type=kprobe
201 (since Linux 4.1) */
202.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
cc7ac21d
AS
203 };
204} __attribute__((aligned(8)));
f0271688 205.EE
842ee010 206.in
fcd1bee3 207.\"
ce5db3fc 208.SS eBPF maps
8440f771
MK
209Maps are a generic data structure for storage of different types of data.
210They allow sharing of data between eBPF kernel programs,
211and also between kernel and user-space applications.
f0271688 212.PP
16152abb 213Each map type has the following attributes:
22356d97 214.IP \(bu 3
16152abb 215type
22356d97 216.IP \(bu
79e2beef 217maximum number of elements
22356d97 218.IP \(bu
16152abb 219key size in bytes
22356d97 220.IP \(bu
16152abb 221value size in bytes
16152abb 222.PP
842ee010
MK
223The following wrapper functions demonstrate how various
224.BR bpf ()
225commands can be used to access the maps.
9a5215bf 226The functions use the
266791fb 227.I cmd
cc7ac21d 228argument to invoke different operations.
ce5db3fc 229.TP
842ee010
MK
230.B BPF_MAP_CREATE
231The
cc7ac21d 232.B BPF_MAP_CREATE
5415d504
MK
233command creates a new map,
234returning a new file descriptor that refers to the map.
f0271688 235.IP
842ee010 236.in +4n
f0271688 237.EX
842ee010 238int
953d2673
MK
239bpf_create_map(enum bpf_map_type map_type,
240 unsigned int key_size,
241 unsigned int value_size,
242 unsigned int max_entries)
cc7ac21d
AS
243{
244 union bpf_attr attr = {
953d2673
MK
245 .map_type = map_type,
246 .key_size = key_size,
247 .value_size = value_size,
cc7ac21d
AS
248 .max_entries = max_entries
249 };
250
251 return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
252}
f0271688 253.EE
842ee010 254.in
f0271688 255.IP
842ee010
MK
256The new map has the type specified by
257.IR map_type ,
258and attributes as specified in
1148d934
MK
259.IR key_size ,
260.IR value_size ,
842ee010 261and
1148d934 262.IR max_entries .
46a4949b 263On success, this operation returns a file descriptor.
9a5215bf 264On error, \-1 is returned and
cc7ac21d 265.I errno
1148d934
MK
266is set to
267.BR EINVAL ,
268.BR EPERM ,
269or
270.BR ENOMEM .
f0271688 271.IP
953d2673 272The
cc7ac21d
AS
273.I key_size
274and
275.I value_size
953d2673
MK
276attributes will be used by the verifier during program loading
277to check that the program is calling
1148d934
MK
278.BR bpf_map_*_elem ()
279helper functions with a correctly initialized
cc7ac21d 280.I key
f774ddf1 281and to check that the program doesn't access the map element
cc7ac21d
AS
282.I value
283beyond the specified
16152abb 284.IR value_size .
842ee010 285For example, when a map is created with a
266791fb 286.I key_size
f774ddf1 287of 8 and the eBPF program calls
f0271688 288.IP
1148d934 289.in +4n
f0271688 290.EX
d064d41a 291bpf_map_lookup_elem(map_fd, fp \- 4)
f0271688 292.EE
1148d934 293.in
f0271688 294.IP
cc7ac21d 295the program will be rejected,
1148d934 296since the in-kernel helper function
f0271688 297.IP
c6ba384b 298.in +4n
f0271688 299.EX
c6ba384b 300bpf_map_lookup_elem(map_fd, void *key)
f0271688 301.EE
c6ba384b 302.in
f0271688 303.IP
46a4949b
MK
304expects to read 8 bytes from the location pointed to by
305.IR key ,
306but the
cd415e73 307.I fp\ \-\ 4
46a4949b
MK
308(where
309.I fp
310is the top of the stack)
1148d934 311starting address will cause out-of-bounds stack access.
f0271688 312.IP
842ee010
MK
313Similarly, when a map is created with a
314.I value_size
f774ddf1 315of 1 and the eBPF program contains
f0271688 316.IP
1148d934 317.in +4n
f0271688 318.EX
cc7ac21d 319value = bpf_map_lookup_elem(...);
115b4e0e 320*(u32 *) value = 1;
f0271688 321.EE
1148d934 322.in
f0271688 323.IP
cc7ac21d
AS
324the program will be rejected, since it accesses the
325.I value
1148d934
MK
326pointer beyond the specified 1 byte
327.I value_size
328limit.
f0271688 329.IP
f774ddf1
MK
330Currently, the following values are supported for
331.IR map_type :
f0271688 332.IP
1148d934 333.in +4n
f0271688 334.EX
cc7ac21d 335enum bpf_map_type {
ce5db3fc 336 BPF_MAP_TYPE_UNSPEC, /* Reserve 0 as invalid map type */
842ee010
MK
337 BPF_MAP_TYPE_HASH,
338 BPF_MAP_TYPE_ARRAY,
5415d504 339 BPF_MAP_TYPE_PROG_ARRAY,
1b7adc7c
NB
340 BPF_MAP_TYPE_PERF_EVENT_ARRAY,
341 BPF_MAP_TYPE_PERCPU_HASH,
342 BPF_MAP_TYPE_PERCPU_ARRAY,
343 BPF_MAP_TYPE_STACK_TRACE,
344 BPF_MAP_TYPE_CGROUP_ARRAY,
345 BPF_MAP_TYPE_LRU_HASH,
346 BPF_MAP_TYPE_LRU_PERCPU_HASH,
347 BPF_MAP_TYPE_LPM_TRIE,
348 BPF_MAP_TYPE_ARRAY_OF_MAPS,
349 BPF_MAP_TYPE_HASH_OF_MAPS,
350 BPF_MAP_TYPE_DEVMAP,
351 BPF_MAP_TYPE_SOCKMAP,
352 BPF_MAP_TYPE_CPUMAP,
0e861952
PW
353 BPF_MAP_TYPE_XSKMAP,
354 BPF_MAP_TYPE_SOCKHASH,
355 BPF_MAP_TYPE_CGROUP_STORAGE,
356 BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
357 BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
358 BPF_MAP_TYPE_QUEUE,
359 BPF_MAP_TYPE_STACK,
360 /* See /usr/include/linux/bpf.h for the full list. */
cc7ac21d 361};
f0271688 362.EE
1148d934 363.in
f0271688 364.IP
cc7ac21d 365.I map_type
9a5215bf 366selects one of the available map implementations in the kernel.
f774ddf1 367.\" FIXME We need an explanation of why one might choose each of
b913d165 368.\" these map implementations
16152abb 369For all map types,
f774ddf1
MK
370eBPF programs access maps with the same
371.BR bpf_map_lookup_elem ()
372and
1148d934 373.BR bpf_map_update_elem ()
cc7ac21d 374helper functions.
ce5db3fc 375Further details of the various map types are given below.
cc7ac21d
AS
376.TP
377.B BPF_MAP_LOOKUP_ELEM
842ee010
MK
378The
379.B BPF_MAP_LOOKUP_ELEM
380command looks up an element with a given
381.I key
382in the map referred to by the file descriptor
383.IR fd .
f0271688 384.IP
842ee010 385.in +4n
f0271688 386.EX
842ee010 387int
953d2673 388bpf_lookup_elem(int fd, const void *key, void *value)
cc7ac21d
AS
389{
390 union bpf_attr attr = {
391 .map_fd = fd,
953d2673
MK
392 .key = ptr_to_u64(key),
393 .value = ptr_to_u64(value),
cc7ac21d
AS
394 };
395
396 return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
397}
f0271688 398.EE
842ee010 399.in
f0271688 400.IP
842ee010
MK
401If an element is found,
402the operation returns zero and stores the element's value into
5415d504
MK
403.IR value ,
404which must point to a buffer of
405.I value_size
406bytes.
f0271688 407.IP
842ee010 408If no element is found, the operation returns \-1 and sets
cc7ac21d 409.I errno
1148d934
MK
410to
411.BR ENOENT .
cc7ac21d
AS
412.TP
413.B BPF_MAP_UPDATE_ELEM
842ee010
MK
414The
415.B BPF_MAP_UPDATE_ELEM
416command
417creates or updates an element with a given
418.I key/value
419in the map referred to by the file descriptor
420.IR fd .
f0271688 421.IP
842ee010 422.in +4n
f0271688 423.EX
842ee010 424int
953d2673
MK
425bpf_update_elem(int fd, const void *key, const void *value,
426 uint64_t flags)
cc7ac21d
AS
427{
428 union bpf_attr attr = {
429 .map_fd = fd,
953d2673
MK
430 .key = ptr_to_u64(key),
431 .value = ptr_to_u64(value),
432 .flags = flags,
cc7ac21d
AS
433 };
434
435 return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
436}
f0271688 437.EE
842ee010 438.in
f0271688 439.IP
842ee010 440The
cc7ac21d 441.I flags
842ee010
MK
442argument should be specified as one of the following:
443.RS
444.TP
445.B BPF_ANY
446Create a new element or update an existing element.
447.TP
448.B BPF_NOEXIST
449Create a new element only if it did not exist.
450.TP
451.B BPF_EXIST
452Update an existing element.
453.RE
454.IP
455On success, the operation returns zero.
cc7ac21d
AS
456On error, \-1 is returned and
457.I errno
1148d934
MK
458is set to
459.BR EINVAL ,
460.BR EPERM ,
461.BR ENOMEM ,
462or
463.BR E2BIG .
cc7ac21d 464.B E2BIG
842ee010 465indicates that the number of elements in the map reached the
cc7ac21d
AS
466.I max_entries
467limit specified at map creation time.
468.B EEXIST
842ee010
MK
469will be returned if
470.I flags
471specifies
472.B BPF_NOEXIST
473and the element with
1148d934
MK
474.I key
475already exists in the map.
cc7ac21d 476.B ENOENT
953d2673 477will be returned if
842ee010
MK
478.I flags
479specifies
480.B BPF_EXIST
481and the element with
1148d934
MK
482.I key
483doesn't exist in the map.
cc7ac21d
AS
484.TP
485.B BPF_MAP_DELETE_ELEM
842ee010
MK
486The
487.B BPF_MAP_DELETE_ELEM
488command
96ed2f3f 489deletes the element whose key is
842ee010
MK
490.I key
491from the map referred to by the file descriptor
492.IR fd .
f0271688 493.IP
842ee010 494.in +4n
f0271688 495.EX
842ee010 496int
953d2673 497bpf_delete_elem(int fd, const void *key)
cc7ac21d
AS
498{
499 union bpf_attr attr = {
500 .map_fd = fd,
953d2673 501 .key = ptr_to_u64(key),
cc7ac21d
AS
502 };
503
504 return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
505}
f0271688 506.EE
842ee010 507.in
f0271688 508.IP
842ee010
MK
509On success, zero is returned.
510If the element is not found, \-1 is returned and
cc7ac21d 511.I errno
842ee010 512is set to
1148d934 513.BR ENOENT .
cc7ac21d
AS
514.TP
515.B BPF_MAP_GET_NEXT_KEY
842ee010
MK
516The
517.B BPF_MAP_GET_NEXT_KEY
518command looks up an element by
519.I key
520in the map referred to by the file descriptor
266791fb 521.I fd
842ee010
MK
522and sets the
523.I next_key
524pointer to the key of the next element.
f0271688 525.IP
842ee010 526.in +4n
f0271688 527.EX
842ee010 528int
953d2673 529bpf_get_next_key(int fd, const void *key, void *next_key)
cc7ac21d
AS
530{
531 union bpf_attr attr = {
953d2673
MK
532 .map_fd = fd,
533 .key = ptr_to_u64(key),
cc7ac21d
AS
534 .next_key = ptr_to_u64(next_key),
535 };
536
537 return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
538}
f0271688 539.EE
842ee010 540.in
f0271688 541.IP
5415d504
MK
542If
543.I key
544is found, the operation returns zero and sets the
545.I next_key
546pointer to the key of the next element.
cc7ac21d
AS
547If
548.I key
842ee010 549is not found, the operation returns zero and sets the
cc7ac21d
AS
550.I next_key
551pointer to the key of the first element.
552If
553.I key
842ee010 554is the last element, \-1 is returned and
cc7ac21d 555.I errno
842ee010 556is set to
1148d934 557.BR ENOENT .
9a5215bf 558Other possible
cc7ac21d 559.I errno
1148d934
MK
560values are
561.BR ENOMEM ,
562.BR EFAULT ,
563.BR EPERM ,
564and
565.BR EINVAL .
cc7ac21d
AS
566This method can be used to iterate over all elements in the map.
567.TP
568.B close(map_fd)
842ee010 569Delete the map referred to by the file descriptor
1148d934 570.IR map_fd .
842ee010 571When the user-space program that created a map exits, all maps will
ce5db3fc
MK
572be deleted automatically (but see NOTES).
573.\"
574.SS eBPF map types
575The following map types are supported:
576.TP
577.B BPF_MAP_TYPE_HASH
578.\" commit 0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475
ce5db3fc
MK
579Hash-table maps have the following characteristics:
580.RS
22356d97 581.IP \(bu 3
ce5db3fc
MK
582Maps are created and destroyed by user-space programs.
583Both user-space and eBPF programs
46a4949b 584can perform lookup, update, and delete operations.
22356d97 585.IP \(bu
ce5db3fc 586The kernel takes care of allocating and freeing key/value pairs.
22356d97 587.IP \(bu
ce5db3fc
MK
588The
589.BR map_update_elem ()
998f951b 590helper will fail to insert a new element when the
ce5db3fc
MK
591.I max_entries
592limit is reached.
593(This ensures that eBPF programs cannot exhaust memory.)
22356d97 594.IP \(bu
ce5db3fc
MK
595.BR map_update_elem ()
596replaces existing elements atomically.
597.RE
598.IP
953d2673 599Hash-table maps are
ce5db3fc
MK
600optimized for speed of lookup.
601.TP
602.B BPF_MAP_TYPE_ARRAY
603.\" commit 28fbcfa08d8ed7c5a50d41a0433aad222835e8e3
ce5db3fc
MK
604Array maps have the following characteristics:
605.RS
22356d97 606.IP \(bu 3
ce5db3fc 607Optimized for fastest possible lookup.
46a4949b 608In the future the verifier/JIT compiler
ce5db3fc
MK
609may recognize lookup() operations that employ a constant key
610and optimize them into a constant pointer.
611It is possible to optimize a non-constant
612key into direct pointer arithmetic as well, since pointers and
613.I value_size
614are constant for the life of the eBPF program.
615In other words,
616.BR array_map_lookup_elem ()
617may be 'inlined' by the verifier/JIT compiler
618while preserving concurrent access to this map from user space.
22356d97 619.IP \(bu
ce5db3fc 620All array elements are pre-allocated and zero-initialized at init time.
22356d97 621.IP \(bu
ce5db3fc 622The key is an array index, and must be exactly four bytes.
22356d97 623.IP \(bu
ce5db3fc
MK
624.BR map_delete_elem ()
625fails with the error
626.BR EINVAL ,
627since elements cannot be deleted.
22356d97 628.IP \(bu
ce5db3fc 629.BR map_update_elem ()
953d2673
MK
630replaces elements in a
631.B nonatomic
632fashion;
cd579c3f
MK
633for atomic updates, a hash-table map should be used instead.
634There is however one special case that can also be used with arrays:
635the atomic built-in
266791fb 636.B __sync_fetch_and_add()
cd579c3f
MK
637can be used on 32-bit and 64-bit atomic counters.
638For example, it can be
9a818ddd
DB
639applied on the whole value itself if it represents a single counter,
640or in case of a structure containing multiple counters, it could be
cd579c3f
MK
641used on individual counters.
642This is quite often useful for aggregation and accounting of events.
ce5db3fc
MK
643.RE
644.IP
645Among the uses for array maps are the following:
646.RS
22356d97 647.IP \(bu 3
ce5db3fc
MK
648As "global" eBPF variables: an array of 1 element whose key is (index) 0
649and where the value is a collection of 'global' variables which
650eBPF programs can use to keep state between events.
22356d97 651.IP \(bu
ce5db3fc 652Aggregation of tracing events into a fixed set of buckets.
22356d97 653.IP \(bu
9a818ddd
DB
654Accounting of networking events, for example, number of packets and packet
655sizes.
ce5db3fc
MK
656.RE
657.TP
658.BR BPF_MAP_TYPE_PROG_ARRAY " (since Linux 4.2)"
cd579c3f
MK
659A program array map is a special kind of array map whose map values
660contain only file descriptors referring to other eBPF programs.
661Thus, both the
662.I key_size
663and
664.I value_size
665must be exactly four bytes.
9a818ddd 666This map is used in conjunction with the
cd579c3f 667.BR bpf_tail_call ()
9a818ddd 668helper.
f0271688 669.IP
9a818ddd
DB
670This means that an eBPF program with a program array map attached to it
671can call from kernel side into
f0271688 672.IP
9a818ddd 673.in +4n
f0271688 674.EX
05f10213
MK
675void bpf_tail_call(void *context, void *prog_map,
676 unsigned int index);
f0271688 677.EE
9a818ddd 678.in
f0271688 679.IP
9a818ddd 680and therefore replace its own program flow with the one from the program
cd579c3f
MK
681at the given program array slot, if present.
682This can be regarded as a kind of jump table to a different eBPF program.
683The invoked program will then reuse the same stack.
684When a jump into the new program has been performed,
685it won't return to the old program anymore.
f0271688 686.IP
aabe0499
MK
687If no eBPF program is found at the given index of the program array
688(because the map slot doesn't contain a valid program file descriptor,
689the specified lookup index/key is out of bounds,
690or the limit of 32
691.\" MAX_TAIL_CALL_CNT
692nested calls has been exceeded),
9a818ddd
DB
693execution continues with the current eBPF program.
694This can be used as a fall-through for default cases.
f0271688 695.IP
9a818ddd 696A program array map is useful, for example, in tracing or networking, to
cd579c3f
MK
697handle individual system calls or protocols in their own subprograms and
698use their identifiers as an individual map index.
699This approach may result in performance benefits,
700and also makes it possible to overcome the maximum
701instruction limit of a single eBPF program.
702In dynamic environments,
703a user-space daemon might atomically replace individual subprograms
704at run-time with newer versions to alter overall program behavior,
705for instance, if global policies change.
ce5db3fc
MK
706.\"
707.SS eBPF programs
842ee010
MK
708The
709.B BPF_PROG_LOAD
54513c00 710command is used to load an eBPF program into the kernel.
9ab03361 711The return value for this command is a new file descriptor associated
ce5db3fc 712with this eBPF program.
f0271688 713.PP
842ee010 714.in +4n
f0271688 715.EX
cc7ac21d
AS
716char bpf_log_buf[LOG_BUF_SIZE];
717
842ee010 718int
953d2673 719bpf_prog_load(enum bpf_prog_type type,
842ee010
MK
720 const struct bpf_insn *insns, int insn_cnt,
721 const char *license)
cc7ac21d
AS
722{
723 union bpf_attr attr = {
953d2673
MK
724 .prog_type = type,
725 .insns = ptr_to_u64(insns),
726 .insn_cnt = insn_cnt,
727 .license = ptr_to_u64(license),
728 .log_buf = ptr_to_u64(bpf_log_buf),
729 .log_size = LOG_BUF_SIZE,
cc7ac21d
AS
730 .log_level = 1,
731 };
732
733 return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
734}
f0271688 735.EE
842ee010 736.in
f0271688 737.PP
1148d934 738.I prog_type
cc7ac21d 739is one of the available program types:
f0271688 740.IP
1148d934 741.in +4n
f0271688 742.EX
cc7ac21d 743enum bpf_prog_type {
f774ddf1
MK
744 BPF_PROG_TYPE_UNSPEC, /* Reserve 0 as invalid
745 program type */
ce5db3fc
MK
746 BPF_PROG_TYPE_SOCKET_FILTER,
747 BPF_PROG_TYPE_KPROBE,
748 BPF_PROG_TYPE_SCHED_CLS,
749 BPF_PROG_TYPE_SCHED_ACT,
0e861952
PW
750 BPF_PROG_TYPE_TRACEPOINT,
751 BPF_PROG_TYPE_XDP,
752 BPF_PROG_TYPE_PERF_EVENT,
753 BPF_PROG_TYPE_CGROUP_SKB,
754 BPF_PROG_TYPE_CGROUP_SOCK,
755 BPF_PROG_TYPE_LWT_IN,
756 BPF_PROG_TYPE_LWT_OUT,
757 BPF_PROG_TYPE_LWT_XMIT,
758 BPF_PROG_TYPE_SOCK_OPS,
759 BPF_PROG_TYPE_SK_SKB,
760 BPF_PROG_TYPE_CGROUP_DEVICE,
761 BPF_PROG_TYPE_SK_MSG,
762 BPF_PROG_TYPE_RAW_TRACEPOINT,
763 BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
764 BPF_PROG_TYPE_LWT_SEG6LOCAL,
765 BPF_PROG_TYPE_LIRC_MODE2,
766 BPF_PROG_TYPE_SK_REUSEPORT,
767 BPF_PROG_TYPE_FLOW_DISSECTOR,
768 /* See /usr/include/linux/bpf.h for the full list. */
cc7ac21d 769};
f0271688 770.EE
1148d934 771.in
f0271688 772.PP
ce5db3fc 773For further details of eBPF program types, see below.
f0271688 774.PP
ce5db3fc 775The remaining fields of
842ee010
MK
776.I bpf_attr
777are set as follows:
22356d97 778.IP \(bu 3
1148d934 779.I insns
842ee010 780is an array of
1148d934
MK
781.I "struct bpf_insn"
782instructions.
22356d97 783.IP \(bu
1148d934 784.I insn_cnt
842ee010
MK
785is the number of instructions in the program referred to by
786.IR insns .
22356d97 787.IP \(bu
1148d934 788.I license
842ee010 789is a license string, which must be GPL compatible to call helper functions
1148d934
MK
790marked
791.IR gpl_only .
fcd1bee3 792(The licensing rules are the same as for kernel modules,
9a818ddd 793so dual licenses, such as "Dual BSD/GPL", may also be used.)
22356d97 794.IP \(bu
1148d934 795.I log_buf
842ee010
MK
796is a pointer to a caller-allocated buffer in which the in-kernel
797verifier can store the verification log.
9a5215bf 798This log is a multi-line string that can be checked by
cc7ac21d 799the program author in order to understand how the verifier came to
953d2673 800the conclusion that the eBPF program is unsafe.
cc7ac21d 801The format of the output can change at any time as the verifier evolves.
22356d97 802.IP \(bu
1148d934 803.I log_size
842ee010 804is the size of the buffer pointed to by
029b613f 805.IR log_buf .
9a5215bf 806If the size of the buffer is not large enough to store all
cc7ac21d
AS
807verifier messages, \-1 is returned and
808.I errno
1148d934
MK
809is set to
810.BR ENOSPC .
22356d97 811.IP \(bu
1148d934 812.I log_level
9a5215bf 813is the verbosity level of the verifier.
fcd1bee3
MK
814A value of zero means that the verifier will not provide a log;
815in this case,
816.I log_buf
817must be a NULL pointer, and
818.I log_size
819must be zero.
f0271688 820.PP
ce5db3fc
MK
821Applying
822.BR close (2)
823to the file descriptor returned by
824.B BPF_PROG_LOAD
825will unload the eBPF program (but see NOTES).
f0271688 826.PP
54513c00
MK
827Maps are accessible from eBPF programs and are used to exchange data between
828eBPF programs and between eBPF programs and user-space programs.
5415d504
MK
829For example,
830eBPF programs can process various events (like kprobe, packets) and
831store their data into a map,
832and user-space programs can then fetch data from the map.
833Conversely, user-space programs can use a map as a configuration mechanism,
834populating the map with values checked by the eBPF program,
835which then modifies its behavior on the fly according to those values.
953d2673
MK
836.\"
837.\"
ce5db3fc 838.SS eBPF program types
953d2673
MK
839The eBPF program type
840.RI ( prog_type )
fcd1bee3 841determines the subset of kernel helper functions that the program
953d2673 842may call.
fcd1bee3 843The program type also determines the program input (context)\(emthe
953d2673
MK
844format of
845.I "struct bpf_context"
ce5db3fc 846(which is the data blob passed into the eBPF program as the first argument).
0fc33df7 847.\"
30ea59e7 848.\" FIXME
24493e9b 849.\" Somewhere in this page we need a general introduction to the
0fc33df7
MK
850.\" bpf_context. For example, how does a BPF program access the
851.\" context?
f0271688 852.PP
953d2673
MK
853For example, a tracing program does not have the exact same
854subset of helper functions as a socket filter program
855(though they may have some helpers in common).
856Similarly,
857the input (context) for a tracing program is a set of register values,
858while for a socket filter it is a network packet.
f0271688 859.PP
ce5db3fc
MK
860The set of functions available to eBPF programs of a given type may increase
861in the future.
f0271688 862.PP
ce5db3fc
MK
863The following program types are supported:
864.TP
865.BR BPF_PROG_TYPE_SOCKET_FILTER " (since Linux 3.19)"
866Currently, the set of functions for
867.B BPF_PROG_TYPE_SOCKET_FILTER
868is:
f0271688 869.IP
1148d934 870.in +4n
f0271688 871.EX
ce5db3fc
MK
872bpf_map_lookup_elem(map_fd, void *key)
873 /* look up key in a map_fd */
874bpf_map_update_elem(map_fd, void *key, void *value)
875 /* update key/value */
876bpf_map_delete_elem(map_fd, void *key)
877 /* delete key in a map_fd */
f0271688 878.EE
1148d934 879.in
f0271688 880.IP
ce5db3fc
MK
881The
882.I bpf_context
883argument is a pointer to a
b87d8ba6 884.IR "struct __sk_buff" .
953d2673 885.\" FIXME: We need some text here to explain how the program
b913d165
MK
886.\" accesses __sk_buff.
887.\" See 'struct __sk_buff' and commit 9bac3d6d548e5
888.\"
b87d8ba6 889.\" Alexei commented:
b913d165
MK
890.\" Actually now in case of SOCKET_FILTER, SCHED_CLS, SCHED_ACT
891.\" the program can now access skb fields.
ce5db3fc
MK
892.\"
893.TP
266791fb 894.BR BPF_PROG_TYPE_KPROBE " (since Linux 4.1)"
ce5db3fc
MK
895.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
896[To be documented]
897.\" FIXME Document this program type
898.\" Describe allowed helper functions for this program type
899.\" Describe bpf_context for this program type
b913d165 900.\"
ce5db3fc
MK
901.\" FIXME We need text here to describe 'kern_version'
902.TP
266791fb 903.BR BPF_PROG_TYPE_SCHED_CLS " (since Linux 4.1)"
ce5db3fc
MK
904.\" commit 96be4325f443dbbfeb37d2a157675ac0736531a1
905.\" commit e2e9b6541dd4b31848079da80fe2253daaafb549
906[To be documented]
907.\" FIXME Document this program type
908.\" Describe allowed helper functions for this program type
909.\" Describe bpf_context for this program type
910.TP
266791fb 911.BR BPF_PROG_TYPE_SCHED_ACT " (since Linux 4.1)"
ce5db3fc
MK
912.\" commit 94caee8c312d96522bcdae88791aaa9ebcd5f22c
913.\" commit a8cb5f556b567974d75ea29c15181c445c541b1f
914[To be documented]
915.\" FIXME Document this program type
916.\" Describe allowed helper functions for this program type
917.\" Describe bpf_context for this program type
918.SS Events
919Once a program is loaded, it can be attached to an event.
920Various kernel subsystems have different ways to do so.
f0271688 921.PP
ce5db3fc
MK
922Since Linux 3.19,
923.\" commit 89aa075832b0da4402acebd698d0411dcc82d03e
924the following call will attach the program
cc7ac21d 925.I prog_fd
842ee010
MK
926to the socket
927.IR sockfd ,
ce5db3fc
MK
928which was created by an earlier call to
929.BR socket (2):
f0271688 930.PP
1148d934 931.in +4n
f0271688 932.EX
ce5db3fc
MK
933setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_BPF,
934 &prog_fd, sizeof(prog_fd));
f0271688 935.EE
1148d934 936.in
f0271688 937.PP
ce5db3fc
MK
938Since Linux 4.1,
939.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
940the following call may be used to attach
941the eBPF program referred to by the file descriptor
cc7ac21d 942.I prog_fd
ce5db3fc
MK
943to a perf event file descriptor,
944.IR event_fd ,
945that was created by a previous call to
946.BR perf_event_open (2):
efeece04 947.PP
ce5db3fc 948.in +4n
b76974c1 949.EX
ce5db3fc 950ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
b76974c1 951.EE
ce5db3fc
MK
952.in
953.\"
ce5db3fc 954.\"
cc7ac21d
AS
955.SH RETURN VALUE
956For a successful call, the return value depends on the operation:
957.TP
958.B BPF_MAP_CREATE
ce5db3fc 959The new file descriptor associated with the eBPF map.
cc7ac21d
AS
960.TP
961.B BPF_PROG_LOAD
54513c00 962The new file descriptor associated with the eBPF program.
cc7ac21d
AS
963.TP
964All other commands
965Zero.
966.PP
967On error, \-1 is returned, and
968.I errno
f6a4078b 969is set to indicate the error.
cc7ac21d
AS
970.SH ERRORS
971.TP
266791fb 972.B E2BIG
6cedbd4c
MK
973The eBPF program is too large or a map reached the
974.I max_entries
975limit (maximum number of elements).
cc7ac21d 976.TP
266791fb 977.B EACCES
6cedbd4c 978For
266791fb 979.BR BPF_PROG_LOAD ,
6cedbd4c
MK
980even though all program instructions are valid, the program has been
981rejected because it was deemed unsafe.
982This can happen because the program may have
983accessed a disallowed memory region or an uninitialized stack/register or
984because the function constraints don't match the actual types or because
985there was a misaligned memory access.
986In this case, it is recommended to call
987.BR bpf ()
988again with
989.I log_level = 1
990and examine
991.I log_buf
992for the specific reason provided by the verifier.
cc7ac21d
AS
993.TP
994.B EBADF
995.I fd
7d6bfe72 996is not an open file descriptor.
cc7ac21d
AS
997.TP
998.B EFAULT
1148d934
MK
999One of the pointers
1000.RI ( key
cc7ac21d
AS
1001or
1002.I value
1003or
1004.I log_buf
1005or
1148d934
MK
1006.IR insns )
1007is outside the accessible address space.
cc7ac21d
AS
1008.TP
1009.B EINVAL
1010The value specified in
1011.I cmd
1012is not recognized by this kernel.
1013.TP
1014.B EINVAL
1015For
1016.BR BPF_MAP_CREATE ,
1017either
1018.I map_type
1019or attributes are invalid.
1020.TP
1021.B EINVAL
1022For
266791fb 1023.B BPF_MAP_*_ELEM
cc7ac21d 1024commands,
1148d934
MK
1025some of the fields of
1026.I "union bpf_attr"
1027that are not used by this command
cc7ac21d
AS
1028are not set to zero.
1029.TP
1030.B EINVAL
1031For
266791fb 1032.BR BPF_PROG_LOAD ,
9a5215bf 1033indicates an attempt to load an invalid program.
953d2673
MK
1034eBPF programs can be deemed
1035invalid due to unrecognized instructions, the use of reserved fields, jumps
cc7ac21d
AS
1036out of range, infinite loops or calls of unknown functions.
1037.TP
266791fb 1038.B ENOENT
cc7ac21d
AS
1039For
1040.B BPF_MAP_LOOKUP_ELEM
1041or
16152abb 1042.BR BPF_MAP_DELETE_ELEM ,
cc7ac21d
AS
1043indicates that the element with the given
1044.I key
1045was not found.
1046.TP
6cedbd4c
MK
1047.B ENOMEM
1048Cannot allocate sufficient memory.
1049.TP
1050.B EPERM
1051The call was made without sufficient privilege
1052(without the
1053.B CAP_SYS_ADMIN
1054capability).
5f920e10
MK
1055.SH VERSIONS
1056The
1057.BR bpf ()
1058system call first appeared in Linux 3.18.
3113c7f3 1059.SH STANDARDS
8dbf8f2d
MK
1060The
1061.BR bpf ()
1062system call is Linux-specific.
cc7ac21d 1063.SH NOTES
821bf91c 1064Prior to Linux 4.4, all
842ee010
MK
1065.BR bpf ()
1066commands require the caller to have the
cc7ac21d 1067.B CAP_SYS_ADMIN
35732aa7
MK
1068capability.
1069From Linux 4.4 onwards,
1070.\" commit 1be7f75d1668d6296b80bf35dcf6762393530afc
1071an unprivileged user may create limited programs of type
1ae6b2c7 1072.B BPF_PROG_TYPE_SOCKET_FILTER
35732aa7
MK
1073and associated maps.
1074However, they may not store kernel pointers within
821bf91c 1075the maps and are presently limited to the following helper functions:
f7d706ba
MK
1076.\" [Linux 5.6] mtk: The list of available functions is, I think, governed
1077.\" by the check in net/core/filter.c::bpf_base_func_proto().
22356d97 1078.IP \(bu 3
821bf91c
RP
1079get_random
1080.PD 0
22356d97 1081.IP \(bu
821bf91c 1082get_smp_processor_id
22356d97 1083.IP \(bu
821bf91c 1084tail_call
22356d97 1085.IP \(bu
821bf91c 1086ktime_get_ns
22356d97 1087.PD
821bf91c 1088.PP
c53d4fc1 1089Unprivileged access may be blocked by writing the value 1 to the file
821bf91c 1090.IR /proc/sys/kernel/unprivileged_bpf_disabled .
f0271688 1091.PP
f774ddf1
MK
1092eBPF objects (maps and programs) can be shared between processes.
1093For example, after
1094.BR fork (2),
1095the child inherits file descriptors referring to the same eBPF objects.
1096In addition, file descriptors referring to eBPF objects can be
1097transferred over UNIX domain sockets.
1098File descriptors referring to eBPF objects can be duplicated
1099in the usual way, using
1100.BR dup (2)
1101and similar calls.
1102An eBPF object is deallocated only after all file descriptors
1103referring to the object have been closed.
f0271688 1104.PP
4fba111e
MK
1105eBPF programs can be written in a restricted C that is compiled (using the
1106.B clang
953d2673
MK
1107compiler) into eBPF bytecode.
1108Various features are omitted from this restricted C, such as loops,
f774ddf1 1109global variables, variadic functions, floating-point numbers,
953d2673 1110and passing structures as function arguments.
4fba111e
MK
1111Some examples can be found in the
1112.I samples/bpf/*_kern.c
1113files in the kernel source tree.
ce5db3fc
MK
1114.\" There are also examples for the tc classifier, in the iproute2
1115.\" project, in examples/bpf
f0271688 1116.PP
953d2673
MK
1117The kernel contains a just-in-time (JIT) compiler that translates
1118eBPF bytecode into native machine code for better performance.
b324e17d 1119Before Linux 4.15,
5a29959a 1120the JIT compiler is disabled by default,
953d2673
MK
1121but its operation can be controlled by writing one of the
1122following integer strings to the file
1123.IR /proc/sys/net/core/bpf_jit_enable :
4279e42d
AC
1124.TP
1125.B 0
953d2673 1126Disable JIT compilation (default).
4279e42d
AC
1127.TP
1128.B 1
953d2673 1129Normal compilation.
4279e42d
AC
1130.TP
1131.B 2
953d2673
MK
1132Debugging mode.
1133The generated opcodes are dumped in hexadecimal into the kernel log.
1134These opcodes can then be disassembled using the program
266791fb 1135.I tools/net/bpf_jit_disasm.c
953d2673 1136provided in the kernel source tree.
fcd1bee3 1137.PP
5a29959a
MK
1138Since Linux 4.15,
1139.\" commit 290af86629b25ffd1ed6232c4e9107da031705cb
1140the kernel may be configured with the
1141.B CONFIG_BPF_JIT_ALWAYS_ON
1142option.
1143In this case, the JIT compiler is always enabled, and
1144.I bpf_jit_enable
1145is initialized to 1 and is immutable.
1146(This kernel configuration option was provided as a mitigation for
1147one of the Spectre attacks against the BPF interpreter.)
1148.PP
2b623a23 1149The JIT compiler for eBPF is currently
4167f63f 1150.\" Last reviewed in Linux 4.18-rc by grepping for BPF_ALU64 in arch/
6d2ac026
MK
1151.\" and by checking the documentation for bpf_jit_enable in
1152.\" Documentation/sysctl/net.txt
2b623a23 1153available for the following architectures:
22356d97 1154.IP \(bu 3
2ef9216b
MK
1155x86-64 (since Linux 3.18; cBPF since Linux 3.0);
1156.\" commit 0a14842f5a3c0e88a1e59fac5c3025db39721f74
2b623a23 1157.PD 0
22356d97 1158.IP \(bu
2ef9216b
MK
1159ARM32 (since Linux 3.18; cBPF since Linux 3.4);
1160.\" commit ddecdfcea0ae891f782ae853771c867ab51024c2
22356d97 1161.IP \(bu
2ef9216b
MK
1162SPARC 32 (since Linux 3.18; cBPF since Linux 3.5);
1163.\" commit 2809a2087cc44b55e4377d7b9be3f7f5d2569091
22356d97 1164.IP \(bu
2ef9216b
MK
1165ARM-64 (since Linux 3.18);
1166.\" commit e54bcde3d69d40023ae77727213d14f920eb264a
22356d97 1167.IP \(bu
069be4fd
MK
1168s390 (since Linux 4.1; cBPF since Linux 3.7);
1169.\" commit c10302efe569bfd646b4c22df29577a4595b4580
22356d97 1170.IP \(bu
2ef9216b
MK
1171PowerPC 64 (since Linux 4.8; cBPF since Linux 3.1);
1172.\" commit 0ca87f05ba8bdc6791c14878464efc901ad71e99
1173.\" commit 156d0e290e969caba25f1851c52417c14d141b24
22356d97 1174.IP \(bu
2b623a23 1175SPARC 64 (since Linux 4.12);
2ef9216b 1176.\" commit 7a12b5031c6b947cc13918237ae652b536243b76
22356d97 1177.IP \(bu
2ef9216b
MK
1178x86-32 (since Linux 4.18);
1179.\" commit 03f5781be2c7b7e728d724ac70ba10799cc710d7
22356d97 1180.IP \(bu
2ef9216b
MK
1181MIPS 64 (since Linux 4.18; cBPF since Linux 3.16);
1182.\" commit c6610de353da5ca6eee5b8960e838a87a90ead0c
1183.\" commit f381bf6d82f032b7410185b35d000ea370ac706b
22356d97 1184.IP \(bu
2ef9216b
MK
1185riscv (since Linux 5.1).
1186.\" commit 2353ecc6f91fd15b893fa01bf85a1c7a823ee4f2
2b623a23 1187.PD
ce7ba00b 1188.SH EXAMPLES
33857069 1189.\" [[FIXME]] SRC BEGIN (bpf.c)
ce7ba00b
MK
1190.EX
1191/* bpf+sockets example:
1192 * 1. create array map of 256 elements
1193 * 2. load program that counts number of packets received
d064d41a 1194 * r0 = skb\->data[ETH_HLEN + offsetof(struct iphdr, protocol)]
ce7ba00b
MK
1195 * map[r0]++
1196 * 3. attach prog_fd to raw socket via setsockopt()
1197 * 4. print number of received TCP/UDP packets every second
1198 */
1199int
aa1f53cc 1200main(int argc, char *argv[])
ce7ba00b
MK
1201{
1202 int sock, map_fd, prog_fd, key;
1203 long long value = 0, tcp_cnt, udp_cnt;
1204
1205 map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key),
1206 sizeof(value), 256);
1207 if (map_fd < 0) {
b957f81f 1208 printf("failed to create map \[aq]%s\[aq]\en", strerror(errno));
ce7ba00b
MK
1209 /* likely not run as root */
1210 return 1;
1211 }
1212
1213 struct bpf_insn prog[] = {
1214 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
1215 BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol)),
d064d41a
MK
1216 /* r0 = ip\->proto */
1217 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, \-4),
115b4e0e 1218 /* *(u32 *)(fp \- 4) = r0 */
ce7ba00b 1219 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */
d064d41a 1220 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, \-4), /* r2 = r2 \- 4 */
ce7ba00b
MK
1221 BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* r1 = map_fd */
1222 BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
1223 /* r0 = map_lookup(r1, r2) */
1224 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
1225 /* if (r0 == 0) goto pc+2 */
1226 BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
1227 BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0),
115b4e0e 1228 /* lock *(u64 *) r0 += r1 */
ce7ba00b
MK
1229.\" == atomic64_add
1230 BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
1231 BPF_EXIT_INSN(), /* return r0 */
1232 };
1233
1234 prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog,
1235 sizeof(prog) / sizeof(prog[0]), "GPL");
1236
1237 sock = open_raw_sock("lo");
1238
1239 assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
1240 sizeof(prog_fd)) == 0);
1241
1242 for (;;) {
1243 key = IPPROTO_TCP;
1244 assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
1245 key = IPPROTO_UDP;
1246 assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0);
1247 printf("TCP %lld UDP %lld packets\en", tcp_cnt, udp_cnt);
1248 sleep(1);
1249 }
1250
1251 return 0;
1252}
1253.EE
33857069 1254.\" SRC END
ce7ba00b
MK
1255.PP
1256Some complete working code can be found in the
1257.I samples/bpf
1258directory in the kernel source tree.
cc7ac21d 1259.SH SEE ALSO
842ee010 1260.BR seccomp (2),
28a4c58c 1261.BR bpf\-helpers (7),
cc42e9b8 1262.BR socket (7),
8440f771 1263.BR tc (8),
28a4c58c 1264.BR tc\-bpf (8)
f0271688 1265.PP
5988a659 1266Both classic and extended BPF are explained in the kernel source file
1148d934 1267.IR Documentation/networking/filter.txt .