// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

static DEFINE_IDA(umem_ida);
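
/* A umem can be shared by several XDP sockets. xsk_list tracks those
 * sockets; the TX path walks the list under RCU, hence the RCU list ops
 * below, with xsk_list_lock serializing writers only.
 */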
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_del_rcu(&xs->list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
			       u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = umem;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = umem;

	return 0;
}

struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
				       u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].umem;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].umem;

	return NULL;
}
EXPORT_SYMBOL(xdp_get_umem_from_qid);
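
/* Exported so drivers can look up the umem bound to a given queue id,
 * e.g. when reconfiguring or re-enabling a queue pair.
 */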

static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = NULL;
}

int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	rtnl_lock();
	if (xdp_get_umem_from_qid(dev, queue_id)) {
		err = -EBUSY;
		goto out_rtnl_unlock;
	}

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		goto out_rtnl_unlock;

	umem->dev = dev;
	umem->queue_id = queue_id;
	if (force_copy)
		/* For copy-mode, we are done. */
		goto out_rtnl_unlock;

	if (!dev->netdev_ops->ndo_bpf ||
	    !dev->netdev_ops->ndo_xsk_async_xmit) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;
	rtnl_unlock();

	dev_hold(dev);
	umem->zc = true;
	return 0;

err_unreg_umem:
	if (!force_zc)
		err = 0; /* fallback to copy mode */
	if (err)
		xdp_clear_umem_at_qid(dev, queue_id);
out_rtnl_unlock:
	rtnl_unlock();
	return err;
}
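
/* Zero-copy capable drivers are expected to handle XDP_SETUP_XSK_UMEM in
 * their ndo_bpf() callback. A rough sketch, with hypothetical driver
 * function names (a NULL bpf->xsk.umem requests disabling zero-copy on
 * that queue):
 *
 *	static int mydrv_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 *	{
 *		switch (bpf->command) {
 *		case XDP_SETUP_XSK_UMEM:
 *			return mydrv_xsk_umem_setup(dev, bpf->xsk.umem,
 *						    bpf->xsk.queue_id);
 *		default:
 *			return -EINVAL;
 *		}
 *	}
 */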

static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->zc) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");
	}

	if (umem->dev) {
		rtnl_lock();
		xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
		rtnl_unlock();
	}

	if (umem->zc) {
		dev_put(umem->dev);
		umem->zc = false;
	}
}

static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

static void xdp_umem_release(struct xdp_umem *umem)
{
	struct task_struct *task;
	struct mm_struct *mm;

	xdp_umem_clear_dev(umem);

	ida_simple_remove(&umem_ida, umem->id);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xsk_reuseq_destroy(umem);

	xdp_umem_unpin_pages(umem);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	mmput(mm);
	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
out:
	kfree(umem);
}
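
/* Releasing a umem can sleep (rtnl_lock(), mmput()), so the final
 * xdp_put_umem() defers the actual teardown to a workqueue.
 */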
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}
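
/* Pin the user pages backing the umem so they stay resident for the
 * lifetime of the umem; FOLL_WRITE is required because the kernel writes
 * received packet data into these pages.
 */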
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	npgs = get_user_pages_longterm(umem->address, umem->npgs,
				       gup_flags, &umem->pgs[0], NULL);
	up_read(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			/* Partial pin: record how many pages were actually
			 * pinned so the unpin path releases only those.
			 */
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}
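
/* Charge the pinned pages against the caller's RLIMIT_MEMLOCK (bypassed
 * with CAP_IPC_LOCK). The cmpxchg loop updates user->locked_vm without
 * taking a lock. Worked example: with RLIMIT_MEMLOCK of 64 KiB and 4 KiB
 * pages, lock_limit is 16 pages, so a 128 KiB (32 page) umem is rejected
 * with -ENOBUFS.
 */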
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this might change.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->pid = get_task_pid(current, PIDTYPE_PID);
	umem->address = (unsigned long)addr;
	umem->chunk_mask = ~((u64)chunk_size - 1);
	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		goto out;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_pin;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_account:
	xdp_umem_unaccount_pages(umem);
out:
	put_pid(umem->pid);
	return err;
}
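
/* For context: user space reaches xdp_umem_reg() through the XDP_UMEM_REG
 * setsockopt on an AF_XDP socket. A minimal sketch with illustrative
 * values (buf is assumed to come from a page-aligned allocation such as
 * posix_memalign()):
 *
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(unsigned long)buf,
 *		.len = 1 << 20,
 *		.chunk_size = 2048,
 *		.headroom = 0,
 *	};
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *
 * Here a len of 1 MiB gives 256 4 KiB pages and 512 2 KiB chunks, which
 * satisfies the power-of-two and whole-page constraints checked above.
 */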

struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL);
	if (err < 0) {
		kfree(umem);
		return ERR_PTR(err);
	}
	umem->id = err;

	err = xdp_umem_reg(umem, mr);
	if (err) {
		ida_simple_remove(&umem_ida, umem->id);
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}
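
/* Both the fill ring (fq) and the completion ring (cq), created via the
 * XDP_UMEM_FILL_RING and XDP_UMEM_COMPLETION_RING setsockopts, must
 * exist before the umem can be bound to a device queue.
 */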
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}