// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

static DEFINE_IDA(umem_ida);
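
/* A umem can be shared by several XDP sockets. xsk_list tracks those
 * sockets; the TX path walks the list under RCU, hence the RCU list ops
 * below, with xsk_list_lock serializing writers only.
 */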
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_del_rcu(&xs->list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
			       u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = umem;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = umem;

	return 0;
}

struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
				       u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].umem;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].umem;

	return NULL;
}
EXPORT_SYMBOL(xdp_get_umem_from_qid);
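
/* Exported so drivers can look up the umem bound to a given queue id,
 * e.g. when reconfiguring or re-enabling a queue pair.
 */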

static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = NULL;
}

int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	rtnl_lock();
	if (xdp_get_umem_from_qid(dev, queue_id)) {
		err = -EBUSY;
		goto out_rtnl_unlock;
	}

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		goto out_rtnl_unlock;

	umem->dev = dev;
	umem->queue_id = queue_id;
	if (force_copy)
		/* For copy-mode, we are done. */
		goto out_rtnl_unlock;

	if (!dev->netdev_ops->ndo_bpf ||
	    !dev->netdev_ops->ndo_xsk_async_xmit) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;
	rtnl_unlock();

	dev_hold(dev);
	umem->zc = true;
	return 0;

err_unreg_umem:
	if (!force_zc)
		err = 0; /* fallback to copy mode */
	if (err)
		xdp_clear_umem_at_qid(dev, queue_id);
out_rtnl_unlock:
	rtnl_unlock();
	return err;
}
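
/* Zero-copy capable drivers are expected to handle XDP_SETUP_XSK_UMEM in
 * their ndo_bpf() callback. A rough sketch, with hypothetical driver
 * function names (a NULL bpf->xsk.umem requests disabling zero-copy on
 * that queue):
 *
 *	static int mydrv_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 *	{
 *		switch (bpf->command) {
 *		case XDP_SETUP_XSK_UMEM:
 *			return mydrv_xsk_umem_setup(dev, bpf->xsk.umem,
 *						    bpf->xsk.queue_id);
 *		default:
 *			return -EINVAL;
 *		}
 *	}
 */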

static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->zc) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");
	}

	if (umem->dev) {
		rtnl_lock();
		xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
		rtnl_unlock();
	}

	if (umem->zc) {
		dev_put(umem->dev);
		umem->zc = false;
	}
}

static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

static void xdp_umem_release(struct xdp_umem *umem)
{
	struct task_struct *task;
	struct mm_struct *mm;

	xdp_umem_clear_dev(umem);

	ida_simple_remove(&umem_ida, umem->id);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xsk_reuseq_destroy(umem);

	xdp_umem_unpin_pages(umem);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	mmput(mm);
	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
out:
	kfree(umem);
}
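
/* Releasing a umem can sleep (rtnl_lock(), mmput()), so the final
 * xdp_put_umem() defers the actual teardown to a workqueue.
 */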
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}
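
/* Pin the user pages backing the umem so they stay resident for the
 * lifetime of the umem; FOLL_WRITE is required because the kernel writes
 * received packet data into these pages.
 */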
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	npgs = get_user_pages_longterm(umem->address, umem->npgs,
				       gup_flags, &umem->pgs[0], NULL);
	up_read(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			/* Partial pin: record how many pages were actually
			 * pinned so the unpin path releases only those.
			 */
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}
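
/* Charge the pinned pages against the caller's RLIMIT_MEMLOCK (bypassed
 * with CAP_IPC_LOCK). The cmpxchg loop updates user->locked_vm without
 * taking a lock. Worked example: with RLIMIT_MEMLOCK of 64 KiB and 4 KiB
 * pages, lock_limit is 16 pages, so a 128 KiB (32 page) umem is rejected
 * with -ENOBUFS.
 */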
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this might change.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->pid = get_task_pid(current, PIDTYPE_PID);
	umem->address = (unsigned long)addr;
	umem->chunk_mask = ~((u64)chunk_size - 1);
	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		goto out;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_pin;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_account:
	xdp_umem_unaccount_pages(umem);
out:
	put_pid(umem->pid);
	return err;
}
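
/* For context: user space reaches xdp_umem_reg() through the XDP_UMEM_REG
 * setsockopt on an AF_XDP socket. A minimal sketch with illustrative
 * values (buf is assumed to come from a page-aligned allocation such as
 * posix_memalign()):
 *
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(unsigned long)buf,
 *		.len = 1 << 20,
 *		.chunk_size = 2048,
 *		.headroom = 0,
 *	};
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *
 * Here a len of 1 MiB gives 256 4 KiB pages and 512 2 KiB chunks, which
 * satisfies the power-of-two and whole-page constraints checked above.
 */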

struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL);
	if (err < 0) {
		kfree(umem);
		return ERR_PTR(err);
	}
	umem->id = err;

	err = xdp_umem_reg(umem, mr);
	if (err) {
		ida_simple_remove(&umem_ida, umem->id);
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}
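
/* Both the fill ring (fq) and the completion ring (cq), created via the
 * XDP_UMEM_FILL_RING and XDP_UMEM_COMPLETION_RING setsockopts, must
 * exist before the umem can be bound to a device queue.
 */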
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}