diff --git a/src/patches/suse-2.6.27.31/patches.suse/SoN-21-netvm-skbuff-reserve.patch b/src/patches/suse-2.6.27.31/patches.suse/SoN-21-netvm-skbuff-reserve.patch
new file mode 100644 (file)
index 0000000..6ecd80c
--- /dev/null
@@ -0,0 +1,450 @@
+From: Peter Zijlstra <a.p.zijlstra@chello.nl> 
+Subject: netvm: hook skb allocation to reserves
+Patch-mainline: No
+References: FATE#303834
+
+Change the skb allocation API to indicate RX usage and use this to fall back to
+the reserve when needed. SKBs allocated from the reserve are tagged in
+skb->emergency.
+
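+As an illustration only (not part of this patch; "dev" and "len" are
+placeholders), a driver RX path keeps using the existing helpers. The RX flag
+and the reserve fallback happen underneath, and the tag can be read back with
+skb_emergency():
+
+        struct sk_buff *skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
+
+        if (unlikely(!skb))
+                return NULL;    /* even the emergency reserve is exhausted */
+
+        if (skb_emergency(skb)) {
+                /* skb data was carved out of the emergency reserve */
+        }
+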
+Teach all other skb ops about emergency skbs and the reserve accounting.
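+
+A minimal sketch of that pattern, using helpers introduced below; copies and
+clones of an emergency skb must themselves be allowed to dip into the reserve:
+
+        /* copies: request a reserve-eligible (RX) allocation when needed */
+        n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
+
+        /* clone heads: let the head-cache allocation hit the reserve too */
+        if (skb_emergency(skb))
+                gfp_mask |= __GFP_MEMALLOC;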
+
+Use the (new) packet split API to allocate and track fragment pages from the
+emergency reserve. Do this using an atomic counter (page->frag_count, which
+shares the union with page->index). This is needed because the fragments have
+a different sharing semantic than that indicated by skb_shinfo()->dataref.
+
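+Roughly, the life cycle of a reserve-backed fragment page then looks like this
+(simplified from skb_add_rx_frag(), skb_get_page() and skb_put_page() below):
+
+        /* attach: first fragment reference to a reserve page */
+        atomic_set(&page->frag_count, 1);
+
+        /* every further reference taken by an emergency skb */
+        get_page(page);
+        atomic_inc(&page->frag_count);
+
+        /* release: give the reserve charge back with the last reference */
+        if (atomic_dec_and_test(&page->frag_count))
+                mem_reserve_pages_charge(&net_skb_reserve, -1);
+        put_page(page);
+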
+Note that the decision to distinguish between regular and emergency SKBs allows
+the accounting overhead to be limited to the latter kind.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Acked-by: Neil Brown <neilb@suse.de>
+Acked-by: Suresh Jayaraman <sjayaraman@suse.de>
+
+---
+ include/linux/mm_types.h |    1 
+ include/linux/skbuff.h   |   27 +++++++--
+ net/core/skbuff.c        |  135 +++++++++++++++++++++++++++++++++++++----------
+ 3 files changed, 132 insertions(+), 31 deletions(-)
+
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -75,6 +75,7 @@ struct page {
+               pgoff_t index;          /* Our offset within mapping. */
+               void *freelist;         /* SLUB: freelist req. slab lock */
+               int reserve;            /* page_alloc: page is a reserve page */
++              atomic_t frag_count;    /* skb fragment use count */
+       };
+       struct list_head lru;           /* Pageout list, eg. active_list
+                                        * protected by zone->lru_lock !
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -320,7 +320,10 @@ struct sk_buff {
+ #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
+       __u8                    do_not_encrypt:1;
+ #endif
+-      /* 0/13/14 bit hole */
++#ifdef CONFIG_NETVM
++      __u8                    emergency:1;
++#endif
++      /* 12-16 bit hole */
+ #ifdef CONFIG_NET_DMA
+       dma_cookie_t            dma_cookie;
+@@ -353,10 +356,22 @@ struct sk_buff {
+ #include <asm/system.h>
++#define SKB_ALLOC_FCLONE      0x01
++#define SKB_ALLOC_RX          0x02
++
++static inline bool skb_emergency(const struct sk_buff *skb)
++{
++#ifdef CONFIG_NETVM
++      return unlikely(skb->emergency);
++#else
++      return false;
++#endif
++}
++
+ extern void kfree_skb(struct sk_buff *skb);
+ extern void          __kfree_skb(struct sk_buff *skb);
+ extern struct sk_buff *__alloc_skb(unsigned int size,
+-                                 gfp_t priority, int fclone, int node);
++                                 gfp_t priority, int flags, int node);
+ static inline struct sk_buff *alloc_skb(unsigned int size,
+                                       gfp_t priority)
+ {
+@@ -366,7 +381,7 @@ static inline struct sk_buff *alloc_skb(
+ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
+                                              gfp_t priority)
+ {
+-      return __alloc_skb(size, priority, 1, -1);
++      return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, -1);
+ }
+ extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
+@@ -1216,7 +1231,8 @@ static inline void __skb_queue_purge(str
+ static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
+                                             gfp_t gfp_mask)
+ {
+-      struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
++      struct sk_buff *skb =
++              __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, -1);
+       if (likely(skb))
+               skb_reserve(skb, NET_SKB_PAD);
+       return skb;
+@@ -1247,6 +1263,7 @@ static inline struct sk_buff *netdev_all
+ }
+ extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
++extern void __netdev_free_page(struct net_device *dev, struct page *page);
+ /**
+  *    netdev_alloc_page - allocate a page for ps-rx on a specific device
+@@ -1263,7 +1280,7 @@ static inline struct page *netdev_alloc_
+ static inline void netdev_free_page(struct net_device *dev, struct page *page)
+ {
+-      __free_page(page);
++      __netdev_free_page(dev, page);
+ }
+ /**
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -173,23 +173,29 @@ EXPORT_SYMBOL(skb_truesize_bug);
+  *    %GFP_ATOMIC.
+  */
+ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+-                          int fclone, int node)
++                          int flags, int node)
+ {
+       struct kmem_cache *cache;
+       struct skb_shared_info *shinfo;
+       struct sk_buff *skb;
+       u8 *data;
++      int emergency = 0;
++      int memalloc = sk_memalloc_socks();
+-      cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
++      size = SKB_DATA_ALIGN(size);
++      cache = (flags & SKB_ALLOC_FCLONE)
++              ? skbuff_fclone_cache : skbuff_head_cache;
++
++      if (memalloc && (flags & SKB_ALLOC_RX))
++              gfp_mask |= __GFP_MEMALLOC;
+       /* Get the HEAD */
+       skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
+       if (!skb)
+               goto out;
+-      size = SKB_DATA_ALIGN(size);
+-      data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
+-                      gfp_mask, node);
++      data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
++                      gfp_mask, node, &net_skb_reserve, &emergency);
+       if (!data)
+               goto nodata;
+@@ -199,6 +205,9 @@ struct sk_buff *__alloc_skb(unsigned int
+        * the tail pointer in struct sk_buff!
+        */
+       memset(skb, 0, offsetof(struct sk_buff, tail));
++#ifdef CONFIG_NETVM
++      skb->emergency = emergency;
++#endif
+       skb->truesize = size + sizeof(struct sk_buff);
+       atomic_set(&skb->users, 1);
+       skb->head = data;
+@@ -215,7 +224,7 @@ struct sk_buff *__alloc_skb(unsigned int
+       shinfo->ip6_frag_id = 0;
+       shinfo->frag_list = NULL;
+-      if (fclone) {
++      if (flags & SKB_ALLOC_FCLONE) {
+               struct sk_buff *child = skb + 1;
+               atomic_t *fclone_ref = (atomic_t *) (child + 1);
+@@ -223,6 +232,9 @@ struct sk_buff *__alloc_skb(unsigned int
+               atomic_set(fclone_ref, 1);
+               child->fclone = SKB_FCLONE_UNAVAILABLE;
++#ifdef CONFIG_NETVM
++              child->emergency = skb->emergency;
++#endif
+       }
+ out:
+       return skb;
+@@ -251,7 +263,7 @@ struct sk_buff *__netdev_alloc_skb(struc
+       int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
+       struct sk_buff *skb;
+-      skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
++      skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, node);
+       if (likely(skb)) {
+               skb_reserve(skb, NET_SKB_PAD);
+               skb->dev = dev;
+@@ -264,11 +276,19 @@ struct page *__netdev_alloc_page(struct 
+       int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
+       struct page *page;
+-      page = alloc_pages_node(node, gfp_mask, 0);
++      page = alloc_pages_reserve(node, gfp_mask | __GFP_MEMALLOC, 0,
++                      &net_skb_reserve, NULL);
++
+       return page;
+ }
+ EXPORT_SYMBOL(__netdev_alloc_page);
++void __netdev_free_page(struct net_device *dev, struct page *page)
++{
++      free_pages_reserve(page, 0, &net_skb_reserve, page->reserve);
++}
++EXPORT_SYMBOL(__netdev_free_page);
++
+ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+               int size)
+ {
+@@ -276,6 +296,27 @@ void skb_add_rx_frag(struct sk_buff *skb
+       skb->len += size;
+       skb->data_len += size;
+       skb->truesize += size;
++
++#ifdef CONFIG_NETVM
++      /*
++       * In the rare case that skb_emergency() != page->reserve we'll
++       * skew the accounting slightly, but since it's only a 'small'
++       * constant shift it's OK.
++       */
++      if (skb_emergency(skb)) {
++              /*
++               * We need to track fragment pages so that we properly
++               * release their reserve in skb_put_page().
++               */
++              atomic_set(&page->frag_count, 1);
++      } else if (unlikely(page->reserve)) {
++              /*
++               * Release the reserve now, because normal skbs don't
++               * do the emergency accounting.
++               */
++              mem_reserve_pages_charge(&net_skb_reserve, -1);
++      }
++#endif
+ }
+ EXPORT_SYMBOL(skb_add_rx_frag);
+@@ -327,21 +368,38 @@ static void skb_clone_fraglist(struct sk
+               skb_get(list);
+ }
++static void skb_get_page(struct sk_buff *skb, struct page *page)
++{
++      get_page(page);
++      if (skb_emergency(skb))
++              atomic_inc(&page->frag_count);
++}
++
++static void skb_put_page(struct sk_buff *skb, struct page *page)
++{
++      if (skb_emergency(skb) && atomic_dec_and_test(&page->frag_count))
++              mem_reserve_pages_charge(&net_skb_reserve, -1);
++      put_page(page);
++}
++
+ static void skb_release_data(struct sk_buff *skb)
+ {
+       if (!skb->cloned ||
+           !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+                              &skb_shinfo(skb)->dataref)) {
++
+               if (skb_shinfo(skb)->nr_frags) {
+                       int i;
+-                      for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+-                              put_page(skb_shinfo(skb)->frags[i].page);
++                      for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
++                              skb_put_page(skb,
++                                           skb_shinfo(skb)->frags[i].page);
++                      }
+               }
+               if (skb_shinfo(skb)->frag_list)
+                       skb_drop_fraglist(skb);
+-              kfree(skb->head);
++              kfree_reserve(skb->head, &net_skb_reserve, skb_emergency(skb));
+       }
+ }
+@@ -462,6 +520,9 @@ static void __copy_skb_header(struct sk_
+ #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+       new->ipvs_property      = old->ipvs_property;
+ #endif
++#ifdef CONFIG_NETVM
++      new->emergency          = old->emergency;
++#endif
+       new->protocol           = old->protocol;
+       new->mark               = old->mark;
+       __nf_copy(new, old);
+@@ -555,6 +616,9 @@ struct sk_buff *skb_clone(struct sk_buff
+               n->fclone = SKB_FCLONE_CLONE;
+               atomic_inc(fclone_ref);
+       } else {
++              if (skb_emergency(skb))
++                      gfp_mask |= __GFP_MEMALLOC;
++
+               n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+               if (!n)
+                       return NULL;
+@@ -586,6 +650,14 @@ static void copy_skb_header(struct sk_bu
+       skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
+ }
++static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
++{
++      if (skb_emergency(skb))
++              return SKB_ALLOC_RX;
++
++      return 0;
++}
++
+ /**
+  *    skb_copy        -       create private copy of an sk_buff
+  *    @skb: buffer to copy
+@@ -606,15 +678,17 @@ static void copy_skb_header(struct sk_bu
+ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
+ {
+       int headerlen = skb->data - skb->head;
++      int size;
+       /*
+        *      Allocate the copy buffer
+        */
+       struct sk_buff *n;
+ #ifdef NET_SKBUFF_DATA_USES_OFFSET
+-      n = alloc_skb(skb->end + skb->data_len, gfp_mask);
++      size = skb->end + skb->data_len;
+ #else
+-      n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
++      size = skb->end - skb->head + skb->data_len;
+ #endif
++      n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
+       if (!n)
+               return NULL;
+@@ -649,12 +723,14 @@ struct sk_buff *pskb_copy(struct sk_buff
+       /*
+        *      Allocate the copy buffer
+        */
++      int size;
+       struct sk_buff *n;
+ #ifdef NET_SKBUFF_DATA_USES_OFFSET
+-      n = alloc_skb(skb->end, gfp_mask);
++      size = skb->end;
+ #else
+-      n = alloc_skb(skb->end - skb->head, gfp_mask);
++      size = skb->end - skb->head;
+ #endif
++      n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
+       if (!n)
+               goto out;
+@@ -673,8 +749,9 @@ struct sk_buff *pskb_copy(struct sk_buff
+               int i;
+               for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+-                      skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
+-                      get_page(skb_shinfo(n)->frags[i].page);
++                      skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
++                      skb_shinfo(n)->frags[i] = *frag;
++                      skb_get_page(n, frag->page);
+               }
+               skb_shinfo(n)->nr_frags = i;
+       }
+@@ -722,7 +799,11 @@ int pskb_expand_head(struct sk_buff *skb
+       size = SKB_DATA_ALIGN(size);
+-      data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
++      if (skb_emergency(skb))
++              gfp_mask |= __GFP_MEMALLOC;
++
++      data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
++                      gfp_mask, -1, &net_skb_reserve, NULL);
+       if (!data)
+               goto nodata;
+@@ -737,7 +818,7 @@ int pskb_expand_head(struct sk_buff *skb
+              sizeof(struct skb_shared_info));
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+-              get_page(skb_shinfo(skb)->frags[i].page);
++              skb_get_page(skb, skb_shinfo(skb)->frags[i].page);
+       if (skb_shinfo(skb)->frag_list)
+               skb_clone_fraglist(skb);
+@@ -816,8 +897,8 @@ struct sk_buff *skb_copy_expand(const st
+       /*
+        *      Allocate the copy buffer
+        */
+-      struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
+-                                    gfp_mask);
++      struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
++                                      gfp_mask, skb_alloc_rx_flag(skb), -1);
+       int oldheadroom = skb_headroom(skb);
+       int head_copy_len, head_copy_off;
+       int off;
+@@ -1006,7 +1087,7 @@ drop_pages:
+               skb_shinfo(skb)->nr_frags = i;
+               for (; i < nfrags; i++)
+-                      put_page(skb_shinfo(skb)->frags[i].page);
++                      skb_put_page(skb, skb_shinfo(skb)->frags[i].page);
+               if (skb_shinfo(skb)->frag_list)
+                       skb_drop_fraglist(skb);
+@@ -1175,7 +1256,7 @@ pull_pages:
+       k = 0;
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+               if (skb_shinfo(skb)->frags[i].size <= eat) {
+-                      put_page(skb_shinfo(skb)->frags[i].page);
++                      skb_put_page(skb, skb_shinfo(skb)->frags[i].page);
+                       eat -= skb_shinfo(skb)->frags[i].size;
+               } else {
+                       skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+@@ -1925,6 +2006,7 @@ static inline void skb_split_no_header(s
+                       skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
+                       if (pos < len) {
++                              struct page *page = skb_shinfo(skb)->frags[i].page;
+                               /* Split frag.
+                                * We have two variants in this case:
+                                * 1. Move all the frag to the second
+@@ -1933,7 +2015,7 @@ static inline void skb_split_no_header(s
+                                *    where splitting is expensive.
+                                * 2. Split is accurately. We make this.
+                                */
+-                              get_page(skb_shinfo(skb)->frags[i].page);
++                              skb_get_page(skb1, page);
+                               skb_shinfo(skb1)->frags[0].page_offset += len - pos;
+                               skb_shinfo(skb1)->frags[0].size -= len - pos;
+                               skb_shinfo(skb)->frags[i].size  = len - pos;
+@@ -2264,7 +2346,8 @@ struct sk_buff *skb_segment(struct sk_bu
+               if (hsize > len || !sg)
+                       hsize = len;
+-              nskb = alloc_skb(hsize + doffset + headroom, GFP_ATOMIC);
++              nskb = __alloc_skb(hsize + doffset + headroom, GFP_ATOMIC,
++                                 skb_alloc_rx_flag(skb), -1);
+               if (unlikely(!nskb))
+                       goto err;
+@@ -2302,7 +2385,7 @@ struct sk_buff *skb_segment(struct sk_bu
+                       BUG_ON(i >= nfrags);
+                       *frag = skb_shinfo(skb)->frags[i];
+-                      get_page(frag->page);
++                      skb_get_page(nskb, frag->page);
+                       size = frag->size;
+                       if (pos < offset) {