From 8ec868ab22199856b01ac0f172be1323a86bc5f1 Mon Sep 17 00:00:00 2001 From: Michael Tremer Date: Mon, 10 Feb 2014 23:00:19 +0100 Subject: [PATCH] linux: Fix IMQ crash. Fixes #10474. --- lfs/linux | 2 +- src/patches/imq_kernel3.10.23.patch | 1568 ------ src/patches/linux-3.10.25-imq.patch | 6800 +++++++++++++++++++++++++++ 3 files changed, 6801 insertions(+), 1569 deletions(-) delete mode 100644 src/patches/imq_kernel3.10.23.patch create mode 100644 src/patches/linux-3.10.25-imq.patch diff --git a/lfs/linux b/lfs/linux index 6b7e9b80de..a422d7a118 100644 --- a/lfs/linux +++ b/lfs/linux @@ -112,7 +112,7 @@ $(TARGET) : $(patsubst %,$(DIR_DL)/%,$(objects)) ln -svf linux-$(VER) $(DIR_SRC)/linux # Linux Intermediate Queueing Device - cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/imq_kernel3.10.23.patch + cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/linux-3.10.25-imq.patch # ipp2p 0.8.2-ipfire cd $(DIR_APP) && patch -Np1 < $(DIR_SRC)/src/patches/linux-3.10-ipp2p-0.8.2-ipfire.patch diff --git a/src/patches/imq_kernel3.10.23.patch b/src/patches/imq_kernel3.10.23.patch deleted file mode 100644 index 9ee85e704d..0000000000 --- a/src/patches/imq_kernel3.10.23.patch +++ /dev/null @@ -1,1568 +0,0 @@ -diff -uNr linux-3.9.1/drivers/net/imq.c linux-3.9.1-imqmq/drivers/net/imq.c ---- linux-3.9.1/drivers/net/imq.c 1970-01-01 02:00:00.000000000 +0200 -+++ linux-3.9.1-imqmq/drivers/net/imq.c 2013-05-08 17:30:41.715552053 +0300 -@@ -0,0 +1,861 @@ -+/* -+ * Pseudo-driver for the intermediate queue device. -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ * -+ * Authors: Patrick McHardy, -+ * -+ * The first version was written by Martin Devera, -+ * -+ * Credits: Jan Rafaj -+ * - Update patch to 2.4.21 -+ * Sebastian Strollo -+ * - Fix "Dead-loop on netdevice imq"-issue -+ * Marcel Sebek -+ * - Update to 2.6.2-rc1 -+ * -+ * After some time of inactivity there is a group taking care -+ * of IMQ again: http://www.linuximq.net -+ * -+ * -+ * 2004/06/30 - New version of IMQ patch to kernels <=2.6.7 -+ * including the following changes: -+ * -+ * - Correction of ipv6 support "+"s issue (Hasso Tepper) -+ * - Correction of imq_init_devs() issue that resulted in -+ * kernel OOPS unloading IMQ as module (Norbert Buchmuller) -+ * - Addition of functionality to choose number of IMQ devices -+ * during kernel config (Andre Correa) -+ * - Addition of functionality to choose how IMQ hooks on -+ * PRE and POSTROUTING (after or before NAT) (Andre Correa) -+ * - Cosmetic corrections (Norbert Buchmuller) (Andre Correa) -+ * -+ * -+ * 2005/12/16 - IMQ versions between 2.6.7 and 2.6.13 were -+ * released with almost no problems. 2.6.14-x was released -+ * with some important changes: nfcache was removed; After -+ * some weeks of trouble we figured out that some IMQ fields -+ * in skb were missing in skbuff.c - skb_clone and copy_skb_header. -+ * These functions are correctly patched by this new patch version. -+ * -+ * Thanks for all who helped to figure out all the problems with -+ * 2.6.14.x: Patrick McHardy, Rune Kock, VeNoMouS, Max CtRiX, -+ * Kevin Shanahan, Richard Lucassen, Valery Dachev (hopefully -+ * I didn't forget anybody). I apologize again for my lack of time. 
-+ * -+ * -+ * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead -+ * of qdisc_restart() and moved qdisc_run() to tasklet to avoid -+ * recursive locking. New initialization routines to fix 'rmmod' not -+ * working anymore. Used code from ifb.c. (Jussi Kivilinna) -+ * -+ * 2008/08/06 - 2.6.26 - (JK) -+ * - Replaced tasklet with 'netif_schedule()'. -+ * - Cleaned up and added comments for imq_nf_queue(). -+ * -+ * 2009/04/12 -+ * - Add skb_save_cb/skb_restore_cb helper functions for backuping -+ * control buffer. This is needed because qdisc-layer on kernels -+ * 2.6.27 and newer overwrite control buffer. (Jussi Kivilinna) -+ * - Add better locking for IMQ device. Hopefully this will solve -+ * SMP issues. (Jussi Kivilinna) -+ * - Port to 2.6.27 -+ * - Port to 2.6.28 -+ * - Port to 2.6.29 + fix rmmod not working -+ * -+ * 2009/04/20 - (Jussi Kivilinna) -+ * - Use netdevice feature flags to avoid extra packet handling -+ * by core networking layer and possibly increase performance. -+ * -+ * 2009/09/26 - (Jussi Kivilinna) -+ * - Add imq_nf_reinject_lockless to fix deadlock with -+ * imq_nf_queue/imq_nf_reinject. -+ * -+ * 2009/12/08 - (Jussi Kivilinna) -+ * - Port to 2.6.32 -+ * - Add check for skb->nf_queue_entry==NULL in imq_dev_xmit() -+ * - Also add better error checking for skb->nf_queue_entry usage -+ * -+ * 2010/02/25 - (Jussi Kivilinna) -+ * - Port to 2.6.33 -+ * -+ * 2010/08/15 - (Jussi Kivilinna) -+ * - Port to 2.6.35 -+ * - Simplify hook registration by using nf_register_hooks. -+ * - nf_reinject doesn't need spinlock around it, therefore remove -+ * imq_nf_reinject function. Other nf_reinject users protect -+ * their own data with spinlock. With IMQ however all data is -+ * needed is stored per skbuff, so no locking is needed. -+ * - Changed IMQ to use 'separate' NF_IMQ_QUEUE instead of -+ * NF_QUEUE, this allows working coexistance of IMQ and other -+ * NF_QUEUE users. -+ * - Make IMQ multi-queue. Number of IMQ device queues can be -+ * increased with 'numqueues' module parameters. Default number -+ * of queues is 1, in other words by default IMQ works as -+ * single-queue device. Multi-queue selection is based on -+ * IFB multi-queue patch by Changli Gao . -+ * -+ * 2011/03/18 - (Jussi Kivilinna) -+ * - Port to 2.6.38 -+ * -+ * 2011/07/12 - (syoder89@gmail.com) -+ * - Crash fix that happens when the receiving interface has more -+ * than one queue (add missing skb_set_queue_mapping in -+ * imq_select_queue). -+ * -+ * 2011/07/26 - (Jussi Kivilinna) -+ * - Add queue mapping checks for packets exiting IMQ. -+ * - Port to 3.0 -+ * -+ * 2011/08/16 - (Jussi Kivilinna) -+ * - Clear IFF_TX_SKB_SHARING flag that was added for linux 3.0.2 -+ * -+ * 2011/11/03 - Germano Michel -+ * - Fix IMQ for net namespaces -+ * -+ * 2011/11/04 - Jussi Kivilinna -+ * - Port to 3.1 -+ * - Clean-up, move 'get imq device pointer by imqX name' to -+ * separate function from imq_nf_queue(). -+ * -+ * 2012/01/05 - Jussi Kivilinna -+ * - Port to 3.2 -+ * -+ * 2012/03/19 - Jussi Kivilinna -+ * - Port to 3.3 -+ * -+ * 2012/12/12 - Jussi Kivilinna -+ * - Port to 3.7 -+ * - Fix checkpatch.pl warnings -+ * -+ * Also, many thanks to pablo Sebastian Greco for making the initial -+ * patch and to those who helped the testing. 
-+ * -+ * More info at: http://www.linuximq.net/ (Andre Correa) -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+ #include -+#endif -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static int imq_nf_queue(struct nf_queue_entry *entry, unsigned int queue_num); -+ -+static nf_hookfn imq_nf_hook; -+ -+static struct nf_hook_ops imq_ops[] = { -+ { -+ /* imq_ingress_ipv4 */ -+ .hook = imq_nf_hook, -+ .owner = THIS_MODULE, -+ .pf = PF_INET, -+ .hooknum = NF_INET_PRE_ROUTING, -+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) -+ .priority = NF_IP_PRI_MANGLE + 1, -+#else -+ .priority = NF_IP_PRI_NAT_DST + 1, -+#endif -+ }, -+ { -+ /* imq_egress_ipv4 */ -+ .hook = imq_nf_hook, -+ .owner = THIS_MODULE, -+ .pf = PF_INET, -+ .hooknum = NF_INET_POST_ROUTING, -+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) -+ .priority = NF_IP_PRI_LAST, -+#else -+ .priority = NF_IP_PRI_NAT_SRC - 1, -+#endif -+ }, -+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+ { -+ /* imq_ingress_ipv6 */ -+ .hook = imq_nf_hook, -+ .owner = THIS_MODULE, -+ .pf = PF_INET6, -+ .hooknum = NF_INET_PRE_ROUTING, -+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) -+ .priority = NF_IP6_PRI_MANGLE + 1, -+#else -+ .priority = NF_IP6_PRI_NAT_DST + 1, -+#endif -+ }, -+ { -+ /* imq_egress_ipv6 */ -+ .hook = imq_nf_hook, -+ .owner = THIS_MODULE, -+ .pf = PF_INET6, -+ .hooknum = NF_INET_POST_ROUTING, -+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) -+ .priority = NF_IP6_PRI_LAST, -+#else -+ .priority = NF_IP6_PRI_NAT_SRC - 1, -+#endif -+ }, -+#endif -+}; -+ -+#if defined(CONFIG_IMQ_NUM_DEVS) -+static int numdevs = CONFIG_IMQ_NUM_DEVS; -+#else -+static int numdevs = IMQ_MAX_DEVS; -+#endif -+ -+static struct net_device *imq_devs_cache[IMQ_MAX_DEVS]; -+ -+#define IMQ_MAX_QUEUES 32 -+static int numqueues = 1; -+static u32 imq_hashrnd; -+ -+static inline __be16 pppoe_proto(const struct sk_buff *skb) -+{ -+ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + -+ sizeof(struct pppoe_hdr))); -+} -+ -+static u16 imq_hash(struct net_device *dev, struct sk_buff *skb) -+{ -+ unsigned int pull_len; -+ u16 protocol = skb->protocol; -+ u32 addr1, addr2; -+ u32 hash, ihl = 0; -+ union { -+ u16 in16[2]; -+ u32 in32; -+ } ports; -+ u8 ip_proto; -+ -+ pull_len = 0; -+ -+recheck: -+ switch (protocol) { -+ case htons(ETH_P_8021Q): { -+ if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL)) -+ goto other; -+ -+ pull_len += VLAN_HLEN; -+ skb->network_header += VLAN_HLEN; -+ -+ protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; -+ goto recheck; -+ } -+ -+ case htons(ETH_P_PPP_SES): { -+ if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL)) -+ goto other; -+ -+ pull_len += PPPOE_SES_HLEN; -+ skb->network_header += PPPOE_SES_HLEN; -+ -+ protocol = pppoe_proto(skb); -+ goto recheck; -+ } -+ -+ case htons(ETH_P_IP): { -+ const struct iphdr *iph = ip_hdr(skb); -+ -+ if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr)))) -+ goto other; -+ -+ addr1 = iph->daddr; -+ addr2 = iph->saddr; -+ -+ ip_proto = !(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) ? 
-+ iph->protocol : 0; -+ ihl = ip_hdrlen(skb); -+ -+ break; -+ } -+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -+ case htons(ETH_P_IPV6): { -+ const struct ipv6hdr *iph = ipv6_hdr(skb); -+ __be16 fo = 0; -+ -+ if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr)))) -+ goto other; -+ -+ addr1 = iph->daddr.s6_addr32[3]; -+ addr2 = iph->saddr.s6_addr32[3]; -+ ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto, -+ &fo); -+ if (unlikely(ihl < 0)) -+ goto other; -+ -+ break; -+ } -+#endif -+ default: -+other: -+ if (pull_len != 0) { -+ skb_push(skb, pull_len); -+ skb->network_header -= pull_len; -+ } -+ -+ return (u16)(ntohs(protocol) % dev->real_num_tx_queues); -+ } -+ -+ if (addr1 > addr2) -+ swap(addr1, addr2); -+ -+ switch (ip_proto) { -+ case IPPROTO_TCP: -+ case IPPROTO_UDP: -+ case IPPROTO_DCCP: -+ case IPPROTO_ESP: -+ case IPPROTO_AH: -+ case IPPROTO_SCTP: -+ case IPPROTO_UDPLITE: { -+ if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) { -+ if (ports.in16[0] > ports.in16[1]) -+ swap(ports.in16[0], ports.in16[1]); -+ break; -+ } -+ /* fall-through */ -+ } -+ default: -+ ports.in32 = 0; -+ break; -+ } -+ -+ if (pull_len != 0) { -+ skb_push(skb, pull_len); -+ skb->network_header -= pull_len; -+ } -+ -+ hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto); -+ -+ return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); -+} -+ -+static inline bool sk_tx_queue_recorded(struct sock *sk) -+{ -+ return (sk_tx_queue_get(sk) >= 0); -+} -+ -+static struct netdev_queue *imq_select_queue(struct net_device *dev, -+ struct sk_buff *skb) -+{ -+ u16 queue_index = 0; -+ u32 hash; -+ -+ if (likely(dev->real_num_tx_queues == 1)) -+ goto out; -+ -+ /* IMQ can be receiving ingress or engress packets. */ -+ -+ /* Check first for if rx_queue is set */ -+ if (skb_rx_queue_recorded(skb)) { -+ queue_index = skb_get_rx_queue(skb); -+ goto out; -+ } -+ -+ /* Check if socket has tx_queue set */ -+ if (sk_tx_queue_recorded(skb->sk)) { -+ queue_index = sk_tx_queue_get(skb->sk); -+ goto out; -+ } -+ -+ /* Try use socket hash */ -+ if (skb->sk && skb->sk->sk_hash) { -+ hash = skb->sk->sk_hash; -+ queue_index = -+ (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); -+ goto out; -+ } -+ -+ /* Generate hash from packet data */ -+ queue_index = imq_hash(dev, skb); -+ -+out: -+ if (unlikely(queue_index >= dev->real_num_tx_queues)) -+ queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues); -+ -+ skb_set_queue_mapping(skb, queue_index); -+ return netdev_get_tx_queue(dev, queue_index); -+} -+ -+static struct net_device_stats *imq_get_stats(struct net_device *dev) -+{ -+ return &dev->stats; -+} -+ -+/* called for packets kfree'd in qdiscs at places other than enqueue */ -+static void imq_skb_destructor(struct sk_buff *skb) -+{ -+ struct nf_queue_entry *entry = skb->nf_queue_entry; -+ -+ skb->nf_queue_entry = NULL; -+ -+ if (entry) { -+ nf_queue_entry_release_refs(entry); -+ kfree(entry); -+ } -+ -+ skb_restore_cb(skb); /* kfree backup */ -+} -+ -+static void imq_done_check_queue_mapping(struct sk_buff *skb, -+ struct net_device *dev) -+{ -+ unsigned int queue_index; -+ -+ /* Don't let queue_mapping be left too large after exiting IMQ */ -+ if (likely(skb->dev != dev && skb->dev != NULL)) { -+ queue_index = skb_get_queue_mapping(skb); -+ if (unlikely(queue_index >= skb->dev->real_num_tx_queues)) { -+ queue_index = (u16)((u32)queue_index % -+ skb->dev->real_num_tx_queues); -+ skb_set_queue_mapping(skb, queue_index); -+ } -+ } else { -+ /* skb->dev was IMQ device itself or 
NULL, be on safe side and -+ * just clear queue mapping. -+ */ -+ skb_set_queue_mapping(skb, 0); -+ } -+} -+ -+static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev) -+{ -+ struct nf_queue_entry *entry = skb->nf_queue_entry; -+ -+ skb->nf_queue_entry = NULL; -+ dev->trans_start = jiffies; -+ -+ dev->stats.tx_bytes += skb->len; -+ dev->stats.tx_packets++; -+ -+ if (unlikely(entry == NULL)) { -+ /* We don't know what is going on here.. packet is queued for -+ * imq device, but (probably) not by us. -+ * -+ * If this packet was not send here by imq_nf_queue(), then -+ * skb_save_cb() was not used and skb_free() should not show: -+ * WARNING: IMQ: kfree_skb: skb->cb_next:.. -+ * and/or -+ * WARNING: IMQ: kfree_skb: skb->nf_queue_entry... -+ * -+ * However if this message is shown, then IMQ is somehow broken -+ * and you should report this to linuximq.net. -+ */ -+ -+ /* imq_dev_xmit is black hole that eats all packets, report that -+ * we eat this packet happily and increase dropped counters. -+ */ -+ -+ dev->stats.tx_dropped++; -+ dev_kfree_skb(skb); -+ -+ return NETDEV_TX_OK; -+ } -+ -+ skb_restore_cb(skb); /* restore skb->cb */ -+ -+ skb->imq_flags = 0; -+ skb->destructor = NULL; -+ -+ imq_done_check_queue_mapping(skb, dev); -+ -+ nf_reinject(entry, NF_ACCEPT); -+ -+ return NETDEV_TX_OK; -+} -+ -+static struct net_device *get_imq_device_by_index(int index) -+{ -+ struct net_device *dev = NULL; -+ struct net *net; -+ char buf[8]; -+ -+ /* get device by name and cache result */ -+ snprintf(buf, sizeof(buf), "imq%d", index); -+ -+ /* Search device from all namespaces. */ -+ for_each_net(net) { -+ dev = dev_get_by_name(net, buf); -+ if (dev) -+ break; -+ } -+ -+ if (WARN_ON_ONCE(dev == NULL)) { -+ /* IMQ device not found. Exotic config? */ -+ return ERR_PTR(-ENODEV); -+ } -+ -+ imq_devs_cache[index] = dev; -+ dev_put(dev); -+ -+ return dev; -+} -+ -+static int imq_nf_queue(struct nf_queue_entry *entry, unsigned int queue_num) -+{ -+ struct net_device *dev; -+ struct sk_buff *skb_orig, *skb, *skb_shared; -+ struct Qdisc *q; -+ struct netdev_queue *txq; -+ spinlock_t *root_lock; -+ int users, index; -+ int retval = -EINVAL; -+ unsigned int orig_queue_index; -+ -+ index = entry->skb->imq_flags & IMQ_F_IFMASK; -+ if (unlikely(index > numdevs - 1)) { -+ if (net_ratelimit()) -+ pr_warn("IMQ: invalid device specified, highest is %u\n", -+ numdevs - 1); -+ retval = -EINVAL; -+ goto out; -+ } -+ -+ /* check for imq device by index from cache */ -+ dev = imq_devs_cache[index]; -+ if (unlikely(!dev)) { -+ dev = get_imq_device_by_index(index); -+ if (IS_ERR(dev)) { -+ retval = PTR_ERR(dev); -+ goto out; -+ } -+ } -+ -+ if (unlikely(!(dev->flags & IFF_UP))) { -+ entry->skb->imq_flags = 0; -+ nf_reinject(entry, NF_ACCEPT); -+ retval = 0; -+ goto out; -+ } -+ dev->last_rx = jiffies; -+ -+ skb = entry->skb; -+ skb_orig = NULL; -+ -+ /* skb has owner? => make clone */ -+ if (unlikely(skb->destructor)) { -+ skb_orig = skb; -+ skb = skb_clone(skb, GFP_ATOMIC); -+ if (unlikely(!skb)) { -+ retval = -ENOMEM; -+ goto out; -+ } -+ entry->skb = skb; -+ } -+ -+ skb->nf_queue_entry = entry; -+ -+ dev->stats.rx_bytes += skb->len; -+ dev->stats.rx_packets++; -+ -+ if (!skb->dev) { -+ /* skb->dev == NULL causes problems, try the find cause. 
*/ -+ if (net_ratelimit()) { -+ dev_warn(&dev->dev, -+ "received packet with skb->dev == NULL\n"); -+ dump_stack(); -+ } -+ -+ skb->dev = dev; -+ } -+ -+ /* Disables softirqs for lock below */ -+ rcu_read_lock_bh(); -+ -+ /* Multi-queue selection */ -+ orig_queue_index = skb_get_queue_mapping(skb); -+ txq = imq_select_queue(dev, skb); -+ -+ q = rcu_dereference(txq->qdisc); -+ if (unlikely(!q->enqueue)) -+ goto packet_not_eaten_by_imq_dev; -+ -+ root_lock = qdisc_lock(q); -+ spin_lock(root_lock); -+ -+ users = atomic_read(&skb->users); -+ -+ skb_shared = skb_get(skb); /* increase reference count by one */ -+ -+ /* backup skb->cb, as qdisc layer will overwrite it */ -+ skb_save_cb(skb_shared); -+ qdisc_enqueue_root(skb_shared, q); /* might kfree_skb */ -+ -+ if (likely(atomic_read(&skb_shared->users) == users + 1)) { -+ kfree_skb(skb_shared); /* decrease reference count by one */ -+ -+ skb->destructor = &imq_skb_destructor; -+ -+ /* cloned? */ -+ if (unlikely(skb_orig)) -+ kfree_skb(skb_orig); /* free original */ -+ -+ spin_unlock(root_lock); -+ rcu_read_unlock_bh(); -+ -+ /* schedule qdisc dequeue */ -+ __netif_schedule(q); -+ -+ retval = 0; -+ goto out; -+ } else { -+ skb_restore_cb(skb_shared); /* restore skb->cb */ -+ skb->nf_queue_entry = NULL; -+ /* -+ * qdisc dropped packet and decreased skb reference count of -+ * skb, so we don't really want to and try refree as that would -+ * actually destroy the skb. -+ */ -+ spin_unlock(root_lock); -+ goto packet_not_eaten_by_imq_dev; -+ } -+ -+packet_not_eaten_by_imq_dev: -+ skb_set_queue_mapping(skb, orig_queue_index); -+ rcu_read_unlock_bh(); -+ -+ /* cloned? restore original */ -+ if (unlikely(skb_orig)) { -+ kfree_skb(skb); -+ entry->skb = skb_orig; -+ } -+ retval = -1; -+out: -+ return retval; -+} -+ -+static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb, -+ const struct net_device *indev, -+ const struct net_device *outdev, -+ int (*okfn)(struct sk_buff *)) -+{ -+ return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT; -+} -+ -+static int imq_close(struct net_device *dev) -+{ -+ netif_stop_queue(dev); -+ return 0; -+} -+ -+static int imq_open(struct net_device *dev) -+{ -+ netif_start_queue(dev); -+ return 0; -+} -+ -+static const struct net_device_ops imq_netdev_ops = { -+ .ndo_open = imq_open, -+ .ndo_stop = imq_close, -+ .ndo_start_xmit = imq_dev_xmit, -+ .ndo_get_stats = imq_get_stats, -+}; -+ -+static void imq_setup(struct net_device *dev) -+{ -+ dev->netdev_ops = &imq_netdev_ops; -+ dev->type = ARPHRD_VOID; -+ dev->mtu = 16000; /* too small? */ -+ dev->tx_queue_len = 11000; /* too big? 
*/ -+ dev->flags = IFF_NOARP; -+ dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | -+ NETIF_F_GSO | NETIF_F_HW_CSUM | -+ NETIF_F_HIGHDMA; -+ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | -+ IFF_TX_SKB_SHARING); -+} -+ -+static int imq_validate(struct nlattr *tb[], struct nlattr *data[]) -+{ -+ int ret = 0; -+ -+ if (tb[IFLA_ADDRESS]) { -+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { -+ ret = -EINVAL; -+ goto end; -+ } -+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { -+ ret = -EADDRNOTAVAIL; -+ goto end; -+ } -+ } -+ return 0; -+end: -+ pr_warn("IMQ: imq_validate failed (%d)\n", ret); -+ return ret; -+} -+ -+static struct rtnl_link_ops imq_link_ops __read_mostly = { -+ .kind = "imq", -+ .priv_size = 0, -+ .setup = imq_setup, -+ .validate = imq_validate, -+}; -+ -+static const struct nf_queue_handler imq_nfqh = { -+ .outfn = imq_nf_queue, -+}; -+ -+static int __init imq_init_hooks(void) -+{ -+ int ret; -+ -+ nf_register_queue_imq_handler(&imq_nfqh); -+ -+ ret = nf_register_hooks(imq_ops, ARRAY_SIZE(imq_ops)); -+ if (ret < 0) -+ nf_unregister_queue_imq_handler(); -+ -+ return ret; -+} -+ -+static int __init imq_init_one(int index) -+{ -+ struct net_device *dev; -+ int ret; -+ -+ dev = alloc_netdev_mq(0, "imq%d", imq_setup, numqueues); -+ if (!dev) -+ return -ENOMEM; -+ -+ ret = dev_alloc_name(dev, dev->name); -+ if (ret < 0) -+ goto fail; -+ -+ dev->rtnl_link_ops = &imq_link_ops; -+ ret = register_netdevice(dev); -+ if (ret < 0) -+ goto fail; -+ -+ return 0; -+fail: -+ free_netdev(dev); -+ return ret; -+} -+ -+static int __init imq_init_devs(void) -+{ -+ int err, i; -+ -+ if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) { -+ pr_err("IMQ: numdevs has to be betweed 1 and %u\n", -+ IMQ_MAX_DEVS); -+ return -EINVAL; -+ } -+ -+ if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) { -+ pr_err("IMQ: numqueues has to be betweed 1 and %u\n", -+ IMQ_MAX_QUEUES); -+ return -EINVAL; -+ } -+ -+ get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd)); -+ -+ rtnl_lock(); -+ err = __rtnl_link_register(&imq_link_ops); -+ -+ for (i = 0; i < numdevs && !err; i++) -+ err = imq_init_one(i); -+ -+ if (err) { -+ __rtnl_link_unregister(&imq_link_ops); -+ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); -+ } -+ rtnl_unlock(); -+ -+ return err; -+} -+ -+static int __init imq_init_module(void) -+{ -+ int err; -+ -+#if defined(CONFIG_IMQ_NUM_DEVS) -+ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16); -+ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2); -+ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK); -+#endif -+ -+ err = imq_init_devs(); -+ if (err) { -+ pr_err("IMQ: Error trying imq_init_devs(net)\n"); -+ return err; -+ } -+ -+ err = imq_init_hooks(); -+ if (err) { -+ pr_err(KERN_ERR "IMQ: Error trying imq_init_hooks()\n"); -+ rtnl_link_unregister(&imq_link_ops); -+ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); -+ return err; -+ } -+ -+ pr_info("IMQ driver loaded successfully. 
(numdevs = %d, numqueues = %d)\n", -+ numdevs, numqueues); -+ -+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) -+ pr_info("\tHooking IMQ before NAT on PREROUTING.\n"); -+#else -+ pr_info("\tHooking IMQ after NAT on PREROUTING.\n"); -+#endif -+#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB) -+ pr_info("\tHooking IMQ before NAT on POSTROUTING.\n"); -+#else -+ pr_info("\tHooking IMQ after NAT on POSTROUTING.\n"); -+#endif -+ -+ return 0; -+} -+ -+static void __exit imq_unhook(void) -+{ -+ nf_unregister_hooks(imq_ops, ARRAY_SIZE(imq_ops)); -+ nf_unregister_queue_imq_handler(); -+} -+ -+static void __exit imq_cleanup_devs(void) -+{ -+ rtnl_link_unregister(&imq_link_ops); -+ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); -+} -+ -+static void __exit imq_exit_module(void) -+{ -+ imq_unhook(); -+ imq_cleanup_devs(); -+ pr_info("IMQ driver unloaded successfully.\n"); -+} -+ -+module_init(imq_init_module); -+module_exit(imq_exit_module); -+ -+module_param(numdevs, int, 0); -+module_param(numqueues, int, 0); -+MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will be created)"); -+MODULE_PARM_DESC(numqueues, "number of queues per IMQ device"); -+MODULE_AUTHOR("http://www.linuximq.net"); -+MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See http://www.linuximq.net/ for more information."); -+MODULE_LICENSE("GPL"); -+MODULE_ALIAS_RTNL_LINK("imq"); -+ -diff -uNr linux-3.9.1/drivers/net/Kconfig linux-3.9.1-imqmq/drivers/net/Kconfig ---- linux-3.9.1/drivers/net/Kconfig 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/drivers/net/Kconfig 2013-05-08 17:30:29.011952562 +0300 -@@ -206,6 +206,125 @@ - depends on RIONET - default "128" - -+config IMQ -+ tristate "IMQ (intermediate queueing device) support" -+ depends on NETDEVICES && NETFILTER -+ ---help--- -+ The IMQ device(s) is used as placeholder for QoS queueing -+ disciplines. Every packet entering/leaving the IP stack can be -+ directed through the IMQ device where it's enqueued/dequeued to the -+ attached qdisc. This allows you to treat network devices as classes -+ and distribute bandwidth among them. Iptables is used to specify -+ through which IMQ device, if any, packets travel. -+ -+ More information at: http://www.linuximq.net/ -+ -+ To compile this driver as a module, choose M here: the module -+ will be called imq. If unsure, say N. -+ -+choice -+ prompt "IMQ behavior (PRE/POSTROUTING)" -+ depends on IMQ -+ default IMQ_BEHAVIOR_AB -+ help -+ This setting defines how IMQ behaves in respect to its -+ hooking in PREROUTING and POSTROUTING. -+ -+ IMQ can work in any of the following ways: -+ -+ PREROUTING | POSTROUTING -+ -----------------|------------------- -+ #1 After NAT | After NAT -+ #2 After NAT | Before NAT -+ #3 Before NAT | After NAT -+ #4 Before NAT | Before NAT -+ -+ The default behavior is to hook before NAT on PREROUTING -+ and after NAT on POSTROUTING (#3). -+ -+ This settings are specially usefull when trying to use IMQ -+ to shape NATed clients. -+ -+ More information can be found at: www.linuximq.net -+ -+ If not sure leave the default settings alone. -+ -+config IMQ_BEHAVIOR_AA -+ bool "IMQ AA" -+ help -+ This setting defines how IMQ behaves in respect to its -+ hooking in PREROUTING and POSTROUTING. -+ -+ Choosing this option will make IMQ hook like this: -+ -+ PREROUTING: After NAT -+ POSTROUTING: After NAT -+ -+ More information can be found at: www.linuximq.net -+ -+ If not sure leave the default settings alone. 
-+ -+config IMQ_BEHAVIOR_AB -+ bool "IMQ AB" -+ help -+ This setting defines how IMQ behaves in respect to its -+ hooking in PREROUTING and POSTROUTING. -+ -+ Choosing this option will make IMQ hook like this: -+ -+ PREROUTING: After NAT -+ POSTROUTING: Before NAT -+ -+ More information can be found at: www.linuximq.net -+ -+ If not sure leave the default settings alone. -+ -+config IMQ_BEHAVIOR_BA -+ bool "IMQ BA" -+ help -+ This setting defines how IMQ behaves in respect to its -+ hooking in PREROUTING and POSTROUTING. -+ -+ Choosing this option will make IMQ hook like this: -+ -+ PREROUTING: Before NAT -+ POSTROUTING: After NAT -+ -+ More information can be found at: www.linuximq.net -+ -+ If not sure leave the default settings alone. -+ -+config IMQ_BEHAVIOR_BB -+ bool "IMQ BB" -+ help -+ This setting defines how IMQ behaves in respect to its -+ hooking in PREROUTING and POSTROUTING. -+ -+ Choosing this option will make IMQ hook like this: -+ -+ PREROUTING: Before NAT -+ POSTROUTING: Before NAT -+ -+ More information can be found at: www.linuximq.net -+ -+ If not sure leave the default settings alone. -+ -+endchoice -+ -+config IMQ_NUM_DEVS -+ int "Number of IMQ devices" -+ range 2 16 -+ depends on IMQ -+ default "16" -+ help -+ This setting defines how many IMQ devices will be created. -+ -+ The default value is 16. -+ -+ More information can be found at: www.linuximq.net -+ -+ If not sure leave the default settings alone. -+ - config TUN - tristate "Universal TUN/TAP device driver support" - select CRC32 -diff -uNr linux-3.9.1/drivers/net/Makefile linux-3.9.1-imqmq/drivers/net/Makefile ---- linux-3.9.1/drivers/net/Makefile 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/drivers/net/Makefile 2013-05-08 17:30:29.011952562 +0300 -@@ -9,6 +9,7 @@ - obj-$(CONFIG_DUMMY) += dummy.o - obj-$(CONFIG_EQUALIZER) += eql.o - obj-$(CONFIG_IFB) += ifb.o -+obj-$(CONFIG_IMQ) += imq.o - obj-$(CONFIG_MACVLAN) += macvlan.o - obj-$(CONFIG_MACVTAP) += macvtap.o - obj-$(CONFIG_MII) += mii.o -diff -uNr linux-3.9.1/include/linux/imq.h linux-3.9.1-imqmq/include/linux/imq.h ---- linux-3.9.1/include/linux/imq.h 1970-01-01 02:00:00.000000000 +0200 -+++ linux-3.9.1-imqmq/include/linux/imq.h 2013-05-08 17:30:29.011952562 +0300 -@@ -0,0 +1,13 @@ -+#ifndef _IMQ_H -+#define _IMQ_H -+ -+/* IFMASK (16 device indexes, 0 to 15) and flag(s) fit in 5 bits */ -+#define IMQ_F_BITS 5 -+ -+#define IMQ_F_IFMASK 0x0f -+#define IMQ_F_ENQUEUE 0x10 -+ -+#define IMQ_MAX_DEVS (IMQ_F_IFMASK + 1) -+ -+#endif /* _IMQ_H */ -+ -diff -uNr linux-3.9.1/include/linux/netfilter/xt_IMQ.h linux-3.9.1-imqmq/include/linux/netfilter/xt_IMQ.h ---- linux-3.9.1/include/linux/netfilter/xt_IMQ.h 1970-01-01 02:00:00.000000000 +0200 -+++ linux-3.9.1-imqmq/include/linux/netfilter/xt_IMQ.h 2013-05-08 17:30:29.011952562 +0300 -@@ -0,0 +1,9 @@ -+#ifndef _XT_IMQ_H -+#define _XT_IMQ_H -+ -+struct xt_imq_info { -+ unsigned int todev; /* target imq device */ -+}; -+ -+#endif /* _XT_IMQ_H */ -+ -diff -uNr linux-3.9.1/include/linux/netfilter_ipv4/ipt_IMQ.h linux-3.9.1-imqmq/include/linux/netfilter_ipv4/ipt_IMQ.h ---- linux-3.9.1/include/linux/netfilter_ipv4/ipt_IMQ.h 1970-01-01 02:00:00.000000000 +0200 -+++ linux-3.9.1-imqmq/include/linux/netfilter_ipv4/ipt_IMQ.h 2013-05-08 17:30:29.011952562 +0300 -@@ -0,0 +1,10 @@ -+#ifndef _IPT_IMQ_H -+#define _IPT_IMQ_H -+ -+/* Backwards compatibility for old userspace */ -+#include -+ -+#define ipt_imq_info xt_imq_info -+ -+#endif /* _IPT_IMQ_H */ -+ -diff -uNr linux-3.9.1/include/linux/netfilter_ipv6/ip6t_IMQ.h 
linux-3.9.1-imqmq/include/linux/netfilter_ipv6/ip6t_IMQ.h ---- linux-3.9.1/include/linux/netfilter_ipv6/ip6t_IMQ.h 1970-01-01 02:00:00.000000000 +0200 -+++ linux-3.9.1-imqmq/include/linux/netfilter_ipv6/ip6t_IMQ.h 2013-05-08 17:30:29.011952562 +0300 -@@ -0,0 +1,10 @@ -+#ifndef _IP6T_IMQ_H -+#define _IP6T_IMQ_H -+ -+/* Backwards compatibility for old userspace */ -+#include -+ -+#define ip6t_imq_info xt_imq_info -+ -+#endif /* _IP6T_IMQ_H */ -+ -diff -uNr linux-3.9.1/include/net/netfilter/nf_queue.h linux-3.9.1-imqmq/include/net/netfilter/nf_queue.h ---- linux-3.9.1/include/net/netfilter/nf_queue.h 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/include/net/netfilter/nf_queue.h 2013-05-08 17:30:29.015285965 +0300 -@@ -26,5 +26,11 @@ - void nf_register_queue_handler(const struct nf_queue_handler *qh); - void nf_unregister_queue_handler(void); - extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -+extern void nf_queue_entry_release_refs(struct nf_queue_entry *entry); -+ -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+extern void nf_register_queue_imq_handler(const struct nf_queue_handler *qh); -+extern void nf_unregister_queue_imq_handler(void); -+#endif - - #endif /* _NF_QUEUE_H */ -diff -uNr linux-3.9.1/include/uapi/linux/netfilter.h linux-3.9.1-imqmq/include/uapi/linux/netfilter.h ---- linux-3.9.1/include/uapi/linux/netfilter.h 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/include/uapi/linux/netfilter.h 2013-05-08 17:30:29.015285965 +0300 -@@ -13,7 +13,8 @@ - #define NF_QUEUE 3 - #define NF_REPEAT 4 - #define NF_STOP 5 --#define NF_MAX_VERDICT NF_STOP -+#define NF_IMQ_QUEUE 6 -+#define NF_MAX_VERDICT NF_IMQ_QUEUE - - /* we overload the higher bits for encoding auxiliary data such as the queue - * number or errno values. 
Not nice, but better than additional function -diff -uNr linux-3.9.1/net/core/dev.c linux-3.9.1-imqmq/net/core/dev.c ---- linux-3.9.1/net/core/dev.c 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/core/dev.c 2013-05-08 17:30:29.018619368 +0300 -@@ -129,6 +129,9 @@ - #include - #include - #include -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+#include -+#endif - - #include "net-sysfs.h" - -@@ -2529,7 +2532,12 @@ - } - } - -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ if (!list_empty(&ptype_all) && -+ !(skb->imq_flags & IMQ_F_ENQUEUE)) -+#else - if (!list_empty(&ptype_all)) -+#endif - dev_queue_xmit_nit(skb, dev); - - skb_len = skb->len; -diff -uNr linux-3.9.1/net/core/skbuff.c linux-3.9.1-imqmq/net/core/skbuff.c ---- linux-3.9.1/net/core/skbuff.c 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/core/skbuff.c 2013-05-08 17:30:29.021952772 +0300 -@@ -73,6 +73,9 @@ - - struct kmem_cache *skbuff_head_cache __read_mostly; - static struct kmem_cache *skbuff_fclone_cache __read_mostly; -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+static struct kmem_cache *skbuff_cb_store_cache __read_mostly; -+#endif - - static void sock_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -@@ -92,6 +95,82 @@ - return 1; - } - -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+/* Control buffer save/restore for IMQ devices */ -+struct skb_cb_table { -+ char cb[48] __aligned(8); -+ void *cb_next; -+ atomic_t refcnt; -+}; -+ -+static DEFINE_SPINLOCK(skb_cb_store_lock); -+ -+int skb_save_cb(struct sk_buff *skb) -+{ -+ struct skb_cb_table *next; -+ -+ next = kmem_cache_alloc(skbuff_cb_store_cache, GFP_ATOMIC); -+ if (!next) -+ return -ENOMEM; -+ -+ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); -+ -+ memcpy(next->cb, skb->cb, sizeof(skb->cb)); -+ next->cb_next = skb->cb_next; -+ -+ atomic_set(&next->refcnt, 1); -+ -+ skb->cb_next = next; -+ return 0; -+} -+EXPORT_SYMBOL(skb_save_cb); -+ -+int skb_restore_cb(struct sk_buff *skb) -+{ -+ struct skb_cb_table *next; -+ -+ if (!skb->cb_next) -+ return 0; -+ -+ next = skb->cb_next; -+ -+ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); -+ -+ memcpy(skb->cb, next->cb, sizeof(skb->cb)); -+ skb->cb_next = next->cb_next; -+ -+ spin_lock(&skb_cb_store_lock); -+ -+ if (atomic_dec_and_test(&next->refcnt)) -+ kmem_cache_free(skbuff_cb_store_cache, next); -+ -+ spin_unlock(&skb_cb_store_lock); -+ -+ return 0; -+} -+EXPORT_SYMBOL(skb_restore_cb); -+ -+static void skb_copy_stored_cb(struct sk_buff *new, const struct sk_buff *__old) -+{ -+ struct skb_cb_table *next; -+ struct sk_buff *old; -+ -+ if (!__old->cb_next) { -+ new->cb_next = NULL; -+ return; -+ } -+ -+ spin_lock(&skb_cb_store_lock); -+ -+ old = (struct sk_buff *)__old; -+ -+ next = old->cb_next; -+ atomic_inc(&next->refcnt); -+ new->cb_next = next; -+ -+ spin_unlock(&skb_cb_store_lock); -+} -+#endif - - /* Pipe buffer operations for a socket. */ - static const struct pipe_buf_operations sock_pipe_buf_ops = { -@@ -562,6 +641,28 @@ - WARN_ON(in_irq()); - skb->destructor(skb); - } -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ /* -+ * This should not happen. When it does, avoid memleak by restoring -+ * the chain of cb-backups. -+ */ -+ while (skb->cb_next != NULL) { -+ if (net_ratelimit()) -+ pr_warn("IMQ: kfree_skb: skb->cb_next: %08x\n", -+ (unsigned int)skb->cb_next); -+ -+ skb_restore_cb(skb); -+ } -+ /* -+ * This should not happen either, nf_queue_entry is nullified in -+ * imq_dev_xmit(). 
If we have non-NULL nf_queue_entry then we are -+ * leaking entry pointers, maybe memory. We don't know if this is -+ * pointer to already freed memory, or should this be freed. -+ * If this happens we need to add refcounting, etc for nf_queue_entry. -+ */ -+ if (skb->nf_queue_entry && net_ratelimit()) -+ pr_warn("%s\n", "IMQ: kfree_skb: skb->nf_queue_entry != NULL"); -+#endif - #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb->nfct); - #endif -@@ -683,6 +784,9 @@ - new->sp = secpath_get(old->sp); - #endif - memcpy(new->cb, old->cb, sizeof(old->cb)); -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ skb_copy_stored_cb(new, old); -+#endif - new->csum = old->csum; - new->local_df = old->local_df; - new->pkt_type = old->pkt_type; -@@ -3053,6 +3157,13 @@ - 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ skbuff_cb_store_cache = kmem_cache_create("skbuff_cb_store_cache", -+ sizeof(struct skb_cb_table), -+ 0, -+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, -+ NULL); -+#endif - } - - /** -diff -uNr linux-3.9.1/net/ipv6/ip6_output.c linux-3.9.1-imqmq/net/ipv6/ip6_output.c ---- linux-3.9.1/net/ipv6/ip6_output.c 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/ipv6/ip6_output.c 2013-05-08 17:30:29.021952772 +0300 -@@ -89,9 +89,6 @@ - struct in6_addr *nexthop; - int ret; - -- skb->protocol = htons(ETH_P_IPV6); -- skb->dev = dev; -- - if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { - struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - -@@ -167,6 +164,13 @@ - return 0; - } - -+ /* -+ * IMQ-patch: moved setting skb->dev and skb->protocol from -+ * ip6_finish_output2 to fix crashing at netif_skb_features(). -+ */ -+ skb->protocol = htons(ETH_P_IPV6); -+ skb->dev = dev; -+ - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, - ip6_finish_output, - !(IP6CB(skb)->flags & IP6SKB_REROUTED)); -diff -uNr linux-3.9.1/net/netfilter/core.c linux-3.9.1-imqmq/net/netfilter/core.c ---- linux-3.9.1/net/netfilter/core.c 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/netfilter/core.c 2013-05-08 17:30:29.025286174 +0300 -@@ -188,9 +188,11 @@ - ret = NF_DROP_GETERR(verdict); - if (ret == 0) - ret = -EPERM; -- } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { -+ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE || -+ (verdict & NF_VERDICT_MASK) == NF_IMQ_QUEUE) { - int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, -- verdict >> NF_VERDICT_QBITS); -+ verdict >> NF_VERDICT_QBITS, -+ verdict & NF_VERDICT_MASK); - if (err < 0) { - if (err == -ECANCELED) - goto next_hook; -diff -uNr linux-3.9.1/net/netfilter/Kconfig linux-3.9.1-imqmq/net/netfilter/Kconfig ---- linux-3.9.1/net/netfilter/Kconfig 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/netfilter/Kconfig 2013-05-08 17:30:29.025286174 +0300 -@@ -641,6 +641,18 @@ - - To compile it as a module, choose M here. If unsure, say N. - -+config NETFILTER_XT_TARGET_IMQ -+ tristate '"IMQ" target support' -+ depends on NETFILTER_XTABLES -+ depends on IP_NF_MANGLE || IP6_NF_MANGLE -+ select IMQ -+ default m if NETFILTER_ADVANCED=n -+ help -+ This option adds a `IMQ' target which is used to specify if and -+ to which imq device packets should get enqueued/dequeued. -+ -+ To compile it as a module, choose M here. If unsure, say N. 
-+ - config NETFILTER_XT_TARGET_MARK - tristate '"MARK" target support' - depends on NETFILTER_ADVANCED -diff -uNr linux-3.9.1/net/netfilter/Makefile linux-3.9.1-imqmq/net/netfilter/Makefile ---- linux-3.9.1/net/netfilter/Makefile 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/netfilter/Makefile 2013-05-08 17:30:29.025286174 +0300 -@@ -82,6 +82,7 @@ - obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o - obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o - obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o -+obj-$(CONFIG_NETFILTER_XT_TARGET_IMQ) += xt_IMQ.o - obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o - obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o - obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o -diff -uNr linux-3.9.1/net/netfilter/nf_internals.h linux-3.9.1-imqmq/net/netfilter/nf_internals.h ---- linux-3.9.1/net/netfilter/nf_internals.h 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/netfilter/nf_internals.h 2013-05-08 17:30:29.025286174 +0300 -@@ -29,7 +29,7 @@ - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), -- unsigned int queuenum); -+ unsigned int queuenum, unsigned int queuetype); - extern int __init netfilter_queue_init(void); - - /* nf_log.c */ -diff -uNr linux-3.9.1/net/netfilter/nf_queue.c linux-3.9.1-imqmq/net/netfilter/nf_queue.c ---- linux-3.9.1/net/netfilter/nf_queue.c 2013-05-08 06:58:03.000000000 +0300 -+++ linux-3.9.1-imqmq/net/netfilter/nf_queue.c 2013-05-08 17:30:29.025286174 +0300 -@@ -22,6 +22,23 @@ - */ - static const struct nf_queue_handler __rcu *queue_handler __read_mostly; - -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+static const struct nf_queue_handler __rcu *queue_imq_handler __read_mostly; -+ -+void nf_register_queue_imq_handler(const struct nf_queue_handler *qh) -+{ -+ rcu_assign_pointer(queue_imq_handler, qh); -+} -+EXPORT_SYMBOL_GPL(nf_register_queue_imq_handler); -+ -+void nf_unregister_queue_imq_handler(void) -+{ -+ RCU_INIT_POINTER(queue_imq_handler, NULL); -+ synchronize_rcu(); -+} -+EXPORT_SYMBOL_GPL(nf_unregister_queue_imq_handler); -+#endif -+ - /* return EBUSY when somebody else is registered, return EEXIST if the - * same handler is registered, return 0 in case of success. */ - void nf_register_queue_handler(const struct nf_queue_handler *qh) -@@ -71,7 +89,8 @@ - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), -- unsigned int queuenum) -+ unsigned int queuenum, -+ unsigned int queuetype) - { - int status = -ENOENT; - struct nf_queue_entry *entry = NULL; -@@ -85,7 +104,17 @@ - /* QUEUE == DROP if no one is waiting, to be safe. 
*/ - rcu_read_lock(); - -- qh = rcu_dereference(queue_handler); -+ if (queuetype == NF_IMQ_QUEUE) { -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ qh = rcu_dereference(queue_imq_handler); -+#else -+ BUG(); -+ goto err_unlock; -+#endif -+ } else { -+ qh = rcu_dereference(queue_handler); -+ } -+ - if (!qh) { - status = -ESRCH; - goto err_unlock; -@@ -233,9 +261,11 @@ - local_bh_enable(); - break; - case NF_QUEUE: -+ case NF_IMQ_QUEUE: - err = nf_queue(skb, elem, entry->pf, entry->hook, - entry->indev, entry->outdev, entry->okfn, -- verdict >> NF_VERDICT_QBITS); -+ verdict >> NF_VERDICT_QBITS, -+ verdict & NF_VERDICT_MASK); - if (err < 0) { - if (err == -ECANCELED) - goto next_hook; -diff -uNr linux-3.9.1/net/netfilter/xt_IMQ.c linux-3.9.1-imqmq/net/netfilter/xt_IMQ.c ---- linux-3.9.1/net/netfilter/xt_IMQ.c 1970-01-01 02:00:00.000000000 +0200 -+++ linux-3.9.1-imqmq/net/netfilter/xt_IMQ.c 2013-05-08 17:30:29.025286174 +0300 -@@ -0,0 +1,72 @@ -+/* -+ * This target marks packets to be enqueued to an imq device -+ */ -+#include -+#include -+#include -+#include -+#include -+ -+static unsigned int imq_target(struct sk_buff *pskb, -+ const struct xt_action_param *par) -+{ -+ const struct xt_imq_info *mr = par->targinfo; -+ -+ pskb->imq_flags = (mr->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE; -+ -+ return XT_CONTINUE; -+} -+ -+static int imq_checkentry(const struct xt_tgchk_param *par) -+{ -+ struct xt_imq_info *mr = par->targinfo; -+ -+ if (mr->todev > IMQ_MAX_DEVS - 1) { -+ pr_warn("IMQ: invalid device specified, highest is %u\n", -+ IMQ_MAX_DEVS - 1); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static struct xt_target xt_imq_reg[] __read_mostly = { -+ { -+ .name = "IMQ", -+ .family = AF_INET, -+ .checkentry = imq_checkentry, -+ .target = imq_target, -+ .targetsize = sizeof(struct xt_imq_info), -+ .table = "mangle", -+ .me = THIS_MODULE -+ }, -+ { -+ .name = "IMQ", -+ .family = AF_INET6, -+ .checkentry = imq_checkentry, -+ .target = imq_target, -+ .targetsize = sizeof(struct xt_imq_info), -+ .table = "mangle", -+ .me = THIS_MODULE -+ }, -+}; -+ -+static int __init imq_init(void) -+{ -+ return xt_register_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); -+} -+ -+static void __exit imq_fini(void) -+{ -+ xt_unregister_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); -+} -+ -+module_init(imq_init); -+module_exit(imq_fini); -+ -+MODULE_AUTHOR("http://www.linuximq.net"); -+MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See http://www.linuximq.net/ for more information."); -+MODULE_LICENSE("GPL"); -+MODULE_ALIAS("ipt_IMQ"); -+MODULE_ALIAS("ip6t_IMQ"); -+ -diff -uNr linux-3.10.23/include/linux/skbuff.h linux-3.10.23-imqmq/include/linux/skbuff.h ---- linux-3.10.23/include/linux/skbuff.h 2013-12-08 17:17:21.000000000 +0100 -+++ linux-3.10.23-imqmq/include/linux/skbuff.h 2013-12-09 15:49:26.329991968 +0100 -@@ -33,6 +33,9 @@ - #include - #include - #include -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+#include -+#endif - - /* Don't change this without changing skb_csum_unnecessary! */ - #define CHECKSUM_NONE 0 -@@ -414,6 +417,9 @@ - * first. This is owned by whoever has the skb queued ATM. 
- */ - char cb[48] __aligned(8); -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ void *cb_next; -+#endif - - unsigned long _skb_refdst; - #ifdef CONFIG_XFRM -@@ -449,6 +455,9 @@ - #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - struct nf_conntrack *nfct; - #endif -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ struct nf_queue_entry *nf_queue_entry; -+#endif - #ifdef CONFIG_BRIDGE_NETFILTER - struct nf_bridge_info *nf_bridge; - #endif -@@ -488,6 +497,10 @@ - /* 7/9 bit hole (depending on ndisc_nodetype presence) */ - kmemcheck_bitfield_end(flags2); - -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ __u8 imq_flags:IMQ_F_BITS; -+#endif -+ - #ifdef CONFIG_NET_DMA - dma_cookie_t dma_cookie; - #endif -@@ -617,6 +630,12 @@ - return (struct rtable *)skb_dst(skb); - } - -+ -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+extern int skb_save_cb(struct sk_buff *skb); -+extern int skb_restore_cb(struct sk_buff *skb); -+#endif -+ - extern void kfree_skb(struct sk_buff *skb); - extern void kfree_skb_list(struct sk_buff *segs); - extern void skb_tx_error(struct sk_buff *skb); -@@ -2730,6 +2749,10 @@ - nf_conntrack_get(src->nfct); - dst->nfctinfo = src->nfctinfo; - #endif -+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) -+ dst->imq_flags = src->imq_flags; -+ dst->nf_queue_entry = src->nf_queue_entry; -+#endif - #ifdef CONFIG_BRIDGE_NETFILTER - dst->nf_bridge = src->nf_bridge; - nf_bridge_get(src->nf_bridge); diff --git a/src/patches/linux-3.10.25-imq.patch b/src/patches/linux-3.10.25-imq.patch new file mode 100644 index 0000000000..cb4a2d418c --- /dev/null +++ b/src/patches/linux-3.10.25-imq.patch @@ -0,0 +1,6800 @@ +diff -ruN linux-3.10.27/drivers/net/imq.c linux-3.10.27-imq/drivers/net/imq.c +--- linux-3.10.27/drivers/net/imq.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/drivers/net/imq.c 2014-01-18 10:19:59.342342913 +0100 +@@ -0,0 +1,1001 @@ ++/* ++ * Pseudo-driver for the intermediate queue device. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Authors: Patrick McHardy, ++ * ++ * The first version was written by Martin Devera, ++ * ++ * Credits: Jan Rafaj ++ * - Update patch to 2.4.21 ++ * Sebastian Strollo ++ * - Fix "Dead-loop on netdevice imq"-issue ++ * Marcel Sebek ++ * - Update to 2.6.2-rc1 ++ * ++ * After some time of inactivity there is a group taking care ++ * of IMQ again: http://www.linuximq.net ++ * ++ * ++ * 2004/06/30 - New version of IMQ patch to kernels <=2.6.7 ++ * including the following changes: ++ * ++ * - Correction of ipv6 support "+"s issue (Hasso Tepper) ++ * - Correction of imq_init_devs() issue that resulted in ++ * kernel OOPS unloading IMQ as module (Norbert Buchmuller) ++ * - Addition of functionality to choose number of IMQ devices ++ * during kernel config (Andre Correa) ++ * - Addition of functionality to choose how IMQ hooks on ++ * PRE and POSTROUTING (after or before NAT) (Andre Correa) ++ * - Cosmetic corrections (Norbert Buchmuller) (Andre Correa) ++ * ++ * ++ * 2005/12/16 - IMQ versions between 2.6.7 and 2.6.13 were ++ * released with almost no problems. 
2.6.14-x was released ++ * with some important changes: nfcache was removed; After ++ * some weeks of trouble we figured out that some IMQ fields ++ * in skb were missing in skbuff.c - skb_clone and copy_skb_header. ++ * These functions are correctly patched by this new patch version. ++ * ++ * Thanks for all who helped to figure out all the problems with ++ * 2.6.14.x: Patrick McHardy, Rune Kock, VeNoMouS, Max CtRiX, ++ * Kevin Shanahan, Richard Lucassen, Valery Dachev (hopefully ++ * I didn't forget anybody). I apologize again for my lack of time. ++ * ++ * ++ * 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead ++ * of qdisc_restart() and moved qdisc_run() to tasklet to avoid ++ * recursive locking. New initialization routines to fix 'rmmod' not ++ * working anymore. Used code from ifb.c. (Jussi Kivilinna) ++ * ++ * 2008/08/06 - 2.6.26 - (JK) ++ * - Replaced tasklet with 'netif_schedule()'. ++ * - Cleaned up and added comments for imq_nf_queue(). ++ * ++ * 2009/04/12 ++ * - Add skb_save_cb/skb_restore_cb helper functions for backuping ++ * control buffer. This is needed because qdisc-layer on kernels ++ * 2.6.27 and newer overwrite control buffer. (Jussi Kivilinna) ++ * - Add better locking for IMQ device. Hopefully this will solve ++ * SMP issues. (Jussi Kivilinna) ++ * - Port to 2.6.27 ++ * - Port to 2.6.28 ++ * - Port to 2.6.29 + fix rmmod not working ++ * ++ * 2009/04/20 - (Jussi Kivilinna) ++ * - Use netdevice feature flags to avoid extra packet handling ++ * by core networking layer and possibly increase performance. ++ * ++ * 2009/09/26 - (Jussi Kivilinna) ++ * - Add imq_nf_reinject_lockless to fix deadlock with ++ * imq_nf_queue/imq_nf_reinject. ++ * ++ * 2009/12/08 - (Jussi Kivilinna) ++ * - Port to 2.6.32 ++ * - Add check for skb->nf_queue_entry==NULL in imq_dev_xmit() ++ * - Also add better error checking for skb->nf_queue_entry usage ++ * ++ * 2010/02/25 - (Jussi Kivilinna) ++ * - Port to 2.6.33 ++ * ++ * 2010/08/15 - (Jussi Kivilinna) ++ * - Port to 2.6.35 ++ * - Simplify hook registration by using nf_register_hooks. ++ * - nf_reinject doesn't need spinlock around it, therefore remove ++ * imq_nf_reinject function. Other nf_reinject users protect ++ * their own data with spinlock. With IMQ however all data is ++ * needed is stored per skbuff, so no locking is needed. ++ * - Changed IMQ to use 'separate' NF_IMQ_QUEUE instead of ++ * NF_QUEUE, this allows working coexistance of IMQ and other ++ * NF_QUEUE users. ++ * - Make IMQ multi-queue. Number of IMQ device queues can be ++ * increased with 'numqueues' module parameters. Default number ++ * of queues is 1, in other words by default IMQ works as ++ * single-queue device. Multi-queue selection is based on ++ * IFB multi-queue patch by Changli Gao . ++ * ++ * 2011/03/18 - (Jussi Kivilinna) ++ * - Port to 2.6.38 ++ * ++ * 2011/07/12 - (syoder89@gmail.com) ++ * - Crash fix that happens when the receiving interface has more ++ * than one queue (add missing skb_set_queue_mapping in ++ * imq_select_queue). ++ * ++ * 2011/07/26 - (Jussi Kivilinna) ++ * - Add queue mapping checks for packets exiting IMQ. ++ * - Port to 3.0 ++ * ++ * 2011/08/16 - (Jussi Kivilinna) ++ * - Clear IFF_TX_SKB_SHARING flag that was added for linux 3.0.2 ++ * ++ * 2011/11/03 - Germano Michel ++ * - Fix IMQ for net namespaces ++ * ++ * 2011/11/04 - Jussi Kivilinna ++ * - Port to 3.1 ++ * - Clean-up, move 'get imq device pointer by imqX name' to ++ * separate function from imq_nf_queue(). 
++ * ++ * 2012/01/05 - Jussi Kivilinna ++ * - Port to 3.2 ++ * ++ * 2012/03/19 - Jussi Kivilinna ++ * - Port to 3.3 ++ * ++ * 2012/12/12 - Jussi Kivilinna ++ * - Port to 3.7 ++ * - Fix checkpatch.pl warnings ++ * ++ * 2013/09/10 - Jussi Kivilinna ++ * - Fixed GSO handling for 3.10, see imq_nf_queue() for comments. ++ * - Don't copy skb->cb_next when copying or cloning skbuffs. ++ * ++ * Also, many thanks to pablo Sebastian Greco for making the initial ++ * patch and to those who helped the testing. ++ * ++ * More info at: http://www.linuximq.net/ (Andre Correa) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ #include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num); ++ ++static nf_hookfn imq_nf_hook; ++ ++static struct nf_hook_ops imq_ops[] = { ++ { ++ /* imq_ingress_ipv4 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET, ++ .hooknum = NF_INET_PRE_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ .priority = NF_IP_PRI_MANGLE + 1, ++#else ++ .priority = NF_IP_PRI_NAT_DST + 1, ++#endif ++ }, ++ { ++ /* imq_egress_ipv4 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET, ++ .hooknum = NF_INET_POST_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) ++ .priority = NF_IP_PRI_LAST, ++#else ++ .priority = NF_IP_PRI_NAT_SRC - 1, ++#endif ++ }, ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ { ++ /* imq_ingress_ipv6 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET6, ++ .hooknum = NF_INET_PRE_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ .priority = NF_IP6_PRI_MANGLE + 1, ++#else ++ .priority = NF_IP6_PRI_NAT_DST + 1, ++#endif ++ }, ++ { ++ /* imq_egress_ipv6 */ ++ .hook = imq_nf_hook, ++ .owner = THIS_MODULE, ++ .pf = PF_INET6, ++ .hooknum = NF_INET_POST_ROUTING, ++#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA) ++ .priority = NF_IP6_PRI_LAST, ++#else ++ .priority = NF_IP6_PRI_NAT_SRC - 1, ++#endif ++ }, ++#endif ++}; ++ ++#if defined(CONFIG_IMQ_NUM_DEVS) ++static int numdevs = CONFIG_IMQ_NUM_DEVS; ++#else ++static int numdevs = IMQ_MAX_DEVS; ++#endif ++ ++static struct net_device *imq_devs_cache[IMQ_MAX_DEVS]; ++ ++#define IMQ_MAX_QUEUES 32 ++static int numqueues = 1; ++static u32 imq_hashrnd; ++ ++static inline __be16 pppoe_proto(const struct sk_buff *skb) ++{ ++ return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + ++ sizeof(struct pppoe_hdr))); ++} ++ ++static u16 imq_hash(struct net_device *dev, struct sk_buff *skb) ++{ ++ unsigned int pull_len; ++ u16 protocol = skb->protocol; ++ u32 addr1, addr2; ++ u32 hash, ihl = 0; ++ union { ++ u16 in16[2]; ++ u32 in32; ++ } ports; ++ u8 ip_proto; ++ ++ pull_len = 0; ++ ++recheck: ++ switch (protocol) { ++ case htons(ETH_P_8021Q): { ++ if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL)) ++ goto other; ++ ++ pull_len += VLAN_HLEN; ++ skb->network_header += VLAN_HLEN; ++ ++ protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; ++ goto recheck; ++ } ++ ++ case htons(ETH_P_PPP_SES): { ++ if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL)) ++ goto other; ++ ++ pull_len += PPPOE_SES_HLEN; ++ skb->network_header += PPPOE_SES_HLEN; ++ ++ protocol = pppoe_proto(skb); ++ goto recheck; 
++ } ++ ++ case htons(ETH_P_IP): { ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr)))) ++ goto other; ++ ++ addr1 = iph->daddr; ++ addr2 = iph->saddr; ++ ++ ip_proto = !(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) ? ++ iph->protocol : 0; ++ ihl = ip_hdrlen(skb); ++ ++ break; ++ } ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ case htons(ETH_P_IPV6): { ++ const struct ipv6hdr *iph = ipv6_hdr(skb); ++ __be16 fo = 0; ++ ++ if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr)))) ++ goto other; ++ ++ addr1 = iph->daddr.s6_addr32[3]; ++ addr2 = iph->saddr.s6_addr32[3]; ++ ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto, ++ &fo); ++ if (unlikely(ihl < 0)) ++ goto other; ++ ++ break; ++ } ++#endif ++ default: ++other: ++ if (pull_len != 0) { ++ skb_push(skb, pull_len); ++ skb->network_header -= pull_len; ++ } ++ ++ return (u16)(ntohs(protocol) % dev->real_num_tx_queues); ++ } ++ ++ if (addr1 > addr2) ++ swap(addr1, addr2); ++ ++ switch (ip_proto) { ++ case IPPROTO_TCP: ++ case IPPROTO_UDP: ++ case IPPROTO_DCCP: ++ case IPPROTO_ESP: ++ case IPPROTO_AH: ++ case IPPROTO_SCTP: ++ case IPPROTO_UDPLITE: { ++ if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) { ++ if (ports.in16[0] > ports.in16[1]) ++ swap(ports.in16[0], ports.in16[1]); ++ break; ++ } ++ /* fall-through */ ++ } ++ default: ++ ports.in32 = 0; ++ break; ++ } ++ ++ if (pull_len != 0) { ++ skb_push(skb, pull_len); ++ skb->network_header -= pull_len; ++ } ++ ++ hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto); ++ ++ return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); ++} ++ ++static inline bool sk_tx_queue_recorded(struct sock *sk) ++{ ++ return (sk_tx_queue_get(sk) >= 0); ++} ++ ++static struct netdev_queue *imq_select_queue(struct net_device *dev, ++ struct sk_buff *skb) ++{ ++ u16 queue_index = 0; ++ u32 hash; ++ ++ if (likely(dev->real_num_tx_queues == 1)) ++ goto out; ++ ++ /* IMQ can be receiving ingress or engress packets. 
*/ ++ ++ /* Check first for if rx_queue is set */ ++ if (skb_rx_queue_recorded(skb)) { ++ queue_index = skb_get_rx_queue(skb); ++ goto out; ++ } ++ ++ /* Check if socket has tx_queue set */ ++ if (sk_tx_queue_recorded(skb->sk)) { ++ queue_index = sk_tx_queue_get(skb->sk); ++ goto out; ++ } ++ ++ /* Try use socket hash */ ++ if (skb->sk && skb->sk->sk_hash) { ++ hash = skb->sk->sk_hash; ++ queue_index = ++ (u16)(((u64)hash * dev->real_num_tx_queues) >> 32); ++ goto out; ++ } ++ ++ /* Generate hash from packet data */ ++ queue_index = imq_hash(dev, skb); ++ ++out: ++ if (unlikely(queue_index >= dev->real_num_tx_queues)) ++ queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues); ++ ++ skb_set_queue_mapping(skb, queue_index); ++ return netdev_get_tx_queue(dev, queue_index); ++} ++ ++static struct net_device_stats *imq_get_stats(struct net_device *dev) ++{ ++ return &dev->stats; ++} ++ ++/* called for packets kfree'd in qdiscs at places other than enqueue */ ++static void imq_skb_destructor(struct sk_buff *skb) ++{ ++ struct nf_queue_entry *entry = skb->nf_queue_entry; ++ ++ skb->nf_queue_entry = NULL; ++ ++ if (entry) { ++ nf_queue_entry_release_refs(entry); ++ kfree(entry); ++ } ++ ++ skb_restore_cb(skb); /* kfree backup */ ++} ++ ++static void imq_done_check_queue_mapping(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ unsigned int queue_index; ++ ++ /* Don't let queue_mapping be left too large after exiting IMQ */ ++ if (likely(skb->dev != dev && skb->dev != NULL)) { ++ queue_index = skb_get_queue_mapping(skb); ++ if (unlikely(queue_index >= skb->dev->real_num_tx_queues)) { ++ queue_index = (u16)((u32)queue_index % ++ skb->dev->real_num_tx_queues); ++ skb_set_queue_mapping(skb, queue_index); ++ } ++ } else { ++ /* skb->dev was IMQ device itself or NULL, be on safe side and ++ * just clear queue mapping. ++ */ ++ skb_set_queue_mapping(skb, 0); ++ } ++} ++ ++static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct nf_queue_entry *entry = skb->nf_queue_entry; ++ ++ skb->nf_queue_entry = NULL; ++ dev->trans_start = jiffies; ++ ++ dev->stats.tx_bytes += skb->len; ++ dev->stats.tx_packets++; ++ ++ if (unlikely(entry == NULL)) { ++ /* We don't know what is going on here.. packet is queued for ++ * imq device, but (probably) not by us. ++ * ++ * If this packet was not send here by imq_nf_queue(), then ++ * skb_save_cb() was not used and skb_free() should not show: ++ * WARNING: IMQ: kfree_skb: skb->cb_next:.. ++ * and/or ++ * WARNING: IMQ: kfree_skb: skb->nf_queue_entry... ++ * ++ * However if this message is shown, then IMQ is somehow broken ++ * and you should report this to linuximq.net. ++ */ ++ ++ /* imq_dev_xmit is black hole that eats all packets, report that ++ * we eat this packet happily and increase dropped counters. ++ */ ++ ++ dev->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ ++ return NETDEV_TX_OK; ++ } ++ ++ skb_restore_cb(skb); /* restore skb->cb */ ++ ++ skb->imq_flags = 0; ++ skb->destructor = NULL; ++ ++ imq_done_check_queue_mapping(skb, dev); ++ ++ nf_reinject(entry, NF_ACCEPT); ++ ++ return NETDEV_TX_OK; ++} ++ ++static struct net_device *get_imq_device_by_index(int index) ++{ ++ struct net_device *dev = NULL; ++ struct net *net; ++ char buf[8]; ++ ++ /* get device by name and cache result */ ++ snprintf(buf, sizeof(buf), "imq%d", index); ++ ++ /* Search device from all namespaces. 
*/ ++ for_each_net(net) { ++ dev = dev_get_by_name(net, buf); ++ if (dev) ++ break; ++ } ++ ++ if (WARN_ON_ONCE(dev == NULL)) { ++ /* IMQ device not found. Exotic config? */ ++ return ERR_PTR(-ENODEV); ++ } ++ ++ imq_devs_cache[index] = dev; ++ dev_put(dev); ++ ++ return dev; ++} ++ ++static struct nf_queue_entry *nf_queue_entry_dup(struct nf_queue_entry *e) ++{ ++ struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); ++ if (entry) { ++ if (nf_queue_entry_get_refs(entry)) ++ return entry; ++ kfree(entry); ++ } ++ return NULL; ++} ++ ++#ifdef CONFIG_BRIDGE_NETFILTER ++/* When called from bridge netfilter, skb->data must point to MAC header ++ * before calling skb_gso_segment(). Else, original MAC header is lost ++ * and segmented skbs will be sent to wrong destination. ++ */ ++static void nf_bridge_adjust_skb_data(struct sk_buff *skb) ++{ ++ if (skb->nf_bridge) ++ __skb_push(skb, skb->network_header - skb->mac_header); ++} ++ ++static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) ++{ ++ if (skb->nf_bridge) ++ __skb_pull(skb, skb->network_header - skb->mac_header); ++} ++#else ++#define nf_bridge_adjust_skb_data(s) do {} while (0) ++#define nf_bridge_adjust_segmented_data(s) do {} while (0) ++#endif ++ ++static void free_entry(struct nf_queue_entry *entry) ++{ ++ nf_queue_entry_release_refs(entry); ++ kfree(entry); ++} ++ ++static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev); ++ ++static int __imq_nf_queue_gso(struct nf_queue_entry *entry, ++ struct net_device *dev, struct sk_buff *skb) ++{ ++ int ret = -ENOMEM; ++ struct nf_queue_entry *entry_seg; ++ ++ nf_bridge_adjust_segmented_data(skb); ++ ++ if (skb->next == NULL) { /* last packet, no need to copy entry */ ++ struct sk_buff *gso_skb = entry->skb; ++ entry->skb = skb; ++ ret = __imq_nf_queue(entry, dev); ++ if (ret) ++ entry->skb = gso_skb; ++ return ret; ++ } ++ ++ skb->next = NULL; ++ ++ entry_seg = nf_queue_entry_dup(entry); ++ if (entry_seg) { ++ entry_seg->skb = skb; ++ ret = __imq_nf_queue(entry_seg, dev); ++ if (ret) ++ free_entry(entry_seg); ++ } ++ return ret; ++} ++ ++static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num) ++{ ++ struct sk_buff *skb, *segs; ++ struct net_device *dev; ++ unsigned int queued; ++ int index, retval, err; ++ ++ index = entry->skb->imq_flags & IMQ_F_IFMASK; ++ if (unlikely(index > numdevs - 1)) { ++ if (net_ratelimit()) ++ pr_warn("IMQ: invalid device specified, highest is %u\n", ++ numdevs - 1); ++ retval = -EINVAL; ++ goto out_no_dev; ++ } ++ ++ /* check for imq device by index from cache */ ++ dev = imq_devs_cache[index]; ++ if (unlikely(!dev)) { ++ dev = get_imq_device_by_index(index); ++ if (IS_ERR(dev)) { ++ retval = PTR_ERR(dev); ++ goto out_no_dev; ++ } ++ } ++ ++ if (unlikely(!(dev->flags & IFF_UP))) { ++ entry->skb->imq_flags = 0; ++ retval = -ECANCELED; ++ goto out_no_dev; ++ } ++ ++ if (!skb_is_gso(entry->skb)) ++ return __imq_nf_queue(entry, dev); ++ ++ /* Since 3.10.x, GSO handling moved here as result of upstream commit ++ * a5fedd43d5f6c94c71053a66e4c3d2e35f1731a2 (netfilter: move ++ * skb_gso_segment into nfnetlink_queue module). ++ * ++ * Following code replicates the gso handling from ++ * 'net/netfilter/nfnetlink_queue_core.c':nfqnl_enqueue_packet(). 
++ */ ++ ++ skb = entry->skb; ++ ++ switch (entry->pf) { ++ case NFPROTO_IPV4: ++ skb->protocol = htons(ETH_P_IP); ++ break; ++ case NFPROTO_IPV6: ++ skb->protocol = htons(ETH_P_IPV6); ++ break; ++ } ++ ++ nf_bridge_adjust_skb_data(skb); ++ segs = skb_gso_segment(skb, 0); ++ /* Does not use PTR_ERR to limit the number of error codes that can be ++ * returned by nf_queue. For instance, callers rely on -ECANCELED to ++ * mean 'ignore this hook'. ++ */ ++ err = -ENOBUFS; ++ if (IS_ERR(segs)) ++ goto out_err; ++ queued = 0; ++ err = 0; ++ do { ++ struct sk_buff *nskb = segs->next; ++ if (nskb && nskb->next) ++ nskb->cb_next = NULL; ++ if (err == 0) ++ err = __imq_nf_queue_gso(entry, dev, segs); ++ if (err == 0) ++ queued++; ++ else ++ kfree_skb(segs); ++ segs = nskb; ++ } while (segs); ++ ++ if (queued) { ++ if (err) /* some segments are already queued */ ++ free_entry(entry); ++ kfree_skb(skb); ++ return 0; ++ } ++ ++out_err: ++ nf_bridge_adjust_segmented_data(skb); ++ retval = err; ++out_no_dev: ++ return retval; ++} ++ ++static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev) ++{ ++ struct sk_buff *skb_orig, *skb, *skb_shared; ++ struct Qdisc *q; ++ struct netdev_queue *txq; ++ spinlock_t *root_lock; ++ int users; ++ int retval = -EINVAL; ++ unsigned int orig_queue_index; ++ ++ dev->last_rx = jiffies; ++ ++ skb = entry->skb; ++ skb_orig = NULL; ++ ++ /* skb has owner? => make clone */ ++ if (unlikely(skb->destructor)) { ++ skb_orig = skb; ++ skb = skb_clone(skb, GFP_ATOMIC); ++ if (unlikely(!skb)) { ++ retval = -ENOMEM; ++ goto out; ++ } ++ skb->cb_next = NULL; ++ entry->skb = skb; ++ } ++ ++ skb->nf_queue_entry = entry; ++ ++ dev->stats.rx_bytes += skb->len; ++ dev->stats.rx_packets++; ++ ++ if (!skb->dev) { ++ /* skb->dev == NULL causes problems, try the find cause. */ ++ if (net_ratelimit()) { ++ dev_warn(&dev->dev, ++ "received packet with skb->dev == NULL\n"); ++ dump_stack(); ++ } ++ ++ skb->dev = dev; ++ } ++ ++ /* Disables softirqs for lock below */ ++ rcu_read_lock_bh(); ++ ++ /* Multi-queue selection */ ++ orig_queue_index = skb_get_queue_mapping(skb); ++ txq = imq_select_queue(dev, skb); ++ ++ q = rcu_dereference(txq->qdisc); ++ if (unlikely(!q->enqueue)) ++ goto packet_not_eaten_by_imq_dev; ++ ++ root_lock = qdisc_lock(q); ++ spin_lock(root_lock); ++ ++ users = atomic_read(&skb->users); ++ ++ skb_shared = skb_get(skb); /* increase reference count by one */ ++ ++ /* backup skb->cb, as qdisc layer will overwrite it */ ++ skb_save_cb(skb_shared); ++ qdisc_enqueue_root(skb_shared, q); /* might kfree_skb */ ++ ++ if (likely(atomic_read(&skb_shared->users) == users + 1)) { ++ kfree_skb(skb_shared); /* decrease reference count by one */ ++ ++ skb->destructor = &imq_skb_destructor; ++ ++ /* cloned? */ ++ if (unlikely(skb_orig)) ++ kfree_skb(skb_orig); /* free original */ ++ ++ spin_unlock(root_lock); ++ rcu_read_unlock_bh(); ++ ++ /* schedule qdisc dequeue */ ++ __netif_schedule(q); ++ ++ retval = 0; ++ goto out; ++ } else { ++ skb_restore_cb(skb_shared); /* restore skb->cb */ ++ skb->nf_queue_entry = NULL; ++ /* ++ * qdisc dropped packet and decreased skb reference count of ++ * skb, so we don't really want to and try refree as that would ++ * actually destroy the skb. ++ */ ++ spin_unlock(root_lock); ++ goto packet_not_eaten_by_imq_dev; ++ } ++ ++packet_not_eaten_by_imq_dev: ++ skb_set_queue_mapping(skb, orig_queue_index); ++ rcu_read_unlock_bh(); ++ ++ /* cloned? 
restore original */ ++ if (unlikely(skb_orig)) { ++ kfree_skb(skb); ++ entry->skb = skb_orig; ++ } ++ retval = -1; ++out: ++ return retval; ++} ++ ++static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb, ++ const struct net_device *indev, ++ const struct net_device *outdev, ++ int (*okfn)(struct sk_buff *)) ++{ ++ return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT; ++} ++ ++static int imq_close(struct net_device *dev) ++{ ++ netif_stop_queue(dev); ++ return 0; ++} ++ ++static int imq_open(struct net_device *dev) ++{ ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static const struct net_device_ops imq_netdev_ops = { ++ .ndo_open = imq_open, ++ .ndo_stop = imq_close, ++ .ndo_start_xmit = imq_dev_xmit, ++ .ndo_get_stats = imq_get_stats, ++}; ++ ++static void imq_setup(struct net_device *dev) ++{ ++ dev->netdev_ops = &imq_netdev_ops; ++ dev->type = ARPHRD_VOID; ++ dev->mtu = 16000; /* too small? */ ++ dev->tx_queue_len = 11000; /* too big? */ ++ dev->flags = IFF_NOARP; ++ dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | ++ NETIF_F_GSO | NETIF_F_HW_CSUM | ++ NETIF_F_HIGHDMA; ++ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | ++ IFF_TX_SKB_SHARING); ++} ++ ++static int imq_validate(struct nlattr *tb[], struct nlattr *data[]) ++{ ++ int ret = 0; ++ ++ if (tb[IFLA_ADDRESS]) { ++ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { ++ ret = -EINVAL; ++ goto end; ++ } ++ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { ++ ret = -EADDRNOTAVAIL; ++ goto end; ++ } ++ } ++ return 0; ++end: ++ pr_warn("IMQ: imq_validate failed (%d)\n", ret); ++ return ret; ++} ++ ++static struct rtnl_link_ops imq_link_ops __read_mostly = { ++ .kind = "imq", ++ .priv_size = 0, ++ .setup = imq_setup, ++ .validate = imq_validate, ++}; ++ ++static const struct nf_queue_handler imq_nfqh = { ++ .outfn = imq_nf_queue, ++}; ++ ++static int __init imq_init_hooks(void) ++{ ++ int ret; ++ ++ nf_register_queue_imq_handler(&imq_nfqh); ++ ++ ret = nf_register_hooks(imq_ops, ARRAY_SIZE(imq_ops)); ++ if (ret < 0) ++ nf_unregister_queue_imq_handler(); ++ ++ return ret; ++} ++ ++static int __init imq_init_one(int index) ++{ ++ struct net_device *dev; ++ int ret; ++ ++ dev = alloc_netdev_mq(0, "imq%d", imq_setup, numqueues); ++ if (!dev) ++ return -ENOMEM; ++ ++ ret = dev_alloc_name(dev, dev->name); ++ if (ret < 0) ++ goto fail; ++ ++ dev->rtnl_link_ops = &imq_link_ops; ++ ret = register_netdevice(dev); ++ if (ret < 0) ++ goto fail; ++ ++ return 0; ++fail: ++ free_netdev(dev); ++ return ret; ++} ++ ++static int __init imq_init_devs(void) ++{ ++ int err, i; ++ ++ if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) { ++ pr_err("IMQ: numdevs has to be betweed 1 and %u\n", ++ IMQ_MAX_DEVS); ++ return -EINVAL; ++ } ++ ++ if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) { ++ pr_err("IMQ: numqueues has to be betweed 1 and %u\n", ++ IMQ_MAX_QUEUES); ++ return -EINVAL; ++ } ++ ++ get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd)); ++ ++ rtnl_lock(); ++ err = __rtnl_link_register(&imq_link_ops); ++ ++ for (i = 0; i < numdevs && !err; i++) ++ err = imq_init_one(i); ++ ++ if (err) { ++ __rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++ } ++ rtnl_unlock(); ++ ++ return err; ++} ++ ++static int __init imq_init_module(void) ++{ ++ int err; ++ ++#if defined(CONFIG_IMQ_NUM_DEVS) ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16); ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2); ++ BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK); ++#endif ++ ++ err = imq_init_devs(); ++ if (err) { ++ pr_err("IMQ: 
Error trying imq_init_devs(net)\n"); ++ return err; ++ } ++ ++ err = imq_init_hooks(); ++ if (err) { ++ pr_err(KERN_ERR "IMQ: Error trying imq_init_hooks()\n"); ++ rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++ return err; ++ } ++ ++ pr_info("IMQ driver loaded successfully. (numdevs = %d, numqueues = %d)\n", ++ numdevs, numqueues); ++ ++#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ pr_info("\tHooking IMQ before NAT on PREROUTING.\n"); ++#else ++ pr_info("\tHooking IMQ after NAT on PREROUTING.\n"); ++#endif ++#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB) ++ pr_info("\tHooking IMQ before NAT on POSTROUTING.\n"); ++#else ++ pr_info("\tHooking IMQ after NAT on POSTROUTING.\n"); ++#endif ++ ++ return 0; ++} ++ ++static void __exit imq_unhook(void) ++{ ++ nf_unregister_hooks(imq_ops, ARRAY_SIZE(imq_ops)); ++ nf_unregister_queue_imq_handler(); ++} ++ ++static void __exit imq_cleanup_devs(void) ++{ ++ rtnl_link_unregister(&imq_link_ops); ++ memset(imq_devs_cache, 0, sizeof(imq_devs_cache)); ++} ++ ++static void __exit imq_exit_module(void) ++{ ++ imq_unhook(); ++ imq_cleanup_devs(); ++ pr_info("IMQ driver unloaded successfully.\n"); ++} ++ ++module_init(imq_init_module); ++module_exit(imq_exit_module); ++ ++module_param(numdevs, int, 0); ++module_param(numqueues, int, 0); ++MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will be created)"); ++MODULE_PARM_DESC(numqueues, "number of queues per IMQ device"); ++MODULE_AUTHOR("http://www.linuximq.net"); ++MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See http://www.linuximq.net/ for more information."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS_RTNL_LINK("imq"); ++ +diff -ruN linux-3.10.27/drivers/net/Kconfig linux-3.10.27-imq/drivers/net/Kconfig +--- linux-3.10.27/drivers/net/Kconfig 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/drivers/net/Kconfig 2014-01-18 10:19:59.341342885 +0100 +@@ -207,6 +207,125 @@ + depends on RIONET + default "128" + ++config IMQ ++ tristate "IMQ (intermediate queueing device) support" ++ depends on NETDEVICES && NETFILTER ++ ---help--- ++ The IMQ device(s) is used as placeholder for QoS queueing ++ disciplines. Every packet entering/leaving the IP stack can be ++ directed through the IMQ device where it's enqueued/dequeued to the ++ attached qdisc. This allows you to treat network devices as classes ++ and distribute bandwidth among them. Iptables is used to specify ++ through which IMQ device, if any, packets travel. ++ ++ More information at: http://www.linuximq.net/ ++ ++ To compile this driver as a module, choose M here: the module ++ will be called imq. If unsure, say N. ++ ++choice ++ prompt "IMQ behavior (PRE/POSTROUTING)" ++ depends on IMQ ++ default IMQ_BEHAVIOR_AB ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ IMQ can work in any of the following ways: ++ ++ PREROUTING | POSTROUTING ++ -----------------|------------------- ++ #1 After NAT | After NAT ++ #2 After NAT | Before NAT ++ #3 Before NAT | After NAT ++ #4 Before NAT | Before NAT ++ ++ The default behavior is to hook before NAT on PREROUTING ++ and after NAT on POSTROUTING (#3). ++ ++ This settings are specially usefull when trying to use IMQ ++ to shape NATed clients. ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. 
++ ++config IMQ_BEHAVIOR_AA ++ bool "IMQ AA" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: After NAT ++ POSTROUTING: After NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_AB ++ bool "IMQ AB" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: After NAT ++ POSTROUTING: Before NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_BA ++ bool "IMQ BA" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: Before NAT ++ POSTROUTING: After NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++config IMQ_BEHAVIOR_BB ++ bool "IMQ BB" ++ help ++ This setting defines how IMQ behaves in respect to its ++ hooking in PREROUTING and POSTROUTING. ++ ++ Choosing this option will make IMQ hook like this: ++ ++ PREROUTING: Before NAT ++ POSTROUTING: Before NAT ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. ++ ++endchoice ++ ++config IMQ_NUM_DEVS ++ int "Number of IMQ devices" ++ range 2 16 ++ depends on IMQ ++ default "16" ++ help ++ This setting defines how many IMQ devices will be created. ++ ++ The default value is 16. ++ ++ More information can be found at: www.linuximq.net ++ ++ If not sure leave the default settings alone. 
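The behaviour choice above only decides which netfilter priority the IMQ hook registers with relative to the NAT hooks: "before NAT" is any priority that sorts ahead of DNAT (NF_IP_PRI_NAT_DST) on PREROUTING, or ahead of SNAT (NF_IP_PRI_NAT_SRC) on POSTROUTING. The imq_ops[] table that nf_register_hooks() consumes in imq_init_hooks() is defined earlier in imq.c and is not part of this excerpt; the sketch below only illustrates the mapping, with illustrative priority values and a made-up imq_ops_sketch name.

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

/* Sketch only: the real table is imq_ops[] earlier in imq.c. */
static struct nf_hook_ops imq_ops_sketch[] = {
	{
		.hook     = imq_nf_hook,	/* the hook function shown above */
		.pf       = PF_INET,
		.hooknum  = NF_INET_PRE_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
		.priority = NF_IP_PRI_MANGLE + 1,	/* runs before DNAT */
#else
		.priority = NF_IP_PRI_NAT_DST + 1,	/* runs after DNAT */
#endif
	},
	{
		.hook     = imq_nf_hook,
		.pf       = PF_INET,
		.hooknum  = NF_INET_POST_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB)
		.priority = NF_IP_PRI_NAT_SRC - 1,	/* runs before SNAT */
#else
		.priority = NF_IP_PRI_LAST,		/* runs after SNAT */
#endif
	},
};

Since imq_nf_hook() only ever returns NF_IMQ_QUEUE or NF_ACCEPT, moving it before or after the NAT priorities is the whole difference between the four behaviours; the #if conditions mirror the ones imq_init_module() uses for its boot-time messages.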
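What actually flags a packet for IMQ is the iptables IMQ target: it stores the destination device index plus the enqueue bit in skb->imq_flags, imq_nf_hook() then returns NF_IMQ_QUEUE for it, and imq_nf_queue() recovers the device index with imq_flags & IMQ_F_IFMASK. The xt_IMQ target module itself is not shown in this excerpt; a minimal sketch of the marking step, assuming the xt_imq_info layout from the include/linux/netfilter/xt_IMQ.h hunk below and omitting the xt_target registration boilerplate:

#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_IMQ.h>
#include <linux/imq.h>

/* Record "send this skb through imqN" in the skb itself; the verdict,
 * device lookup and qdisc enqueue all happen later in imq.c.
 */
static unsigned int imq_target(struct sk_buff *skb,
			       const struct xt_action_param *par)
{
	const struct xt_imq_info *info = par->targinfo;

	skb->imq_flags = (info->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE;

	return XT_CONTINUE;
}

From userspace the flow is then the usual IMQ one: attach a qdisc to the imq device with tc and select traffic with something like "iptables -t mangle -A PREROUTING -i eth0 -j IMQ --todev 0", which requires the matching IMQ extension for iptables alongside this kernel patch.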
++ + config TUN + tristate "Universal TUN/TAP device driver support" + select CRC32 +diff -ruN linux-3.10.27/drivers/net/Makefile linux-3.10.27-imq/drivers/net/Makefile +--- linux-3.10.27/drivers/net/Makefile 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/drivers/net/Makefile 2014-01-18 10:19:59.341342885 +0100 +@@ -9,6 +9,7 @@ + obj-$(CONFIG_DUMMY) += dummy.o + obj-$(CONFIG_EQUALIZER) += eql.o + obj-$(CONFIG_IFB) += ifb.o ++obj-$(CONFIG_IMQ) += imq.o + obj-$(CONFIG_MACVLAN) += macvlan.o + obj-$(CONFIG_MACVTAP) += macvtap.o + obj-$(CONFIG_MII) += mii.o +diff -ruN linux-3.10.27/include/linux/imq.h linux-3.10.27-imq/include/linux/imq.h +--- linux-3.10.27/include/linux/imq.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/include/linux/imq.h 2014-01-18 10:19:59.342342913 +0100 +@@ -0,0 +1,13 @@ ++#ifndef _IMQ_H ++#define _IMQ_H ++ ++/* IFMASK (16 device indexes, 0 to 15) and flag(s) fit in 5 bits */ ++#define IMQ_F_BITS 5 ++ ++#define IMQ_F_IFMASK 0x0f ++#define IMQ_F_ENQUEUE 0x10 ++ ++#define IMQ_MAX_DEVS (IMQ_F_IFMASK + 1) ++ ++#endif /* _IMQ_H */ ++ +diff -ruN linux-3.10.27/include/linux/netfilter/xt_IMQ.h linux-3.10.27-imq/include/linux/netfilter/xt_IMQ.h +--- linux-3.10.27/include/linux/netfilter/xt_IMQ.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/include/linux/netfilter/xt_IMQ.h 2014-01-18 10:19:59.342342913 +0100 +@@ -0,0 +1,9 @@ ++#ifndef _XT_IMQ_H ++#define _XT_IMQ_H ++ ++struct xt_imq_info { ++ unsigned int todev; /* target imq device */ ++}; ++ ++#endif /* _XT_IMQ_H */ ++ +diff -ruN linux-3.10.27/include/linux/netfilter_ipv4/ipt_IMQ.h linux-3.10.27-imq/include/linux/netfilter_ipv4/ipt_IMQ.h +--- linux-3.10.27/include/linux/netfilter_ipv4/ipt_IMQ.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/include/linux/netfilter_ipv4/ipt_IMQ.h 2014-01-18 10:19:59.343342933 +0100 +@@ -0,0 +1,10 @@ ++#ifndef _IPT_IMQ_H ++#define _IPT_IMQ_H ++ ++/* Backwards compatibility for old userspace */ ++#include ++ ++#define ipt_imq_info xt_imq_info ++ ++#endif /* _IPT_IMQ_H */ ++ +diff -ruN linux-3.10.27/include/linux/netfilter_ipv6/ip6t_IMQ.h linux-3.10.27-imq/include/linux/netfilter_ipv6/ip6t_IMQ.h +--- linux-3.10.27/include/linux/netfilter_ipv6/ip6t_IMQ.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/include/linux/netfilter_ipv6/ip6t_IMQ.h 2014-01-18 10:19:59.343342933 +0100 +@@ -0,0 +1,10 @@ ++#ifndef _IP6T_IMQ_H ++#define _IP6T_IMQ_H ++ ++/* Backwards compatibility for old userspace */ ++#include ++ ++#define ip6t_imq_info xt_imq_info ++ ++#endif /* _IP6T_IMQ_H */ ++ +diff -ruN linux-3.10.27/include/linux/skbuff.h linux-3.10.27-imq/include/linux/skbuff.h +--- linux-3.10.27/include/linux/skbuff.h 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/include/linux/skbuff.h 2014-01-18 10:18:22.220271201 +0100 +@@ -33,6 +33,9 @@ + #include + #include + #include ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++#include ++#endif + + /* Don't change this without changing skb_csum_unnecessary! */ + #define CHECKSUM_NONE 0 +@@ -414,6 +417,9 @@ + * first. This is owned by whoever has the skb queued ATM. 
+ */ + char cb[48] __aligned(8); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ void *cb_next; ++#endif + + unsigned long _skb_refdst; + #ifdef CONFIG_XFRM +@@ -449,6 +455,9 @@ + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct nf_conntrack *nfct; + #endif ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ struct nf_queue_entry *nf_queue_entry; ++#endif + #ifdef CONFIG_BRIDGE_NETFILTER + struct nf_bridge_info *nf_bridge; + #endif +@@ -487,7 +496,9 @@ + __u8 encapsulation:1; + /* 7/9 bit hole (depending on ndisc_nodetype presence) */ + kmemcheck_bitfield_end(flags2); +- ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ __u8 imq_flags:IMQ_F_BITS; ++#endif + #ifdef CONFIG_NET_DMA + dma_cookie_t dma_cookie; + #endif +@@ -616,7 +627,10 @@ + { + return (struct rtable *)skb_dst(skb); + } +- ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++extern int skb_save_cb(struct sk_buff *skb); ++extern int skb_restore_cb(struct sk_buff *skb); ++#endif + extern void kfree_skb(struct sk_buff *skb); + extern void kfree_skb_list(struct sk_buff *segs); + extern void skb_tx_error(struct sk_buff *skb); +@@ -2735,6 +2749,10 @@ + nf_conntrack_get(src->nfct); + dst->nfctinfo = src->nfctinfo; + #endif ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ dst->imq_flags = src->imq_flags; ++ dst->nf_queue_entry = src->nf_queue_entry; ++#endif + #ifdef CONFIG_BRIDGE_NETFILTER + dst->nf_bridge = src->nf_bridge; + nf_bridge_get(src->nf_bridge); +diff -ruN linux-3.10.27/include/net/netfilter/nf_queue.h linux-3.10.27-imq/include/net/netfilter/nf_queue.h +--- linux-3.10.27/include/net/netfilter/nf_queue.h 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/include/net/netfilter/nf_queue.h 2014-01-18 10:19:59.345342949 +0100 +@@ -29,6 +29,12 @@ + void nf_register_queue_handler(const struct nf_queue_handler *qh); + void nf_unregister_queue_handler(void); + extern void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); ++extern void nf_queue_entry_release_refs(struct nf_queue_entry *entry); ++ ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++extern void nf_register_queue_imq_handler(const struct nf_queue_handler *qh); ++extern void nf_unregister_queue_imq_handler(void); ++#endif + + bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); + void nf_queue_entry_release_refs(struct nf_queue_entry *entry); +diff -ruN linux-3.10.27/include/uapi/linux/netfilter.h linux-3.10.27-imq/include/uapi/linux/netfilter.h +--- linux-3.10.27/include/uapi/linux/netfilter.h 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/include/uapi/linux/netfilter.h 2014-01-18 10:19:59.345342949 +0100 +@@ -13,7 +13,8 @@ + #define NF_QUEUE 3 + #define NF_REPEAT 4 + #define NF_STOP 5 +-#define NF_MAX_VERDICT NF_STOP ++#define NF_IMQ_QUEUE 6 ++#define NF_MAX_VERDICT NF_IMQ_QUEUE + + /* we overload the higher bits for encoding auxiliary data such as the queue + * number or errno values. 
Not nice, but better than additional function +diff -ruN linux-3.10.27/net/core/dev.c linux-3.10.27-imq/net/core/dev.c +--- linux-3.10.27/net/core/dev.c 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/core/dev.c 2014-01-18 10:19:59.347342963 +0100 +@@ -129,6 +129,9 @@ + #include + #include + #include ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++#include ++#endif + + #include "net-sysfs.h" + +@@ -2573,7 +2576,12 @@ + } + } + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ if (!list_empty(&ptype_all) && ++ !(skb->imq_flags & IMQ_F_ENQUEUE)) ++#else + if (!list_empty(&ptype_all)) ++#endif + dev_queue_xmit_nit(skb, dev); + + skb_len = skb->len; +diff -ruN linux-3.10.27/net/core/skbuff.c linux-3.10.27-imq/net/core/skbuff.c +--- linux-3.10.27/net/core/skbuff.c 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/core/skbuff.c 2014-01-18 10:19:59.348342972 +0100 +@@ -73,6 +73,9 @@ + + struct kmem_cache *skbuff_head_cache __read_mostly; + static struct kmem_cache *skbuff_fclone_cache __read_mostly; ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++static struct kmem_cache *skbuff_cb_store_cache __read_mostly; ++#endif + + static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +@@ -92,6 +95,82 @@ + return 1; + } + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++/* Control buffer save/restore for IMQ devices */ ++struct skb_cb_table { ++ char cb[48] __aligned(8); ++ void *cb_next; ++ atomic_t refcnt; ++}; ++ ++static DEFINE_SPINLOCK(skb_cb_store_lock); ++ ++int skb_save_cb(struct sk_buff *skb) ++{ ++ struct skb_cb_table *next; ++ ++ next = kmem_cache_alloc(skbuff_cb_store_cache, GFP_ATOMIC); ++ if (!next) ++ return -ENOMEM; ++ ++ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); ++ ++ memcpy(next->cb, skb->cb, sizeof(skb->cb)); ++ next->cb_next = skb->cb_next; ++ ++ atomic_set(&next->refcnt, 1); ++ ++ skb->cb_next = next; ++ return 0; ++} ++EXPORT_SYMBOL(skb_save_cb); ++ ++int skb_restore_cb(struct sk_buff *skb) ++{ ++ struct skb_cb_table *next; ++ ++ if (!skb->cb_next) ++ return 0; ++ ++ next = skb->cb_next; ++ ++ BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb)); ++ ++ memcpy(skb->cb, next->cb, sizeof(skb->cb)); ++ skb->cb_next = next->cb_next; ++ ++ spin_lock(&skb_cb_store_lock); ++ ++ if (atomic_dec_and_test(&next->refcnt)) ++ kmem_cache_free(skbuff_cb_store_cache, next); ++ ++ spin_unlock(&skb_cb_store_lock); ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_restore_cb); ++ ++static void skb_copy_stored_cb(struct sk_buff *new, const struct sk_buff *__old) ++{ ++ struct skb_cb_table *next; ++ struct sk_buff *old; ++ ++ if (!__old->cb_next) { ++ new->cb_next = NULL; ++ return; ++ } ++ ++ spin_lock(&skb_cb_store_lock); ++ ++ old = (struct sk_buff *)__old; ++ ++ next = old->cb_next; ++ atomic_inc(&next->refcnt); ++ new->cb_next = next; ++ ++ spin_unlock(&skb_cb_store_lock); ++} ++#endif + + /* Pipe buffer operations for a socket. */ + static const struct pipe_buf_operations sock_pipe_buf_ops = { +@@ -582,6 +661,28 @@ + WARN_ON(in_irq()); + skb->destructor(skb); + } ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ /* ++ * This should not happen. When it does, avoid memleak by restoring ++ * the chain of cb-backups. ++ */ ++ while (skb->cb_next != NULL) { ++ if (net_ratelimit()) ++ pr_warn("IMQ: kfree_skb: skb->cb_next: %08x\n", ++ (unsigned int)skb->cb_next); ++ ++ skb_restore_cb(skb); ++ } ++ /* ++ * This should not happen either, nf_queue_entry is nullified in ++ * imq_dev_xmit(). 
If we have non-NULL nf_queue_entry then we are ++ * leaking entry pointers, maybe memory. We don't know if this is ++ * pointer to already freed memory, or should this be freed. ++ * If this happens we need to add refcounting, etc for nf_queue_entry. ++ */ ++ if (skb->nf_queue_entry && net_ratelimit()) ++ pr_warn("%s\n", "IMQ: kfree_skb: skb->nf_queue_entry != NULL"); ++#endif + #if IS_ENABLED(CONFIG_NF_CONNTRACK) + nf_conntrack_put(skb->nfct); + #endif +@@ -713,6 +814,10 @@ + new->sp = secpath_get(old->sp); + #endif + memcpy(new->cb, old->cb, sizeof(old->cb)); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ new->cb_next = NULL; ++ /*skb_copy_stored_cb(new, old);*/ ++#endif + new->csum = old->csum; + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; +@@ -3093,6 +3198,13 @@ + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ skbuff_cb_store_cache = kmem_cache_create("skbuff_cb_store_cache", ++ sizeof(struct skb_cb_table), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++#endif + } + + /** +diff -ruN linux-3.10.27/net/core/skbuff.c.orig linux-3.10.27-imq/net/core/skbuff.c.orig +--- linux-3.10.27/net/core/skbuff.c.orig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/net/core/skbuff.c.orig 2014-01-16 00:29:14.000000000 +0100 +@@ -0,0 +1,3503 @@ ++/* ++ * Routines having to do with the 'struct sk_buff' memory handlers. ++ * ++ * Authors: Alan Cox ++ * Florian La Roche ++ * ++ * Fixes: ++ * Alan Cox : Fixed the worst of the load ++ * balancer bugs. ++ * Dave Platt : Interrupt stacking fix. ++ * Richard Kooijman : Timestamp fixes. ++ * Alan Cox : Changed buffer format. ++ * Alan Cox : destructor hook for AF_UNIX etc. ++ * Linus Torvalds : Better skb_clone. ++ * Alan Cox : Added skb_copy. ++ * Alan Cox : Added all the changed routines Linus ++ * only put in the headers ++ * Ray VanTassle : Fixed --skb->lock in free ++ * Alan Cox : skb_copy copy arp field ++ * Andi Kleen : slabified it. ++ * Robert Olsson : Removed skb_head_pool ++ * ++ * NOTE: ++ * The __skb_ routines should be called with interrupts ++ * disabled, or you better be *real* sure that the operation is atomic ++ * with respect to whatever list is being frobbed (e.g. via lock_sock() ++ * or via disabling bottom half handlers, etc). ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++/* ++ * The functions in this file will not compile correctly with gcc 2.4.x ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_NET_CLS_ACT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++struct kmem_cache *skbuff_head_cache __read_mostly; ++static struct kmem_cache *skbuff_fclone_cache __read_mostly; ++ ++static void sock_pipe_buf_release(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ put_page(buf->page); ++} ++ ++static void sock_pipe_buf_get(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ get_page(buf->page); ++} ++ ++static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ return 1; ++} ++ ++ ++/* Pipe buffer operations for a socket. */ ++static const struct pipe_buf_operations sock_pipe_buf_ops = { ++ .can_merge = 0, ++ .map = generic_pipe_buf_map, ++ .unmap = generic_pipe_buf_unmap, ++ .confirm = generic_pipe_buf_confirm, ++ .release = sock_pipe_buf_release, ++ .steal = sock_pipe_buf_steal, ++ .get = sock_pipe_buf_get, ++}; ++ ++/** ++ * skb_panic - private function for out-of-line support ++ * @skb: buffer ++ * @sz: size ++ * @addr: address ++ * @msg: skb_over_panic or skb_under_panic ++ * ++ * Out-of-line support for skb_put() and skb_push(). ++ * Called via the wrapper skb_over_panic() or skb_under_panic(). ++ * Keep out of line to prevent kernel bloat. ++ * __builtin_return_address is not used because it is not always reliable. ++ */ ++static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, ++ const char msg[]) ++{ ++ pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", ++ msg, addr, skb->len, sz, skb->head, skb->data, ++ (unsigned long)skb->tail, (unsigned long)skb->end, ++ skb->dev ? skb->dev->name : ""); ++ BUG(); ++} ++ ++static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) ++{ ++ skb_panic(skb, sz, addr, __func__); ++} ++ ++/* ++ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells ++ * the caller if emergency pfmemalloc reserves are being used. If it is and ++ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves ++ * may be used. Otherwise, the packet data may be discarded until enough ++ * memory is free ++ */ ++#define kmalloc_reserve(size, gfp, node, pfmemalloc) \ ++ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) ++ ++static void *__kmalloc_reserve(size_t size, gfp_t flags, int node, ++ unsigned long ip, bool *pfmemalloc) ++{ ++ void *obj; ++ bool ret_pfmemalloc = false; ++ ++ /* ++ * Try a regular allocation, when that fails and we're not entitled ++ * to the reserves, fail. ++ */ ++ obj = kmalloc_node_track_caller(size, ++ flags | __GFP_NOMEMALLOC | __GFP_NOWARN, ++ node); ++ if (obj || !(gfp_pfmemalloc_allowed(flags))) ++ goto out; ++ ++ /* Try again but now we are using pfmemalloc reserves */ ++ ret_pfmemalloc = true; ++ obj = kmalloc_node_track_caller(size, flags, node); ++ ++out: ++ if (pfmemalloc) ++ *pfmemalloc = ret_pfmemalloc; ++ ++ return obj; ++} ++ ++/* Allocate a new skbuff. 
We do this ourselves so we can fill in a few ++ * 'private' fields and also do memory statistics to find all the ++ * [BEEP] leaks. ++ * ++ */ ++ ++struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) ++{ ++ struct sk_buff *skb; ++ ++ /* Get the HEAD */ ++ skb = kmem_cache_alloc_node(skbuff_head_cache, ++ gfp_mask & ~__GFP_DMA, node); ++ if (!skb) ++ goto out; ++ ++ /* ++ * Only clear those fields we need to clear, not those that we will ++ * actually initialise below. Hence, don't put any more fields after ++ * the tail pointer in struct sk_buff! ++ */ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ skb->head = NULL; ++ skb->truesize = sizeof(struct sk_buff); ++ atomic_set(&skb->users, 1); ++ ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ skb->mac_header = ~0U; ++#endif ++out: ++ return skb; ++} ++ ++/** ++ * __alloc_skb - allocate a network buffer ++ * @size: size to allocate ++ * @gfp_mask: allocation mask ++ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache ++ * instead of head cache and allocate a cloned (child) skb. ++ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for ++ * allocations in case the data is required for writeback ++ * @node: numa node to allocate memory on ++ * ++ * Allocate a new &sk_buff. The returned buffer has no headroom and a ++ * tail room of at least size bytes. The object has a reference count ++ * of one. The return is the buffer. On a failure the return is %NULL. ++ * ++ * Buffers may only be allocated from interrupts using a @gfp_mask of ++ * %GFP_ATOMIC. ++ */ ++struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, ++ int flags, int node) ++{ ++ struct kmem_cache *cache; ++ struct skb_shared_info *shinfo; ++ struct sk_buff *skb; ++ u8 *data; ++ bool pfmemalloc; ++ ++ cache = (flags & SKB_ALLOC_FCLONE) ++ ? skbuff_fclone_cache : skbuff_head_cache; ++ ++ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ /* Get the HEAD */ ++ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); ++ if (!skb) ++ goto out; ++ prefetchw(skb); ++ ++ /* We do our best to align skb_shared_info on a separate cache ++ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives ++ * aligned memory blocks, unless SLUB/SLAB debug is enabled. ++ * Both skb->head and skb_shared_info are cache line aligned. ++ */ ++ size = SKB_DATA_ALIGN(size); ++ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ++ if (!data) ++ goto nodata; ++ /* kmalloc(size) might give us more room than requested. ++ * Put skb_shared_info exactly at the end of allocated zone, ++ * to allow max possible filling before reallocation. ++ */ ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ prefetchw(data + size); ++ ++ /* ++ * Only clear those fields we need to clear, not those that we will ++ * actually initialise below. Hence, don't put any more fields after ++ * the tail pointer in struct sk_buff! 
++ */ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ /* Account for allocated memory : skb + skb->head */ ++ skb->truesize = SKB_TRUESIZE(size); ++ skb->pfmemalloc = pfmemalloc; ++ atomic_set(&skb->users, 1); ++ skb->head = data; ++ skb->data = data; ++ skb_reset_tail_pointer(skb); ++ skb->end = skb->tail + size; ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ skb->mac_header = ~0U; ++ skb->transport_header = ~0U; ++#endif ++ ++ /* make sure we initialize shinfo sequentially */ ++ shinfo = skb_shinfo(skb); ++ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); ++ atomic_set(&shinfo->dataref, 1); ++ kmemcheck_annotate_variable(shinfo->destructor_arg); ++ ++ if (flags & SKB_ALLOC_FCLONE) { ++ struct sk_buff *child = skb + 1; ++ atomic_t *fclone_ref = (atomic_t *) (child + 1); ++ ++ kmemcheck_annotate_bitfield(child, flags1); ++ kmemcheck_annotate_bitfield(child, flags2); ++ skb->fclone = SKB_FCLONE_ORIG; ++ atomic_set(fclone_ref, 1); ++ ++ child->fclone = SKB_FCLONE_UNAVAILABLE; ++ child->pfmemalloc = pfmemalloc; ++ } ++out: ++ return skb; ++nodata: ++ kmem_cache_free(cache, skb); ++ skb = NULL; ++ goto out; ++} ++EXPORT_SYMBOL(__alloc_skb); ++ ++/** ++ * build_skb - build a network buffer ++ * @data: data buffer provided by caller ++ * @frag_size: size of fragment, or 0 if head was kmalloced ++ * ++ * Allocate a new &sk_buff. Caller provides space holding head and ++ * skb_shared_info. @data must have been allocated by kmalloc() ++ * The return is the new skb buffer. ++ * On a failure the return is %NULL, and @data is not freed. ++ * Notes : ++ * Before IO, driver allocates only data buffer where NIC put incoming frame ++ * Driver should add room at head (NET_SKB_PAD) and ++ * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) ++ * After IO, driver calls build_skb(), to allocate sk_buff and populate it ++ * before giving packet to stack. ++ * RX rings only contains data buffers, not full skbs. ++ */ ++struct sk_buff *build_skb(void *data, unsigned int frag_size) ++{ ++ struct skb_shared_info *shinfo; ++ struct sk_buff *skb; ++ unsigned int size = frag_size ? : ksize(data); ++ ++ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); ++ if (!skb) ++ return NULL; ++ ++ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ memset(skb, 0, offsetof(struct sk_buff, tail)); ++ skb->truesize = SKB_TRUESIZE(size); ++ skb->head_frag = frag_size != 0; ++ atomic_set(&skb->users, 1); ++ skb->head = data; ++ skb->data = data; ++ skb_reset_tail_pointer(skb); ++ skb->end = skb->tail + size; ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ skb->mac_header = ~0U; ++ skb->transport_header = ~0U; ++#endif ++ ++ /* make sure we initialize shinfo sequentially */ ++ shinfo = skb_shinfo(skb); ++ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); ++ atomic_set(&shinfo->dataref, 1); ++ kmemcheck_annotate_variable(shinfo->destructor_arg); ++ ++ return skb; ++} ++EXPORT_SYMBOL(build_skb); ++ ++struct netdev_alloc_cache { ++ struct page_frag frag; ++ /* we maintain a pagecount bias, so that we dont dirty cache line ++ * containing page->_count every time we allocate a fragment. 
++ */ ++ unsigned int pagecnt_bias; ++}; ++static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); ++ ++static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) ++{ ++ struct netdev_alloc_cache *nc; ++ void *data = NULL; ++ int order; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ nc = &__get_cpu_var(netdev_alloc_cache); ++ if (unlikely(!nc->frag.page)) { ++refill: ++ for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { ++ gfp_t gfp = gfp_mask; ++ ++ if (order) ++ gfp |= __GFP_COMP | __GFP_NOWARN; ++ nc->frag.page = alloc_pages(gfp, order); ++ if (likely(nc->frag.page)) ++ break; ++ if (--order < 0) ++ goto end; ++ } ++ nc->frag.size = PAGE_SIZE << order; ++recycle: ++ atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS); ++ nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; ++ nc->frag.offset = 0; ++ } ++ ++ if (nc->frag.offset + fragsz > nc->frag.size) { ++ /* avoid unnecessary locked operations if possible */ ++ if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) || ++ atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count)) ++ goto recycle; ++ goto refill; ++ } ++ ++ data = page_address(nc->frag.page) + nc->frag.offset; ++ nc->frag.offset += fragsz; ++ nc->pagecnt_bias--; ++end: ++ local_irq_restore(flags); ++ return data; ++} ++ ++/** ++ * netdev_alloc_frag - allocate a page fragment ++ * @fragsz: fragment size ++ * ++ * Allocates a frag from a page for receive buffer. ++ * Uses GFP_ATOMIC allocations. ++ */ ++void *netdev_alloc_frag(unsigned int fragsz) ++{ ++ return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); ++} ++EXPORT_SYMBOL(netdev_alloc_frag); ++ ++/** ++ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device ++ * @dev: network device to receive on ++ * @length: length to allocate ++ * @gfp_mask: get_free_pages mask, passed to alloc_skb ++ * ++ * Allocate a new &sk_buff and assign it a usage count of one. The ++ * buffer has unspecified headroom built in. Users should allocate ++ * the headroom they think they need without accounting for the ++ * built in space. The built in space is used for optimisations. ++ * ++ * %NULL is returned if there is no free memory. 
++ */ ++struct sk_buff *__netdev_alloc_skb(struct net_device *dev, ++ unsigned int length, gfp_t gfp_mask) ++{ ++ struct sk_buff *skb = NULL; ++ unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); ++ ++ if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { ++ void *data; ++ ++ if (sk_memalloc_socks()) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ data = __netdev_alloc_frag(fragsz, gfp_mask); ++ ++ if (likely(data)) { ++ skb = build_skb(data, fragsz); ++ if (unlikely(!skb)) ++ put_page(virt_to_head_page(data)); ++ } ++ } else { ++ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, ++ SKB_ALLOC_RX, NUMA_NO_NODE); ++ } ++ if (likely(skb)) { ++ skb_reserve(skb, NET_SKB_PAD); ++ skb->dev = dev; ++ } ++ return skb; ++} ++EXPORT_SYMBOL(__netdev_alloc_skb); ++ ++void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, ++ int size, unsigned int truesize) ++{ ++ skb_fill_page_desc(skb, i, page, off, size); ++ skb->len += size; ++ skb->data_len += size; ++ skb->truesize += truesize; ++} ++EXPORT_SYMBOL(skb_add_rx_frag); ++ ++static void skb_drop_list(struct sk_buff **listp) ++{ ++ kfree_skb_list(*listp); ++ *listp = NULL; ++} ++ ++static inline void skb_drop_fraglist(struct sk_buff *skb) ++{ ++ skb_drop_list(&skb_shinfo(skb)->frag_list); ++} ++ ++static void skb_clone_fraglist(struct sk_buff *skb) ++{ ++ struct sk_buff *list; ++ ++ skb_walk_frags(skb, list) ++ skb_get(list); ++} ++ ++static void skb_free_head(struct sk_buff *skb) ++{ ++ if (skb->head_frag) ++ put_page(virt_to_head_page(skb->head)); ++ else ++ kfree(skb->head); ++} ++ ++static void skb_release_data(struct sk_buff *skb) ++{ ++ if (!skb->cloned || ++ !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, ++ &skb_shinfo(skb)->dataref)) { ++ if (skb_shinfo(skb)->nr_frags) { ++ int i; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_unref(skb, i); ++ } ++ ++ /* ++ * If skb buf is from userspace, we need to notify the caller ++ * the lower device DMA has done; ++ */ ++ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { ++ struct ubuf_info *uarg; ++ ++ uarg = skb_shinfo(skb)->destructor_arg; ++ if (uarg->callback) ++ uarg->callback(uarg, true); ++ } ++ ++ if (skb_has_frag_list(skb)) ++ skb_drop_fraglist(skb); ++ ++ skb_free_head(skb); ++ } ++} ++ ++/* ++ * Free an skbuff by memory without cleaning the state. ++ */ ++static void kfree_skbmem(struct sk_buff *skb) ++{ ++ struct sk_buff *other; ++ atomic_t *fclone_ref; ++ ++ switch (skb->fclone) { ++ case SKB_FCLONE_UNAVAILABLE: ++ kmem_cache_free(skbuff_head_cache, skb); ++ break; ++ ++ case SKB_FCLONE_ORIG: ++ fclone_ref = (atomic_t *) (skb + 2); ++ if (atomic_dec_and_test(fclone_ref)) ++ kmem_cache_free(skbuff_fclone_cache, skb); ++ break; ++ ++ case SKB_FCLONE_CLONE: ++ fclone_ref = (atomic_t *) (skb + 1); ++ other = skb - 1; ++ ++ /* The clone portion is available for ++ * fast-cloning again. ++ */ ++ skb->fclone = SKB_FCLONE_UNAVAILABLE; ++ ++ if (atomic_dec_and_test(fclone_ref)) ++ kmem_cache_free(skbuff_fclone_cache, other); ++ break; ++ } ++} ++ ++static void skb_release_head_state(struct sk_buff *skb) ++{ ++ skb_dst_drop(skb); ++#ifdef CONFIG_XFRM ++ secpath_put(skb->sp); ++#endif ++ if (skb->destructor) { ++ WARN_ON(in_irq()); ++ skb->destructor(skb); ++ } ++#if IS_ENABLED(CONFIG_NF_CONNTRACK) ++ nf_conntrack_put(skb->nfct); ++#endif ++#ifdef CONFIG_BRIDGE_NETFILTER ++ nf_bridge_put(skb->nf_bridge); ++#endif ++/* XXX: IS this still necessary? 
- JHS */ ++#ifdef CONFIG_NET_SCHED ++ skb->tc_index = 0; ++#ifdef CONFIG_NET_CLS_ACT ++ skb->tc_verd = 0; ++#endif ++#endif ++} ++ ++/* Free everything but the sk_buff shell. */ ++static void skb_release_all(struct sk_buff *skb) ++{ ++ skb_release_head_state(skb); ++ if (likely(skb->head)) ++ skb_release_data(skb); ++} ++ ++/** ++ * __kfree_skb - private function ++ * @skb: buffer ++ * ++ * Free an sk_buff. Release anything attached to the buffer. ++ * Clean the state. This is an internal helper function. Users should ++ * always call kfree_skb ++ */ ++ ++void __kfree_skb(struct sk_buff *skb) ++{ ++ skb_release_all(skb); ++ kfree_skbmem(skb); ++} ++EXPORT_SYMBOL(__kfree_skb); ++ ++/** ++ * kfree_skb - free an sk_buff ++ * @skb: buffer to free ++ * ++ * Drop a reference to the buffer and free it if the usage count has ++ * hit zero. ++ */ ++void kfree_skb(struct sk_buff *skb) ++{ ++ if (unlikely(!skb)) ++ return; ++ if (likely(atomic_read(&skb->users) == 1)) ++ smp_rmb(); ++ else if (likely(!atomic_dec_and_test(&skb->users))) ++ return; ++ trace_kfree_skb(skb, __builtin_return_address(0)); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(kfree_skb); ++ ++void kfree_skb_list(struct sk_buff *segs) ++{ ++ while (segs) { ++ struct sk_buff *next = segs->next; ++ ++ kfree_skb(segs); ++ segs = next; ++ } ++} ++EXPORT_SYMBOL(kfree_skb_list); ++ ++/** ++ * skb_tx_error - report an sk_buff xmit error ++ * @skb: buffer that triggered an error ++ * ++ * Report xmit error if a device callback is tracking this skb. ++ * skb must be freed afterwards. ++ */ ++void skb_tx_error(struct sk_buff *skb) ++{ ++ if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { ++ struct ubuf_info *uarg; ++ ++ uarg = skb_shinfo(skb)->destructor_arg; ++ if (uarg->callback) ++ uarg->callback(uarg, false); ++ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; ++ } ++} ++EXPORT_SYMBOL(skb_tx_error); ++ ++/** ++ * consume_skb - free an skbuff ++ * @skb: buffer to free ++ * ++ * Drop a ref to the buffer and free it if the usage count has hit zero ++ * Functions identically to kfree_skb, but kfree_skb assumes that the frame ++ * is being dropped after a failure and notes that ++ */ ++void consume_skb(struct sk_buff *skb) ++{ ++ if (unlikely(!skb)) ++ return; ++ if (likely(atomic_read(&skb->users) == 1)) ++ smp_rmb(); ++ else if (likely(!atomic_dec_and_test(&skb->users))) ++ return; ++ trace_consume_skb(skb); ++ __kfree_skb(skb); ++} ++EXPORT_SYMBOL(consume_skb); ++ ++static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++ new->tstamp = old->tstamp; ++ new->dev = old->dev; ++ new->transport_header = old->transport_header; ++ new->network_header = old->network_header; ++ new->mac_header = old->mac_header; ++ new->inner_transport_header = old->inner_transport_header; ++ new->inner_network_header = old->inner_network_header; ++ new->inner_mac_header = old->inner_mac_header; ++ skb_dst_copy(new, old); ++ new->rxhash = old->rxhash; ++ new->ooo_okay = old->ooo_okay; ++ new->l4_rxhash = old->l4_rxhash; ++ new->no_fcs = old->no_fcs; ++ new->encapsulation = old->encapsulation; ++#ifdef CONFIG_XFRM ++ new->sp = secpath_get(old->sp); ++#endif ++ memcpy(new->cb, old->cb, sizeof(old->cb)); ++ new->csum = old->csum; ++ new->local_df = old->local_df; ++ new->pkt_type = old->pkt_type; ++ new->ip_summed = old->ip_summed; ++ skb_copy_queue_mapping(new, old); ++ new->priority = old->priority; ++#if IS_ENABLED(CONFIG_IP_VS) ++ new->ipvs_property = old->ipvs_property; ++#endif ++ new->pfmemalloc = old->pfmemalloc; ++ new->protocol = 
old->protocol; ++ new->mark = old->mark; ++ new->skb_iif = old->skb_iif; ++ __nf_copy(new, old); ++#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) ++ new->nf_trace = old->nf_trace; ++#endif ++#ifdef CONFIG_NET_SCHED ++ new->tc_index = old->tc_index; ++#ifdef CONFIG_NET_CLS_ACT ++ new->tc_verd = old->tc_verd; ++#endif ++#endif ++ new->vlan_proto = old->vlan_proto; ++ new->vlan_tci = old->vlan_tci; ++ ++ skb_copy_secmark(new, old); ++} ++ ++/* ++ * You should not add any new code to this function. Add it to ++ * __copy_skb_header above instead. ++ */ ++static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) ++{ ++#define C(x) n->x = skb->x ++ ++ n->next = n->prev = NULL; ++ n->sk = NULL; ++ __copy_skb_header(n, skb); ++ ++ C(len); ++ C(data_len); ++ C(mac_len); ++ n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; ++ n->cloned = 1; ++ n->nohdr = 0; ++ n->destructor = NULL; ++ C(tail); ++ C(end); ++ C(head); ++ C(head_frag); ++ C(data); ++ C(truesize); ++ atomic_set(&n->users, 1); ++ ++ atomic_inc(&(skb_shinfo(skb)->dataref)); ++ skb->cloned = 1; ++ ++ return n; ++#undef C ++} ++ ++/** ++ * skb_morph - morph one skb into another ++ * @dst: the skb to receive the contents ++ * @src: the skb to supply the contents ++ * ++ * This is identical to skb_clone except that the target skb is ++ * supplied by the user. ++ * ++ * The target skb is returned upon exit. ++ */ ++struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) ++{ ++ skb_release_all(dst); ++ return __skb_clone(dst, src); ++} ++EXPORT_SYMBOL_GPL(skb_morph); ++ ++/** ++ * skb_copy_ubufs - copy userspace skb frags buffers to kernel ++ * @skb: the skb to modify ++ * @gfp_mask: allocation priority ++ * ++ * This must be called on SKBTX_DEV_ZEROCOPY skb. ++ * It will copy all frags into kernel and drop the reference ++ * to userspace pages. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. ++ * ++ * Returns 0 on success or a negative error code on failure ++ * to allocate kernel memory to copy to. ++ */ ++int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int i; ++ int num_frags = skb_shinfo(skb)->nr_frags; ++ struct page *page, *head = NULL; ++ struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; ++ ++ for (i = 0; i < num_frags; i++) { ++ u8 *vaddr; ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ ++ page = alloc_page(gfp_mask); ++ if (!page) { ++ while (head) { ++ struct page *next = (struct page *)head->private; ++ put_page(head); ++ head = next; ++ } ++ return -ENOMEM; ++ } ++ vaddr = kmap_atomic(skb_frag_page(f)); ++ memcpy(page_address(page), ++ vaddr + f->page_offset, skb_frag_size(f)); ++ kunmap_atomic(vaddr); ++ page->private = (unsigned long)head; ++ head = page; ++ } ++ ++ /* skb frags release userspace buffers */ ++ for (i = 0; i < num_frags; i++) ++ skb_frag_unref(skb, i); ++ ++ uarg->callback(uarg, false); ++ ++ /* skb frags point to kernel buffers */ ++ for (i = num_frags - 1; i >= 0; i--) { ++ __skb_fill_page_desc(skb, i, head, 0, ++ skb_shinfo(skb)->frags[i].size); ++ head = (struct page *)head->private; ++ } ++ ++ skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_copy_ubufs); ++ ++/** ++ * skb_clone - duplicate an sk_buff ++ * @skb: buffer to clone ++ * @gfp_mask: allocation priority ++ * ++ * Duplicate an &sk_buff. The new one is not owned by a socket. Both ++ * copies share the same packet data but not structure. The new ++ * buffer has a reference count of 1. 
If the allocation fails the ++ * function returns %NULL otherwise the new buffer is returned. ++ * ++ * If this function is called from an interrupt gfp_mask() must be ++ * %GFP_ATOMIC. ++ */ ++ ++struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ struct sk_buff *n; ++ ++ if (skb_orphan_frags(skb, gfp_mask)) ++ return NULL; ++ ++ n = skb + 1; ++ if (skb->fclone == SKB_FCLONE_ORIG && ++ n->fclone == SKB_FCLONE_UNAVAILABLE) { ++ atomic_t *fclone_ref = (atomic_t *) (n + 1); ++ n->fclone = SKB_FCLONE_CLONE; ++ atomic_inc(fclone_ref); ++ } else { ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ ++ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); ++ if (!n) ++ return NULL; ++ ++ kmemcheck_annotate_bitfield(n, flags1); ++ kmemcheck_annotate_bitfield(n, flags2); ++ n->fclone = SKB_FCLONE_UNAVAILABLE; ++ } ++ ++ return __skb_clone(n, skb); ++} ++EXPORT_SYMBOL(skb_clone); ++ ++static void skb_headers_offset_update(struct sk_buff *skb, int off) ++{ ++ /* {transport,network,mac}_header and tail are relative to skb->head */ ++ skb->transport_header += off; ++ skb->network_header += off; ++ if (skb_mac_header_was_set(skb)) ++ skb->mac_header += off; ++ skb->inner_transport_header += off; ++ skb->inner_network_header += off; ++ skb->inner_mac_header += off; ++} ++ ++static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) ++{ ++#ifndef NET_SKBUFF_DATA_USES_OFFSET ++ /* ++ * Shift between the two data areas in bytes ++ */ ++ unsigned long offset = new->data - old->data; ++#endif ++ ++ __copy_skb_header(new, old); ++ ++#ifndef NET_SKBUFF_DATA_USES_OFFSET ++ skb_headers_offset_update(new, offset); ++#endif ++ skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; ++ skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; ++ skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; ++} ++ ++static inline int skb_alloc_rx_flag(const struct sk_buff *skb) ++{ ++ if (skb_pfmemalloc(skb)) ++ return SKB_ALLOC_RX; ++ return 0; ++} ++ ++/** ++ * skb_copy - create private copy of an sk_buff ++ * @skb: buffer to copy ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data. This is used when the ++ * caller wishes to modify the data and needs a private copy of the ++ * data to alter. Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * As by-product this function converts non-linear &sk_buff to linear ++ * one, so that &sk_buff becomes completely private and caller is allowed ++ * to modify all the data of returned buffer. This means that this ++ * function is not recommended for use in circumstances when only ++ * header is going to be modified. Use pskb_copy() instead. ++ */ ++ ++struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) ++{ ++ int headerlen = skb_headroom(skb); ++ unsigned int size = skb_end_offset(skb) + skb->data_len; ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, ++ skb_alloc_rx_flag(skb), NUMA_NO_NODE); ++ ++ if (!n) ++ return NULL; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headerlen); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) ++ BUG(); ++ ++ copy_skb_header(n, skb); ++ return n; ++} ++EXPORT_SYMBOL(skb_copy); ++ ++/** ++ * __pskb_copy - create copy of an sk_buff with private head. 
++ * @skb: buffer to copy ++ * @headroom: headroom of new skb ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and part of its data, located ++ * in header. Fragmented data remain shared. This is used when ++ * the caller wishes to modify only header of &sk_buff and needs ++ * private copy of the header to alter. Returns %NULL on failure ++ * or the pointer to the buffer on success. ++ * The returned buffer has a reference count of 1. ++ */ ++ ++struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) ++{ ++ unsigned int size = skb_headlen(skb) + headroom; ++ struct sk_buff *n = __alloc_skb(size, gfp_mask, ++ skb_alloc_rx_flag(skb), NUMA_NO_NODE); ++ ++ if (!n) ++ goto out; ++ ++ /* Set the data pointer */ ++ skb_reserve(n, headroom); ++ /* Set the tail pointer and length */ ++ skb_put(n, skb_headlen(skb)); ++ /* Copy the bytes */ ++ skb_copy_from_linear_data(skb, n->data, n->len); ++ ++ n->truesize += skb->data_len; ++ n->data_len = skb->data_len; ++ n->len = skb->len; ++ ++ if (skb_shinfo(skb)->nr_frags) { ++ int i; ++ ++ if (skb_orphan_frags(skb, gfp_mask)) { ++ kfree_skb(n); ++ n = NULL; ++ goto out; ++ } ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; ++ skb_frag_ref(skb, i); ++ } ++ skb_shinfo(n)->nr_frags = i; ++ } ++ ++ if (skb_has_frag_list(skb)) { ++ skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; ++ skb_clone_fraglist(n); ++ } ++ ++ copy_skb_header(n, skb); ++out: ++ return n; ++} ++EXPORT_SYMBOL(__pskb_copy); ++ ++/** ++ * pskb_expand_head - reallocate header of &sk_buff ++ * @skb: buffer to reallocate ++ * @nhead: room to add at head ++ * @ntail: room to add at tail ++ * @gfp_mask: allocation priority ++ * ++ * Expands (or creates identical copy, if &nhead and &ntail are zero) ++ * header of skb. &sk_buff itself is not changed. &sk_buff MUST have ++ * reference count of 1. Returns zero in the case of success or error, ++ * if expansion failed. In the last case, &sk_buff is not changed. ++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, ++ gfp_t gfp_mask) ++{ ++ int i; ++ u8 *data; ++ int size = nhead + skb_end_offset(skb) + ntail; ++ long off; ++ ++ BUG_ON(nhead < 0); ++ ++ if (skb_shared(skb)) ++ BUG(); ++ ++ size = SKB_DATA_ALIGN(size); ++ ++ if (skb_pfmemalloc(skb)) ++ gfp_mask |= __GFP_MEMALLOC; ++ data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ++ gfp_mask, NUMA_NO_NODE, NULL); ++ if (!data) ++ goto nodata; ++ size = SKB_WITH_OVERHEAD(ksize(data)); ++ ++ /* Copy only real data... and, alas, header. This should be ++ * optimized for the cases when header is void. 
++ */ ++ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); ++ ++ memcpy((struct skb_shared_info *)(data + size), ++ skb_shinfo(skb), ++ offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); ++ ++ /* ++ * if shinfo is shared we must drop the old head gracefully, but if it ++ * is not we can just drop the old head and let the existing refcount ++ * be since all we did is relocate the values ++ */ ++ if (skb_cloned(skb)) { ++ /* copy this zero copy skb frags */ ++ if (skb_orphan_frags(skb, gfp_mask)) ++ goto nofrags; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_frag_ref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_clone_fraglist(skb); ++ ++ skb_release_data(skb); ++ } else { ++ skb_free_head(skb); ++ } ++ off = (data + nhead) - skb->head; ++ ++ skb->head = data; ++ skb->head_frag = 0; ++ skb->data += off; ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ skb->end = size; ++ off = nhead; ++#else ++ skb->end = skb->head + size; ++#endif ++ skb->tail += off; ++ skb_headers_offset_update(skb, off); ++ /* Only adjust this if it actually is csum_start rather than csum */ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ skb->csum_start += nhead; ++ skb->cloned = 0; ++ skb->hdr_len = 0; ++ skb->nohdr = 0; ++ atomic_set(&skb_shinfo(skb)->dataref, 1); ++ return 0; ++ ++nofrags: ++ kfree(data); ++nodata: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(pskb_expand_head); ++ ++/* Make private copy of skb with writable head and some headroom */ ++ ++struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) ++{ ++ struct sk_buff *skb2; ++ int delta = headroom - skb_headroom(skb); ++ ++ if (delta <= 0) ++ skb2 = pskb_copy(skb, GFP_ATOMIC); ++ else { ++ skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ++ GFP_ATOMIC)) { ++ kfree_skb(skb2); ++ skb2 = NULL; ++ } ++ } ++ return skb2; ++} ++EXPORT_SYMBOL(skb_realloc_headroom); ++ ++/** ++ * skb_copy_expand - copy and expand sk_buff ++ * @skb: buffer to copy ++ * @newheadroom: new free bytes at head ++ * @newtailroom: new free bytes at tail ++ * @gfp_mask: allocation priority ++ * ++ * Make a copy of both an &sk_buff and its data and while doing so ++ * allocate additional space. ++ * ++ * This is used when the caller wishes to modify the data and needs a ++ * private copy of the data to alter as well as more space for new fields. ++ * Returns %NULL on failure or the pointer to the buffer ++ * on success. The returned buffer has a reference count of 1. ++ * ++ * You must pass %GFP_ATOMIC as the allocation priority if this function ++ * is called from an interrupt. ++ */ ++struct sk_buff *skb_copy_expand(const struct sk_buff *skb, ++ int newheadroom, int newtailroom, ++ gfp_t gfp_mask) ++{ ++ /* ++ * Allocate the copy buffer ++ */ ++ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, ++ gfp_mask, skb_alloc_rx_flag(skb), ++ NUMA_NO_NODE); ++ int oldheadroom = skb_headroom(skb); ++ int head_copy_len, head_copy_off; ++ int off; ++ ++ if (!n) ++ return NULL; ++ ++ skb_reserve(n, newheadroom); ++ ++ /* Set the tail pointer and length */ ++ skb_put(n, skb->len); ++ ++ head_copy_len = oldheadroom; ++ head_copy_off = 0; ++ if (newheadroom <= head_copy_len) ++ head_copy_len = newheadroom; ++ else ++ head_copy_off = newheadroom - head_copy_len; ++ ++ /* Copy the linear header and data. 
 */
++	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
++			  skb->len + head_copy_len))
++		BUG();
++
++	copy_skb_header(n, skb);
++
++	off = newheadroom - oldheadroom;
++	if (n->ip_summed == CHECKSUM_PARTIAL)
++		n->csum_start += off;
++#ifdef NET_SKBUFF_DATA_USES_OFFSET
++	skb_headers_offset_update(n, off);
++#endif
++
++	return n;
++}
++EXPORT_SYMBOL(skb_copy_expand);
++
++/**
++ *	skb_pad - zero pad the tail of an skb
++ *	@skb: buffer to pad
++ *	@pad: space to pad
++ *
++ *	Ensure that a buffer is followed by a padding area that is zero
++ *	filled. Used by network drivers which may DMA or transfer data
++ *	beyond the buffer end onto the wire.
++ *
++ *	May return error in out of memory cases. The skb is freed on error.
++ */
++
++int skb_pad(struct sk_buff *skb, int pad)
++{
++	int err;
++	int ntail;
++
++	/* If the skbuff is non linear tailroom is always zero.. */
++	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
++		memset(skb->data+skb->len, 0, pad);
++		return 0;
++	}
++
++	ntail = skb->data_len + pad - (skb->end - skb->tail);
++	if (likely(skb_cloned(skb) || ntail > 0)) {
++		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
++		if (unlikely(err))
++			goto free_skb;
++	}
++
++	/* FIXME: The use of this function with non-linear skb's really needs
++	 * to be audited.
++	 */
++	err = skb_linearize(skb);
++	if (unlikely(err))
++		goto free_skb;
++
++	memset(skb->data + skb->len, 0, pad);
++	return 0;
++
++free_skb:
++	kfree_skb(skb);
++	return err;
++}
++EXPORT_SYMBOL(skb_pad);
++
++/**
++ *	skb_put - add data to a buffer
++ *	@skb: buffer to use
++ *	@len: amount of data to add
++ *
++ *	This function extends the used data area of the buffer. If this would
++ *	exceed the total buffer size the kernel will panic. A pointer to the
++ *	first byte of the extra data is returned.
++ */
++unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
++{
++	unsigned char *tmp = skb_tail_pointer(skb);
++	SKB_LINEAR_ASSERT(skb);
++	skb->tail += len;
++	skb->len += len;
++	if (unlikely(skb->tail > skb->end))
++		skb_over_panic(skb, len, __builtin_return_address(0));
++	return tmp;
++}
++EXPORT_SYMBOL(skb_put);
++
++/**
++ *	skb_push - add data to the start of a buffer
++ *	@skb: buffer to use
++ *	@len: amount of data to add
++ *
++ *	This function extends the used data area of the buffer at the buffer
++ *	start. If this would exceed the total buffer headroom the kernel will
++ *	panic. A pointer to the first byte of the extra data is returned.
++ */
++unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
++{
++	skb->data -= len;
++	skb->len += len;
++	if (unlikely(skb->data < skb->head))
++		skb_under_panic(skb, len, __builtin_return_address(0));
++	return skb->data;
++}
++EXPORT_SYMBOL(skb_push);
++
++/**
++ *	skb_pull - remove data from the start of a buffer
++ *	@skb: buffer to use
++ *	@len: amount of data to remove
++ *
++ *	This function removes data from the start of a buffer, returning
++ *	the memory to the headroom. A pointer to the next data in the buffer
++ *	is returned. Once the data has been pulled future pushes will overwrite
++ *	the old data.
++ */
++unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
++{
++	return skb_pull_inline(skb, len);
++}
++EXPORT_SYMBOL(skb_pull);
++
++/**
++ *	skb_trim - remove end from a buffer
++ *	@skb: buffer to alter
++ *	@len: new length
++ *
++ *	Cut the length of a buffer down by removing data from the tail. If
++ *	the buffer is already under the length specified it is not modified.
++ *	The skb must be linear.
++ */ ++void skb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb->len > len) ++ __skb_trim(skb, len); ++} ++EXPORT_SYMBOL(skb_trim); ++ ++/* Trims skb to length len. It can change skb pointers. ++ */ ++ ++int ___pskb_trim(struct sk_buff *skb, unsigned int len) ++{ ++ struct sk_buff **fragp; ++ struct sk_buff *frag; ++ int offset = skb_headlen(skb); ++ int nfrags = skb_shinfo(skb)->nr_frags; ++ int i; ++ int err; ++ ++ if (skb_cloned(skb) && ++ unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) ++ return err; ++ ++ i = 0; ++ if (offset >= len) ++ goto drop_pages; ++ ++ for (; i < nfrags; i++) { ++ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); ++ ++drop_pages: ++ skb_shinfo(skb)->nr_frags = i; ++ ++ for (; i < nfrags; i++) ++ skb_frag_unref(skb, i); ++ ++ if (skb_has_frag_list(skb)) ++ skb_drop_fraglist(skb); ++ goto done; ++ } ++ ++ for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); ++ fragp = &frag->next) { ++ int end = offset + frag->len; ++ ++ if (skb_shared(frag)) { ++ struct sk_buff *nfrag; ++ ++ nfrag = skb_clone(frag, GFP_ATOMIC); ++ if (unlikely(!nfrag)) ++ return -ENOMEM; ++ ++ nfrag->next = frag->next; ++ consume_skb(frag); ++ frag = nfrag; ++ *fragp = frag; ++ } ++ ++ if (end < len) { ++ offset = end; ++ continue; ++ } ++ ++ if (end > len && ++ unlikely((err = pskb_trim(frag, len - offset)))) ++ return err; ++ ++ if (frag->next) ++ skb_drop_list(&frag->next); ++ break; ++ } ++ ++done: ++ if (len > skb_headlen(skb)) { ++ skb->data_len -= skb->len - len; ++ skb->len = len; ++ } else { ++ skb->len = len; ++ skb->data_len = 0; ++ skb_set_tail_pointer(skb, len); ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(___pskb_trim); ++ ++/** ++ * __pskb_pull_tail - advance tail of skb header ++ * @skb: buffer to reallocate ++ * @delta: number of bytes to advance tail ++ * ++ * The function makes a sense only on a fragmented &sk_buff, ++ * it expands header moving its tail forward and copying necessary ++ * data from fragmented part. ++ * ++ * &sk_buff MUST have reference count of 1. ++ * ++ * Returns %NULL (and &sk_buff does not change) if pull failed ++ * or value of new tail of skb in the case of success. ++ * ++ * All the pointers pointing into skb header may change and must be ++ * reloaded after call to this function. ++ */ ++ ++/* Moves tail of skb head forward, copying data from fragmented part, ++ * when it is necessary. ++ * 1. It may fail due to malloc failure. ++ * 2. It may change skb pointers. ++ * ++ * It is pretty complicated. Luckily, it is called only in exceptional cases. ++ */ ++unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) ++{ ++ /* If skb has not enough free space at tail, get new one ++ * plus 128 bytes for future expansions. If we have enough ++ * room at tail, reallocate without expansion only if skb is cloned. ++ */ ++ int i, k, eat = (skb->tail + delta) - skb->end; ++ ++ if (eat > 0 || skb_cloned(skb)) { ++ if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, ++ GFP_ATOMIC)) ++ return NULL; ++ } ++ ++ if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) ++ BUG(); ++ ++ /* Optimization: no fragments, no reasons to preestimate ++ * size of pulled pages. Superb. ++ */ ++ if (!skb_has_frag_list(skb)) ++ goto pull_pages; ++ ++ /* Estimate size of pulled pages. 
*/ ++ eat = delta; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size >= eat) ++ goto pull_pages; ++ eat -= size; ++ } ++ ++ /* If we need update frag list, we are in troubles. ++ * Certainly, it possible to add an offset to skb data, ++ * but taking into account that pulling is expected to ++ * be very rare operation, it is worth to fight against ++ * further bloating skb head and crucify ourselves here instead. ++ * Pure masohism, indeed. 8)8) ++ */ ++ if (eat) { ++ struct sk_buff *list = skb_shinfo(skb)->frag_list; ++ struct sk_buff *clone = NULL; ++ struct sk_buff *insp = NULL; ++ ++ do { ++ BUG_ON(!list); ++ ++ if (list->len <= eat) { ++ /* Eaten as whole. */ ++ eat -= list->len; ++ list = list->next; ++ insp = list; ++ } else { ++ /* Eaten partially. */ ++ ++ if (skb_shared(list)) { ++ /* Sucks! We need to fork list. :-( */ ++ clone = skb_clone(list, GFP_ATOMIC); ++ if (!clone) ++ return NULL; ++ insp = list->next; ++ list = clone; ++ } else { ++ /* This may be pulled without ++ * problems. */ ++ insp = list; ++ } ++ if (!pskb_pull(list, eat)) { ++ kfree_skb(clone); ++ return NULL; ++ } ++ break; ++ } ++ } while (eat); ++ ++ /* Free pulled out fragments. */ ++ while ((list = skb_shinfo(skb)->frag_list) != insp) { ++ skb_shinfo(skb)->frag_list = list->next; ++ kfree_skb(list); ++ } ++ /* And insert new clone at head. */ ++ if (clone) { ++ clone->next = list; ++ skb_shinfo(skb)->frag_list = clone; ++ } ++ } ++ /* Success! Now we may commit changes to skb data. */ ++ ++pull_pages: ++ eat = delta; ++ k = 0; ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (size <= eat) { ++ skb_frag_unref(skb, i); ++ eat -= size; ++ } else { ++ skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; ++ if (eat) { ++ skb_shinfo(skb)->frags[k].page_offset += eat; ++ skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); ++ eat = 0; ++ } ++ k++; ++ } ++ } ++ skb_shinfo(skb)->nr_frags = k; ++ ++ skb->tail += delta; ++ skb->data_len -= delta; ++ ++ return skb_tail_pointer(skb); ++} ++EXPORT_SYMBOL(__pskb_pull_tail); ++ ++/** ++ * skb_copy_bits - copy bits from skb to kernel buffer ++ * @skb: source skb ++ * @offset: offset in source ++ * @to: destination buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source skb to the ++ * destination buffer. ++ * ++ * CAUTION ! : ++ * If its prototype is ever changed, ++ * check arch/{*}/net/{*}.S files, ++ * since it is called from BPF assembly code. ++ */ ++int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ /* Copy header. 
*/ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_from_linear_data_offset(skb, offset, to, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *f = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(f); ++ if ((copy = end - offset) > 0) { ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ vaddr = kmap_atomic(skb_frag_page(f)); ++ memcpy(to, ++ vaddr + f->page_offset + offset - start, ++ copy); ++ kunmap_atomic(vaddr); ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_copy_bits(frag_iter, offset - start, to, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ to += copy; ++ } ++ start = end; ++ } ++ ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_copy_bits); ++ ++/* ++ * Callback from splice_to_pipe(), if we need to release some pages ++ * at the end of the spd in case we error'ed out in filling the pipe. ++ */ ++static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) ++{ ++ put_page(spd->pages[i]); ++} ++ ++static struct page *linear_to_page(struct page *page, unsigned int *len, ++ unsigned int *offset, ++ struct sock *sk) ++{ ++ struct page_frag *pfrag = sk_page_frag(sk); ++ ++ if (!sk_page_frag_refill(sk, pfrag)) ++ return NULL; ++ ++ *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); ++ ++ memcpy(page_address(pfrag->page) + pfrag->offset, ++ page_address(page) + *offset, *len); ++ *offset = pfrag->offset; ++ pfrag->offset += *len; ++ ++ return pfrag->page; ++} ++ ++static bool spd_can_coalesce(const struct splice_pipe_desc *spd, ++ struct page *page, ++ unsigned int offset) ++{ ++ return spd->nr_pages && ++ spd->pages[spd->nr_pages - 1] == page && ++ (spd->partial[spd->nr_pages - 1].offset + ++ spd->partial[spd->nr_pages - 1].len == offset); ++} ++ ++/* ++ * Fill page/offset/length into spd, if it can hold more pages. 
++ */ ++static bool spd_fill_page(struct splice_pipe_desc *spd, ++ struct pipe_inode_info *pipe, struct page *page, ++ unsigned int *len, unsigned int offset, ++ bool linear, ++ struct sock *sk) ++{ ++ if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) ++ return true; ++ ++ if (linear) { ++ page = linear_to_page(page, len, &offset, sk); ++ if (!page) ++ return true; ++ } ++ if (spd_can_coalesce(spd, page, offset)) { ++ spd->partial[spd->nr_pages - 1].len += *len; ++ return false; ++ } ++ get_page(page); ++ spd->pages[spd->nr_pages] = page; ++ spd->partial[spd->nr_pages].len = *len; ++ spd->partial[spd->nr_pages].offset = offset; ++ spd->nr_pages++; ++ ++ return false; ++} ++ ++static bool __splice_segment(struct page *page, unsigned int poff, ++ unsigned int plen, unsigned int *off, ++ unsigned int *len, ++ struct splice_pipe_desc *spd, bool linear, ++ struct sock *sk, ++ struct pipe_inode_info *pipe) ++{ ++ if (!*len) ++ return true; ++ ++ /* skip this segment if already processed */ ++ if (*off >= plen) { ++ *off -= plen; ++ return false; ++ } ++ ++ /* ignore any bits we already processed */ ++ poff += *off; ++ plen -= *off; ++ *off = 0; ++ ++ do { ++ unsigned int flen = min(*len, plen); ++ ++ if (spd_fill_page(spd, pipe, page, &flen, poff, ++ linear, sk)) ++ return true; ++ poff += flen; ++ plen -= flen; ++ *len -= flen; ++ } while (*len && plen); ++ ++ return false; ++} ++ ++/* ++ * Map linear and fragment data from the skb to spd. It reports true if the ++ * pipe is full or if we already spliced the requested length. ++ */ ++static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, ++ unsigned int *offset, unsigned int *len, ++ struct splice_pipe_desc *spd, struct sock *sk) ++{ ++ int seg; ++ ++ /* map the linear part : ++ * If skb->head_frag is set, this 'linear' part is backed by a ++ * fragment, and if the head is not shared with any clones then ++ * we can avoid a copy since we own the head portion of this page. ++ */ ++ if (__splice_segment(virt_to_page(skb->data), ++ (unsigned long) skb->data & (PAGE_SIZE - 1), ++ skb_headlen(skb), ++ offset, len, spd, ++ skb_head_is_locked(skb), ++ sk, pipe)) ++ return true; ++ ++ /* ++ * then map the fragments ++ */ ++ for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { ++ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; ++ ++ if (__splice_segment(skb_frag_page(f), ++ f->page_offset, skb_frag_size(f), ++ offset, len, spd, false, sk, pipe)) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Map data from the skb to a pipe. Should handle both the linear part, ++ * the fragments, and the frag list. It does NOT handle frag lists within ++ * the frag list, if such a thing exists. We'd probably need to recurse to ++ * handle that cleanly. ++ */ ++int skb_splice_bits(struct sk_buff *skb, unsigned int offset, ++ struct pipe_inode_info *pipe, unsigned int tlen, ++ unsigned int flags) ++{ ++ struct partial_page partial[MAX_SKB_FRAGS]; ++ struct page *pages[MAX_SKB_FRAGS]; ++ struct splice_pipe_desc spd = { ++ .pages = pages, ++ .partial = partial, ++ .nr_pages_max = MAX_SKB_FRAGS, ++ .flags = flags, ++ .ops = &sock_pipe_buf_ops, ++ .spd_release = sock_spd_release, ++ }; ++ struct sk_buff *frag_iter; ++ struct sock *sk = skb->sk; ++ int ret = 0; ++ ++ /* ++ * __skb_splice_bits() only fails if the output has no room left, ++ * so no point in going over the frag_list for the error case. 
++ */ ++ if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) ++ goto done; ++ else if (!tlen) ++ goto done; ++ ++ /* ++ * now see if we have a frag_list to map ++ */ ++ skb_walk_frags(skb, frag_iter) { ++ if (!tlen) ++ break; ++ if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) ++ break; ++ } ++ ++done: ++ if (spd.nr_pages) { ++ /* ++ * Drop the socket lock, otherwise we have reverse ++ * locking dependencies between sk_lock and i_mutex ++ * here as compared to sendfile(). We enter here ++ * with the socket lock held, and splice_to_pipe() will ++ * grab the pipe inode lock. For sendfile() emulation, ++ * we call into ->sendpage() with the i_mutex lock held ++ * and networking will grab the socket lock. ++ */ ++ release_sock(sk); ++ ret = splice_to_pipe(pipe, &spd); ++ lock_sock(sk); ++ } ++ ++ return ret; ++} ++ ++/** ++ * skb_store_bits - store bits from kernel buffer to skb ++ * @skb: destination buffer ++ * @offset: offset in destination ++ * @from: source buffer ++ * @len: number of bytes to copy ++ * ++ * Copy the specified number of bytes from the source buffer to the ++ * destination skb. This function handles all the messy bits of ++ * traversing fragment lists and such. ++ */ ++ ++int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) ++{ ++ int start = skb_headlen(skb); ++ struct sk_buff *frag_iter; ++ int i, copy; ++ ++ if (offset > (int)skb->len - len) ++ goto fault; ++ ++ if ((copy = start - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ skb_copy_to_linear_data_offset(skb, offset, from, copy); ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ ++ vaddr = kmap_atomic(skb_frag_page(frag)); ++ memcpy(vaddr + frag->page_offset + offset - start, ++ from, copy); ++ kunmap_atomic(vaddr); ++ ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ if (skb_store_bits(frag_iter, offset - start, ++ from, copy)) ++ goto fault; ++ if ((len -= copy) == 0) ++ return 0; ++ offset += copy; ++ from += copy; ++ } ++ start = end; ++ } ++ if (!len) ++ return 0; ++ ++fault: ++ return -EFAULT; ++} ++EXPORT_SYMBOL(skb_store_bits); ++ ++/* Checksum skb data. */ ++ ++__wsum skb_checksum(const struct sk_buff *skb, int offset, ++ int len, __wsum csum) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ ++ /* Checksum header. 
*/ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = csum_partial(skb->data + offset, copy, csum); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(frag); ++ if ((copy = end - offset) > 0) { ++ __wsum csum2; ++ u8 *vaddr; ++ ++ if (copy > len) ++ copy = len; ++ vaddr = kmap_atomic(skb_frag_page(frag)); ++ csum2 = csum_partial(vaddr + frag->page_offset + ++ offset - start, copy, 0); ++ kunmap_atomic(vaddr); ++ csum = csum_block_add(csum, csum2, pos); ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ __wsum csum2; ++ if (copy > len) ++ copy = len; ++ csum2 = skb_checksum(frag_iter, offset - start, ++ copy, 0); ++ csum = csum_block_add(csum, csum2, pos); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ ++ return csum; ++} ++EXPORT_SYMBOL(skb_checksum); ++ ++/* Both of above in one bottle. */ ++ ++__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, ++ u8 *to, int len, __wsum csum) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int pos = 0; ++ ++ /* Copy header. */ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ csum = csum_partial_copy_nocheck(skb->data + offset, to, ++ copy, csum); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos = copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ __wsum csum2; ++ u8 *vaddr; ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ if (copy > len) ++ copy = len; ++ vaddr = kmap_atomic(skb_frag_page(frag)); ++ csum2 = csum_partial_copy_nocheck(vaddr + ++ frag->page_offset + ++ offset - start, to, ++ copy, 0); ++ kunmap_atomic(vaddr); ++ csum = csum_block_add(csum, csum2, pos); ++ if (!(len -= copy)) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ __wsum csum2; ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ csum2 = skb_copy_and_csum_bits(frag_iter, ++ offset - start, ++ to, copy, 0); ++ csum = csum_block_add(csum, csum2, pos); ++ if ((len -= copy) == 0) ++ return csum; ++ offset += copy; ++ to += copy; ++ pos += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return csum; ++} ++EXPORT_SYMBOL(skb_copy_and_csum_bits); ++ ++void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) ++{ ++ __wsum csum; ++ long csstart; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) ++ csstart = skb_checksum_start_offset(skb); ++ else ++ csstart = skb_headlen(skb); ++ ++ BUG_ON(csstart > skb_headlen(skb)); ++ ++ skb_copy_from_linear_data(skb, to, csstart); ++ ++ csum = 0; ++ if (csstart != skb->len) ++ csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, ++ skb->len - csstart, 0); ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ long csstuff = csstart + skb->csum_offset; ++ ++ *((__sum16 
*)(to + csstuff)) = csum_fold(csum); ++ } ++} ++EXPORT_SYMBOL(skb_copy_and_csum_dev); ++ ++/** ++ * skb_dequeue - remove from the head of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the head of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The head item is ++ * returned or %NULL if the list is empty. ++ */ ++ ++struct sk_buff *skb_dequeue(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue); ++ ++/** ++ * skb_dequeue_tail - remove from the tail of the queue ++ * @list: list to dequeue from ++ * ++ * Remove the tail of the list. The list lock is taken so the function ++ * may be used safely with other locking list functions. The tail item is ++ * returned or %NULL if the list is empty. ++ */ ++struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ struct sk_buff *result; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ result = __skb_dequeue_tail(list); ++ spin_unlock_irqrestore(&list->lock, flags); ++ return result; ++} ++EXPORT_SYMBOL(skb_dequeue_tail); ++ ++/** ++ * skb_queue_purge - empty a list ++ * @list: list to empty ++ * ++ * Delete all buffers on an &sk_buff list. Each buffer is removed from ++ * the list and one reference dropped. This function takes the list ++ * lock and is atomic with respect to other list locking functions. ++ */ ++void skb_queue_purge(struct sk_buff_head *list) ++{ ++ struct sk_buff *skb; ++ while ((skb = skb_dequeue(list)) != NULL) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL(skb_queue_purge); ++ ++/** ++ * skb_queue_head - queue a buffer at the list head ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the start of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_head(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_head); ++ ++/** ++ * skb_queue_tail - queue a buffer at the list tail ++ * @list: list to use ++ * @newsk: buffer to queue ++ * ++ * Queue a buffer at the tail of the list. This function takes the ++ * list lock and can be used safely with other locking &sk_buff functions ++ * safely. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_tail(list, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_queue_tail); ++ ++/** ++ * skb_unlink - remove a buffer from a list ++ * @skb: buffer to remove ++ * @list: list to use ++ * ++ * Remove a packet from a list. The list locks are taken and this ++ * function is atomic with respect to other list locked calls ++ * ++ * You must know what list the SKB is on. 
++ */ ++void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_unlink(skb, list); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_unlink); ++ ++/** ++ * skb_append - append a buffer ++ * @old: buffer to insert after ++ * @newsk: buffer to insert ++ * @list: list to use ++ * ++ * Place a packet after a given packet in a list. The list locks are taken ++ * and this function is atomic with respect to other list locked calls. ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_queue_after(list, old, newsk); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_append); ++ ++/** ++ * skb_insert - insert a buffer ++ * @old: buffer to insert before ++ * @newsk: buffer to insert ++ * @list: list to use ++ * ++ * Place a packet before a given packet in a list. The list locks are ++ * taken and this function is atomic with respect to other list locked ++ * calls. ++ * ++ * A buffer cannot be placed on two lists at the same time. ++ */ ++void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&list->lock, flags); ++ __skb_insert(newsk, old->prev, old, list); ++ spin_unlock_irqrestore(&list->lock, flags); ++} ++EXPORT_SYMBOL(skb_insert); ++ ++static inline void skb_split_inside_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, const int pos) ++{ ++ int i; ++ ++ skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), ++ pos - len); ++ /* And move data appendix as is. */ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; ++ ++ skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->data_len = skb->data_len; ++ skb1->len += skb1->data_len; ++ skb->data_len = 0; ++ skb->len = len; ++ skb_set_tail_pointer(skb, len); ++} ++ ++static inline void skb_split_no_header(struct sk_buff *skb, ++ struct sk_buff* skb1, ++ const u32 len, int pos) ++{ ++ int i, k = 0; ++ const int nfrags = skb_shinfo(skb)->nr_frags; ++ ++ skb_shinfo(skb)->nr_frags = 0; ++ skb1->len = skb1->data_len = skb->len - len; ++ skb->len = len; ++ skb->data_len = len - pos; ++ ++ for (i = 0; i < nfrags; i++) { ++ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ ++ if (pos + size > len) { ++ skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; ++ ++ if (pos < len) { ++ /* Split frag. ++ * We have two variants in this case: ++ * 1. Move all the frag to the second ++ * part, if it is possible. F.e. ++ * this approach is mandatory for TUX, ++ * where splitting is expensive. ++ * 2. Split is accurately. We make this. ++ */ ++ skb_frag_ref(skb, i); ++ skb_shinfo(skb1)->frags[0].page_offset += len - pos; ++ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); ++ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); ++ skb_shinfo(skb)->nr_frags++; ++ } ++ k++; ++ } else ++ skb_shinfo(skb)->nr_frags++; ++ pos += size; ++ } ++ skb_shinfo(skb1)->nr_frags = k; ++} ++ ++/** ++ * skb_split - Split fragmented skb to two parts at length len. 
++ * @skb: the buffer to split ++ * @skb1: the buffer to receive the second part ++ * @len: new length for skb ++ */ ++void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) ++{ ++ int pos = skb_headlen(skb); ++ ++ skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; ++ if (len < pos) /* Split line is inside header. */ ++ skb_split_inside_header(skb, skb1, len, pos); ++ else /* Second chunk has no header, nothing to copy. */ ++ skb_split_no_header(skb, skb1, len, pos); ++} ++EXPORT_SYMBOL(skb_split); ++ ++/* Shifting from/to a cloned skb is a no-go. ++ * ++ * Caller cannot keep skb_shinfo related pointers past calling here! ++ */ ++static int skb_prepare_for_shift(struct sk_buff *skb) ++{ ++ return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); ++} ++ ++/** ++ * skb_shift - Shifts paged data partially from skb to another ++ * @tgt: buffer into which tail data gets added ++ * @skb: buffer from which the paged data comes from ++ * @shiftlen: shift up to this many bytes ++ * ++ * Attempts to shift up to shiftlen worth of bytes, which may be less than ++ * the length of the skb, from skb to tgt. Returns number bytes shifted. ++ * It's up to caller to free skb if everything was shifted. ++ * ++ * If @tgt runs out of frags, the whole operation is aborted. ++ * ++ * Skb cannot include anything else but paged data while tgt is allowed ++ * to have non-paged data as well. ++ * ++ * TODO: full sized shift could be optimized but that would need ++ * specialized skb free'er to handle frags without up-to-date nr_frags. ++ */ ++int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) ++{ ++ int from, to, merge, todo; ++ struct skb_frag_struct *fragfrom, *fragto; ++ ++ BUG_ON(shiftlen > skb->len); ++ BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ ++ ++ todo = shiftlen; ++ from = 0; ++ to = skb_shinfo(tgt)->nr_frags; ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ ++ /* Actual merge is delayed until the point when we know we can ++ * commit all, so that we don't have to undo partial changes ++ */ ++ if (!to || ++ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), ++ fragfrom->page_offset)) { ++ merge = -1; ++ } else { ++ merge = to - 1; ++ ++ todo -= skb_frag_size(fragfrom); ++ if (todo < 0) { ++ if (skb_prepare_for_shift(skb) || ++ skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ /* All previous frag pointers might be stale! 
*/ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, shiftlen); ++ skb_frag_size_sub(fragfrom, shiftlen); ++ fragfrom->page_offset += shiftlen; ++ ++ goto onlymerged; ++ } ++ ++ from++; ++ } ++ ++ /* Skip full, not-fitting skb to avoid expensive operations */ ++ if ((shiftlen == skb->len) && ++ (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) ++ return 0; ++ ++ if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) ++ return 0; ++ ++ while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { ++ if (to == MAX_SKB_FRAGS) ++ return 0; ++ ++ fragfrom = &skb_shinfo(skb)->frags[from]; ++ fragto = &skb_shinfo(tgt)->frags[to]; ++ ++ if (todo >= skb_frag_size(fragfrom)) { ++ *fragto = *fragfrom; ++ todo -= skb_frag_size(fragfrom); ++ from++; ++ to++; ++ ++ } else { ++ __skb_frag_ref(fragfrom); ++ fragto->page = fragfrom->page; ++ fragto->page_offset = fragfrom->page_offset; ++ skb_frag_size_set(fragto, todo); ++ ++ fragfrom->page_offset += todo; ++ skb_frag_size_sub(fragfrom, todo); ++ todo = 0; ++ ++ to++; ++ break; ++ } ++ } ++ ++ /* Ready to "commit" this state change to tgt */ ++ skb_shinfo(tgt)->nr_frags = to; ++ ++ if (merge >= 0) { ++ fragfrom = &skb_shinfo(skb)->frags[0]; ++ fragto = &skb_shinfo(tgt)->frags[merge]; ++ ++ skb_frag_size_add(fragto, skb_frag_size(fragfrom)); ++ __skb_frag_unref(fragfrom); ++ } ++ ++ /* Reposition in the original skb */ ++ to = 0; ++ while (from < skb_shinfo(skb)->nr_frags) ++ skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; ++ skb_shinfo(skb)->nr_frags = to; ++ ++ BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); ++ ++onlymerged: ++ /* Most likely the tgt won't ever need its checksum anymore, skb on ++ * the other hand might need it if it needs to be resent ++ */ ++ tgt->ip_summed = CHECKSUM_PARTIAL; ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ ++ /* Yak, is it really working this way? Some helper please? */ ++ skb->len -= shiftlen; ++ skb->data_len -= shiftlen; ++ skb->truesize -= shiftlen; ++ tgt->len += shiftlen; ++ tgt->data_len += shiftlen; ++ tgt->truesize += shiftlen; ++ ++ return shiftlen; ++} ++ ++/** ++ * skb_prepare_seq_read - Prepare a sequential read of skb data ++ * @skb: the buffer to read ++ * @from: lower offset of data to be read ++ * @to: upper offset of data to be read ++ * @st: state variable ++ * ++ * Initializes the specified state variable. Must be called before ++ * invoking skb_seq_read() for the first time. ++ */ ++void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, ++ unsigned int to, struct skb_seq_state *st) ++{ ++ st->lower_offset = from; ++ st->upper_offset = to; ++ st->root_skb = st->cur_skb = skb; ++ st->frag_idx = st->stepped_offset = 0; ++ st->frag_data = NULL; ++} ++EXPORT_SYMBOL(skb_prepare_seq_read); ++ ++/** ++ * skb_seq_read - Sequentially read skb data ++ * @consumed: number of bytes consumed by the caller so far ++ * @data: destination pointer for data to be returned ++ * @st: state variable ++ * ++ * Reads a block of skb data at &consumed relative to the ++ * lower offset specified to skb_prepare_seq_read(). Assigns ++ * the head of the data block to &data and returns the length ++ * of the block or 0 if the end of the skb data or the upper ++ * offset has been reached. ++ * ++ * The caller is not required to consume all of the data ++ * returned, i.e. &consumed is typically set to the number ++ * of bytes already consumed and the next call to ++ * skb_seq_read() will return the remaining part of the block. 
++ * ++ * Note 1: The size of each block of data returned can be arbitrary, ++ * this limitation is the cost for zerocopy seqeuental ++ * reads of potentially non linear data. ++ * ++ * Note 2: Fragment lists within fragments are not implemented ++ * at the moment, state->root_skb could be replaced with ++ * a stack for this purpose. ++ */ ++unsigned int skb_seq_read(unsigned int consumed, const u8 **data, ++ struct skb_seq_state *st) ++{ ++ unsigned int block_limit, abs_offset = consumed + st->lower_offset; ++ skb_frag_t *frag; ++ ++ if (unlikely(abs_offset >= st->upper_offset)) ++ return 0; ++ ++next_skb: ++ block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; ++ ++ if (abs_offset < block_limit && !st->frag_data) { ++ *data = st->cur_skb->data + (abs_offset - st->stepped_offset); ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_idx == 0 && !st->frag_data) ++ st->stepped_offset += skb_headlen(st->cur_skb); ++ ++ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { ++ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; ++ block_limit = skb_frag_size(frag) + st->stepped_offset; ++ ++ if (abs_offset < block_limit) { ++ if (!st->frag_data) ++ st->frag_data = kmap_atomic(skb_frag_page(frag)); ++ ++ *data = (u8 *) st->frag_data + frag->page_offset + ++ (abs_offset - st->stepped_offset); ++ ++ return block_limit - abs_offset; ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ st->frag_idx++; ++ st->stepped_offset += skb_frag_size(frag); ++ } ++ ++ if (st->frag_data) { ++ kunmap_atomic(st->frag_data); ++ st->frag_data = NULL; ++ } ++ ++ if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { ++ st->cur_skb = skb_shinfo(st->root_skb)->frag_list; ++ st->frag_idx = 0; ++ goto next_skb; ++ } else if (st->cur_skb->next) { ++ st->cur_skb = st->cur_skb->next; ++ st->frag_idx = 0; ++ goto next_skb; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(skb_seq_read); ++ ++/** ++ * skb_abort_seq_read - Abort a sequential read of skb data ++ * @st: state variable ++ * ++ * Must be called if skb_seq_read() was not called until it ++ * returned 0. ++ */ ++void skb_abort_seq_read(struct skb_seq_state *st) ++{ ++ if (st->frag_data) ++ kunmap_atomic(st->frag_data); ++} ++EXPORT_SYMBOL(skb_abort_seq_read); ++ ++#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) ++ ++static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, ++ struct ts_config *conf, ++ struct ts_state *state) ++{ ++ return skb_seq_read(offset, text, TS_SKB_CB(state)); ++} ++ ++static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) ++{ ++ skb_abort_seq_read(TS_SKB_CB(state)); ++} ++ ++/** ++ * skb_find_text - Find a text pattern in skb data ++ * @skb: the buffer to look in ++ * @from: search offset ++ * @to: search limit ++ * @config: textsearch configuration ++ * @state: uninitialized textsearch state variable ++ * ++ * Finds a pattern in the skb data according to the specified ++ * textsearch configuration. Use textsearch_next() to retrieve ++ * subsequent occurrences of the pattern. Returns the offset ++ * to the first occurrence or UINT_MAX if no match was found. 
++ */
++unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
++			   unsigned int to, struct ts_config *config,
++			   struct ts_state *state)
++{
++	unsigned int ret;
++
++	config->get_next_block = skb_ts_get_next_block;
++	config->finish = skb_ts_finish;
++
++	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
++
++	ret = textsearch_find(config, state);
++	return (ret <= to - from ? ret : UINT_MAX);
++}
++EXPORT_SYMBOL(skb_find_text);
++
++/**
++ *	skb_append_datato_frags - append the user data to a skb
++ *	@sk: sock structure
++ *	@skb: skb structure to be appened with user data.
++ *	@getfrag: call back function to be used for getting the user data
++ *	@from: pointer to user message iov
++ *	@length: length of the iov message
++ *
++ *	Description: This procedure append the user data in the fragment part
++ *	of the skb if any page alloc fails user this procedure returns -ENOMEM
++ */
++int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
++			int (*getfrag)(void *from, char *to, int offset,
++					int len, int odd, struct sk_buff *skb),
++			void *from, int length)
++{
++	int frg_cnt = skb_shinfo(skb)->nr_frags;
++	int copy;
++	int offset = 0;
++	int ret;
++	struct page_frag *pfrag = &current->task_frag;
++
++	do {
++		/* Return error if we don't have space for new frag */
++		if (frg_cnt >= MAX_SKB_FRAGS)
++			return -EMSGSIZE;
++
++		if (!sk_page_frag_refill(sk, pfrag))
++			return -ENOMEM;
++
++		/* copy the user data to page */
++		copy = min_t(int, length, pfrag->size - pfrag->offset);
++
++		ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
++			      offset, copy, 0, skb);
++		if (ret < 0)
++			return -EFAULT;
++
++		/* copy was successful so update the size parameters */
++		skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
++				   copy);
++		frg_cnt++;
++		pfrag->offset += copy;
++		get_page(pfrag->page);
++
++		skb->truesize += copy;
++		atomic_add(copy, &sk->sk_wmem_alloc);
++		skb->len += copy;
++		skb->data_len += copy;
++		offset += copy;
++		length -= copy;
++
++	} while (length > 0);
++
++	return 0;
++}
++EXPORT_SYMBOL(skb_append_datato_frags);
++
++/**
++ *	skb_pull_rcsum - pull skb and update receive checksum
++ *	@skb: buffer to update
++ *	@len: length of data pulled
++ *
++ *	This function performs an skb_pull on the packet and updates
++ *	the CHECKSUM_COMPLETE checksum. It should be used on
++ *	receive path processing instead of skb_pull unless you know
++ *	that the checksum difference is zero (e.g., a valid IP header)
++ *	or you are setting ip_summed to CHECKSUM_NONE.
++ */
++unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
++{
++	BUG_ON(len > skb->len);
++	skb->len -= len;
++	BUG_ON(skb->len < skb->data_len);
++	skb_postpull_rcsum(skb, skb->data, len);
++	return skb->data += len;
++}
++EXPORT_SYMBOL_GPL(skb_pull_rcsum);
++
++/**
++ *	skb_segment - Perform protocol segmentation on skb.
++ *	@skb: buffer to segment
++ *	@features: features for the output path (see dev->features)
++ *
++ *	This function performs segmentation on the given skb. It returns
++ *	a pointer to the first in a list of new skbs for the segments.
++ *	In case of error it returns ERR_PTR(err).
++ */ ++struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) ++{ ++ struct sk_buff *segs = NULL; ++ struct sk_buff *tail = NULL; ++ struct sk_buff *fskb = skb_shinfo(skb)->frag_list; ++ unsigned int mss = skb_shinfo(skb)->gso_size; ++ unsigned int doffset = skb->data - skb_mac_header(skb); ++ unsigned int offset = doffset; ++ unsigned int tnl_hlen = skb_tnl_header_len(skb); ++ unsigned int headroom; ++ unsigned int len; ++ __be16 proto; ++ bool csum; ++ int sg = !!(features & NETIF_F_SG); ++ int nfrags = skb_shinfo(skb)->nr_frags; ++ int err = -ENOMEM; ++ int i = 0; ++ int pos; ++ ++ proto = skb_network_protocol(skb); ++ if (unlikely(!proto)) ++ return ERR_PTR(-EINVAL); ++ ++ csum = !!can_checksum_protocol(features, proto); ++ __skb_push(skb, doffset); ++ headroom = skb_headroom(skb); ++ pos = skb_headlen(skb); ++ ++ do { ++ struct sk_buff *nskb; ++ skb_frag_t *frag; ++ int hsize; ++ int size; ++ ++ len = skb->len - offset; ++ if (len > mss) ++ len = mss; ++ ++ hsize = skb_headlen(skb) - offset; ++ if (hsize < 0) ++ hsize = 0; ++ if (hsize > len || !sg) ++ hsize = len; ++ ++ if (!hsize && i >= nfrags) { ++ BUG_ON(fskb->len != len); ++ ++ pos += len; ++ nskb = skb_clone(fskb, GFP_ATOMIC); ++ fskb = fskb->next; ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ hsize = skb_end_offset(nskb); ++ if (skb_cow_head(nskb, doffset + headroom)) { ++ kfree_skb(nskb); ++ goto err; ++ } ++ ++ nskb->truesize += skb_end_offset(nskb) - hsize; ++ skb_release_head_state(nskb); ++ __skb_push(nskb, doffset); ++ } else { ++ nskb = __alloc_skb(hsize + doffset + headroom, ++ GFP_ATOMIC, skb_alloc_rx_flag(skb), ++ NUMA_NO_NODE); ++ ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, headroom); ++ __skb_put(nskb, doffset); ++ } ++ ++ if (segs) ++ tail->next = nskb; ++ else ++ segs = nskb; ++ tail = nskb; ++ ++ __copy_skb_header(nskb, skb); ++ nskb->mac_len = skb->mac_len; ++ ++ /* nskb and skb might have different headroom */ ++ if (nskb->ip_summed == CHECKSUM_PARTIAL) ++ nskb->csum_start += skb_headroom(nskb) - headroom; ++ ++ skb_reset_mac_header(nskb); ++ skb_set_network_header(nskb, skb->mac_len); ++ nskb->transport_header = (nskb->network_header + ++ skb_network_header_len(skb)); ++ ++ skb_copy_from_linear_data_offset(skb, -tnl_hlen, ++ nskb->data - tnl_hlen, ++ doffset + tnl_hlen); ++ ++ if (fskb != skb_shinfo(skb)->frag_list) ++ goto perform_csum_check; ++ ++ if (!sg) { ++ nskb->ip_summed = CHECKSUM_NONE; ++ nskb->csum = skb_copy_and_csum_bits(skb, offset, ++ skb_put(nskb, len), ++ len, 0); ++ continue; ++ } ++ ++ frag = skb_shinfo(nskb)->frags; ++ ++ skb_copy_from_linear_data_offset(skb, offset, ++ skb_put(nskb, hsize), hsize); ++ ++ skb_shinfo(nskb)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; ++ ++ while (pos < offset + len && i < nfrags) { ++ *frag = skb_shinfo(skb)->frags[i]; ++ __skb_frag_ref(frag); ++ size = skb_frag_size(frag); ++ ++ if (pos < offset) { ++ frag->page_offset += offset - pos; ++ skb_frag_size_sub(frag, offset - pos); ++ } ++ ++ skb_shinfo(nskb)->nr_frags++; ++ ++ if (pos + size <= offset + len) { ++ i++; ++ pos += size; ++ } else { ++ skb_frag_size_sub(frag, pos + size - (offset + len)); ++ goto skip_fraglist; ++ } ++ ++ frag++; ++ } ++ ++ if (pos < offset + len) { ++ struct sk_buff *fskb2 = fskb; ++ ++ BUG_ON(pos + fskb->len != offset + len); ++ ++ pos += fskb->len; ++ fskb = fskb->next; ++ ++ if (fskb2->next) { ++ fskb2 = skb_clone(fskb2, GFP_ATOMIC); ++ if (!fskb2) ++ goto err; ++ } else ++ skb_get(fskb2); ++ ++ SKB_FRAG_ASSERT(nskb); ++ 
skb_shinfo(nskb)->frag_list = fskb2; ++ } ++ ++skip_fraglist: ++ nskb->data_len = len - hsize; ++ nskb->len += nskb->data_len; ++ nskb->truesize += nskb->data_len; ++ ++perform_csum_check: ++ if (!csum) { ++ nskb->csum = skb_checksum(nskb, doffset, ++ nskb->len - doffset, 0); ++ nskb->ip_summed = CHECKSUM_NONE; ++ } ++ } while ((offset += len) < skb->len); ++ ++ return segs; ++ ++err: ++ while ((skb = segs)) { ++ segs = skb->next; ++ kfree_skb(skb); ++ } ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(skb_segment); ++ ++int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) ++{ ++ struct sk_buff *p = *head; ++ struct sk_buff *nskb; ++ struct skb_shared_info *skbinfo = skb_shinfo(skb); ++ struct skb_shared_info *pinfo = skb_shinfo(p); ++ unsigned int headroom; ++ unsigned int len = skb_gro_len(skb); ++ unsigned int offset = skb_gro_offset(skb); ++ unsigned int headlen = skb_headlen(skb); ++ unsigned int delta_truesize; ++ ++ if (p->len + len >= 65536) ++ return -E2BIG; ++ ++ if (pinfo->frag_list) ++ goto merge; ++ else if (headlen <= offset) { ++ skb_frag_t *frag; ++ skb_frag_t *frag2; ++ int i = skbinfo->nr_frags; ++ int nr_frags = pinfo->nr_frags + i; ++ ++ offset -= headlen; ++ ++ if (nr_frags > MAX_SKB_FRAGS) ++ return -E2BIG; ++ ++ pinfo->nr_frags = nr_frags; ++ skbinfo->nr_frags = 0; ++ ++ frag = pinfo->frags + nr_frags; ++ frag2 = skbinfo->frags + i; ++ do { ++ *--frag = *--frag2; ++ } while (--i); ++ ++ frag->page_offset += offset; ++ skb_frag_size_sub(frag, offset); ++ ++ /* all fragments truesize : remove (head size + sk_buff) */ ++ delta_truesize = skb->truesize - ++ SKB_TRUESIZE(skb_end_offset(skb)); ++ ++ skb->truesize -= skb->data_len; ++ skb->len -= skb->data_len; ++ skb->data_len = 0; ++ ++ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; ++ goto done; ++ } else if (skb->head_frag) { ++ int nr_frags = pinfo->nr_frags; ++ skb_frag_t *frag = pinfo->frags + nr_frags; ++ struct page *page = virt_to_head_page(skb->head); ++ unsigned int first_size = headlen - offset; ++ unsigned int first_offset; ++ ++ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) ++ return -E2BIG; ++ ++ first_offset = skb->data - ++ (unsigned char *)page_address(page) + ++ offset; ++ ++ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; ++ ++ frag->page.p = page; ++ frag->page_offset = first_offset; ++ skb_frag_size_set(frag, first_size); ++ ++ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); ++ /* We dont need to clear skbinfo->nr_frags here */ ++ ++ delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; ++ goto done; ++ } else if (skb_gro_len(p) != pinfo->gso_size) ++ return -E2BIG; ++ ++ headroom = skb_headroom(p); ++ nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); ++ if (unlikely(!nskb)) ++ return -ENOMEM; ++ ++ __copy_skb_header(nskb, p); ++ nskb->mac_len = p->mac_len; ++ ++ skb_reserve(nskb, headroom); ++ __skb_put(nskb, skb_gro_offset(p)); ++ ++ skb_set_mac_header(nskb, skb_mac_header(p) - p->data); ++ skb_set_network_header(nskb, skb_network_offset(p)); ++ skb_set_transport_header(nskb, skb_transport_offset(p)); ++ ++ __skb_pull(p, skb_gro_offset(p)); ++ memcpy(skb_mac_header(nskb), skb_mac_header(p), ++ p->data - skb_mac_header(p)); ++ ++ skb_shinfo(nskb)->frag_list = p; ++ skb_shinfo(nskb)->gso_size = pinfo->gso_size; ++ pinfo->gso_size = 0; ++ skb_header_release(p); ++ NAPI_GRO_CB(nskb)->last = p; ++ ++ nskb->data_len += p->len; ++ nskb->truesize += p->truesize; ++ nskb->len += p->len; ++ 
++ *head = nskb; ++ nskb->next = p->next; ++ p->next = NULL; ++ ++ p = nskb; ++ ++merge: ++ delta_truesize = skb->truesize; ++ if (offset > headlen) { ++ unsigned int eat = offset - headlen; ++ ++ skbinfo->frags[0].page_offset += eat; ++ skb_frag_size_sub(&skbinfo->frags[0], eat); ++ skb->data_len -= eat; ++ skb->len -= eat; ++ offset = headlen; ++ } ++ ++ __skb_pull(skb, offset); ++ ++ NAPI_GRO_CB(p)->last->next = skb; ++ NAPI_GRO_CB(p)->last = skb; ++ skb_header_release(skb); ++ ++done: ++ NAPI_GRO_CB(p)->count++; ++ p->data_len += len; ++ p->truesize += delta_truesize; ++ p->len += len; ++ ++ NAPI_GRO_CB(skb)->same_flow = 1; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(skb_gro_receive); ++ ++void __init skb_init(void) ++{ ++ skbuff_head_cache = kmem_cache_create("skbuff_head_cache", ++ sizeof(struct sk_buff), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", ++ (2*sizeof(struct sk_buff)) + ++ sizeof(atomic_t), ++ 0, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC, ++ NULL); ++} ++ ++/** ++ * skb_to_sgvec - Fill a scatter-gather list from a socket buffer ++ * @skb: Socket buffer containing the buffers to be mapped ++ * @sg: The scatter-gather list to map into ++ * @offset: The offset into the buffer's contents to start mapping ++ * @len: Length of buffer space to be mapped ++ * ++ * Fill the specified scatter-gather list with mappings/pointers into a ++ * region of the buffer space attached to a socket buffer. ++ */ ++static int ++__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) ++{ ++ int start = skb_headlen(skb); ++ int i, copy = start - offset; ++ struct sk_buff *frag_iter; ++ int elt = 0; ++ ++ if (copy > 0) { ++ if (copy > len) ++ copy = len; ++ sg_set_buf(sg, skb->data + offset, copy); ++ elt++; ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); ++ if ((copy = end - offset) > 0) { ++ skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ++ ++ if (copy > len) ++ copy = len; ++ sg_set_page(&sg[elt], skb_frag_page(frag), copy, ++ frag->page_offset+offset-start); ++ elt++; ++ if (!(len -= copy)) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ ++ skb_walk_frags(skb, frag_iter) { ++ int end; ++ ++ WARN_ON(start > offset + len); ++ ++ end = start + frag_iter->len; ++ if ((copy = end - offset) > 0) { ++ if (copy > len) ++ copy = len; ++ elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, ++ copy); ++ if ((len -= copy) == 0) ++ return elt; ++ offset += copy; ++ } ++ start = end; ++ } ++ BUG_ON(len); ++ return elt; ++} ++ ++int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) ++{ ++ int nsg = __skb_to_sgvec(skb, sg, offset, len); ++ ++ sg_mark_end(&sg[nsg - 1]); ++ ++ return nsg; ++} ++EXPORT_SYMBOL_GPL(skb_to_sgvec); ++ ++/** ++ * skb_cow_data - Check that a socket buffer's data buffers are writable ++ * @skb: The socket buffer to check. ++ * @tailbits: Amount of trailing space to be added ++ * @trailer: Returned pointer to the skb where the @tailbits space begins ++ * ++ * Make sure that the data buffers attached to a socket buffer are ++ * writable. If they are not, private copies are made of the data buffers ++ * and the socket buffer is set to use these instead. 
++ * ++ * If @tailbits is given, make sure that there is space to write @tailbits ++ * bytes of data beyond current end of socket buffer. @trailer will be ++ * set to point to the skb in which this space begins. ++ * ++ * The number of scatterlist elements required to completely map the ++ * COW'd and extended socket buffer will be returned. ++ */ ++int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) ++{ ++ int copyflag; ++ int elt; ++ struct sk_buff *skb1, **skb_p; ++ ++ /* If skb is cloned or its head is paged, reallocate ++ * head pulling out all the pages (pages are considered not writable ++ * at the moment even if they are anonymous). ++ */ ++ if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && ++ __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) ++ return -ENOMEM; ++ ++ /* Easy case. Most of packets will go this way. */ ++ if (!skb_has_frag_list(skb)) { ++ /* A little of trouble, not enough of space for trailer. ++ * This should not happen, when stack is tuned to generate ++ * good frames. OK, on miss we reallocate and reserve even more ++ * space, 128 bytes is fair. */ ++ ++ if (skb_tailroom(skb) < tailbits && ++ pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) ++ return -ENOMEM; ++ ++ /* Voila! */ ++ *trailer = skb; ++ return 1; ++ } ++ ++ /* Misery. We are in troubles, going to mincer fragments... */ ++ ++ elt = 1; ++ skb_p = &skb_shinfo(skb)->frag_list; ++ copyflag = 0; ++ ++ while ((skb1 = *skb_p) != NULL) { ++ int ntail = 0; ++ ++ /* The fragment is partially pulled by someone, ++ * this can happen on input. Copy it and everything ++ * after it. */ ++ ++ if (skb_shared(skb1)) ++ copyflag = 1; ++ ++ /* If the skb is the last, worry about trailer. */ ++ ++ if (skb1->next == NULL && tailbits) { ++ if (skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1) || ++ skb_tailroom(skb1) < tailbits) ++ ntail = tailbits + 128; ++ } ++ ++ if (copyflag || ++ skb_cloned(skb1) || ++ ntail || ++ skb_shinfo(skb1)->nr_frags || ++ skb_has_frag_list(skb1)) { ++ struct sk_buff *skb2; ++ ++ /* Fuck, we are miserable poor guys... */ ++ if (ntail == 0) ++ skb2 = skb_copy(skb1, GFP_ATOMIC); ++ else ++ skb2 = skb_copy_expand(skb1, ++ skb_headroom(skb1), ++ ntail, ++ GFP_ATOMIC); ++ if (unlikely(skb2 == NULL)) ++ return -ENOMEM; ++ ++ if (skb1->sk) ++ skb_set_owner_w(skb2, skb1->sk); ++ ++ /* Looking around. Are we still alive? 
++ * OK, link new skb, drop old one */ ++ ++ skb2->next = skb1->next; ++ *skb_p = skb2; ++ kfree_skb(skb1); ++ skb1 = skb2; ++ } ++ elt++; ++ *trailer = skb1; ++ skb_p = &skb1->next; ++ } ++ ++ return elt; ++} ++EXPORT_SYMBOL_GPL(skb_cow_data); ++ ++static void sock_rmem_free(struct sk_buff *skb) ++{ ++ struct sock *sk = skb->sk; ++ ++ atomic_sub(skb->truesize, &sk->sk_rmem_alloc); ++} ++ ++/* ++ * Note: We dont mem charge error packets (no sk_forward_alloc changes) ++ */ ++int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ int len = skb->len; ++ ++ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= ++ (unsigned int)sk->sk_rcvbuf) ++ return -ENOMEM; ++ ++ skb_orphan(skb); ++ skb->sk = sk; ++ skb->destructor = sock_rmem_free; ++ atomic_add(skb->truesize, &sk->sk_rmem_alloc); ++ ++ /* before exiting rcu section, make sure dst is refcounted */ ++ skb_dst_force(skb); ++ ++ skb_queue_tail(&sk->sk_error_queue, skb); ++ if (!sock_flag(sk, SOCK_DEAD)) ++ sk->sk_data_ready(sk, len); ++ return 0; ++} ++EXPORT_SYMBOL(sock_queue_err_skb); ++ ++void skb_tstamp_tx(struct sk_buff *orig_skb, ++ struct skb_shared_hwtstamps *hwtstamps) ++{ ++ struct sock *sk = orig_skb->sk; ++ struct sock_exterr_skb *serr; ++ struct sk_buff *skb; ++ int err; ++ ++ if (!sk) ++ return; ++ ++ if (hwtstamps) { ++ *skb_hwtstamps(orig_skb) = ++ *hwtstamps; ++ } else { ++ /* ++ * no hardware time stamps available, ++ * so keep the shared tx_flags and only ++ * store software time stamp ++ */ ++ orig_skb->tstamp = ktime_get_real(); ++ } ++ ++ skb = skb_clone(orig_skb, GFP_ATOMIC); ++ if (!skb) ++ return; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; ++ ++ err = sock_queue_err_skb(sk, skb); ++ ++ if (err) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_tstamp_tx); ++ ++void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) ++{ ++ struct sock *sk = skb->sk; ++ struct sock_exterr_skb *serr; ++ int err; ++ ++ skb->wifi_acked_valid = 1; ++ skb->wifi_acked = acked; ++ ++ serr = SKB_EXT_ERR(skb); ++ memset(serr, 0, sizeof(*serr)); ++ serr->ee.ee_errno = ENOMSG; ++ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; ++ ++ err = sock_queue_err_skb(sk, skb); ++ if (err) ++ kfree_skb(skb); ++} ++EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); ++ ++ ++/** ++ * skb_partial_csum_set - set up and verify partial csum values for packet ++ * @skb: the skb to set ++ * @start: the number of bytes after skb->data to start checksumming. ++ * @off: the offset from start to place the checksum. ++ * ++ * For untrusted partially-checksummed packets, we need to make sure the values ++ * for skb->csum_start and skb->csum_offset are valid so we don't oops. ++ * ++ * This function checks and sets those values and skb->ip_summed: if this ++ * returns false you should drop the packet. 
++ */ ++bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) ++{ ++ if (unlikely(start > skb_headlen(skb)) || ++ unlikely((int)start + off > skb_headlen(skb) - 2)) { ++ net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", ++ start, off, skb_headlen(skb)); ++ return false; ++ } ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum_start = skb_headroom(skb) + start; ++ skb->csum_offset = off; ++ skb_set_transport_header(skb, start); ++ return true; ++} ++EXPORT_SYMBOL_GPL(skb_partial_csum_set); ++ ++void __skb_warn_lro_forwarding(const struct sk_buff *skb) ++{ ++ net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", ++ skb->dev->name); ++} ++EXPORT_SYMBOL(__skb_warn_lro_forwarding); ++ ++void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) ++{ ++ if (head_stolen) { ++ skb_release_head_state(skb); ++ kmem_cache_free(skbuff_head_cache, skb); ++ } else { ++ __kfree_skb(skb); ++ } ++} ++EXPORT_SYMBOL(kfree_skb_partial); ++ ++/** ++ * skb_try_coalesce - try to merge skb to prior one ++ * @to: prior buffer ++ * @from: buffer to add ++ * @fragstolen: pointer to boolean ++ * @delta_truesize: how much more was allocated than was requested ++ */ ++bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, ++ bool *fragstolen, int *delta_truesize) ++{ ++ int i, delta, len = from->len; ++ ++ *fragstolen = false; ++ ++ if (skb_cloned(to)) ++ return false; ++ ++ if (len <= skb_tailroom(to)) { ++ BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); ++ *delta_truesize = 0; ++ return true; ++ } ++ ++ if (skb_has_frag_list(to) || skb_has_frag_list(from)) ++ return false; ++ ++ if (skb_headlen(from) != 0) { ++ struct page *page; ++ unsigned int offset; ++ ++ if (skb_shinfo(to)->nr_frags + ++ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) ++ return false; ++ ++ if (skb_head_is_locked(from)) ++ return false; ++ ++ delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); ++ ++ page = virt_to_head_page(from->head); ++ offset = from->data - (unsigned char *)page_address(page); ++ ++ skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, ++ page, offset, skb_headlen(from)); ++ *fragstolen = true; ++ } else { ++ if (skb_shinfo(to)->nr_frags + ++ skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) ++ return false; ++ ++ delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); ++ } ++ ++ WARN_ON_ONCE(delta < len); ++ ++ memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, ++ skb_shinfo(from)->frags, ++ skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); ++ skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; ++ ++ if (!skb_cloned(from)) ++ skb_shinfo(from)->nr_frags = 0; ++ ++ /* if the skb is not cloned this does nothing ++ * since we set nr_frags to 0. 
++ */ ++ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) ++ skb_frag_ref(from, i); ++ ++ to->truesize += delta; ++ to->len += len; ++ to->data_len += len; ++ ++ *delta_truesize = delta; ++ return true; ++} ++EXPORT_SYMBOL(skb_try_coalesce); +diff -ruN linux-3.10.27/net/ipv6/ip6_output.c linux-3.10.27-imq/net/ipv6/ip6_output.c +--- linux-3.10.27/net/ipv6/ip6_output.c 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/ipv6/ip6_output.c 2014-01-18 10:19:59.348342972 +0100 +@@ -89,9 +89,6 @@ + struct in6_addr *nexthop; + int ret; + +- skb->protocol = htons(ETH_P_IPV6); +- skb->dev = dev; +- + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + +@@ -168,6 +165,13 @@ + return 0; + } + ++ /* ++ * IMQ-patch: moved setting skb->dev and skb->protocol from ++ * ip6_finish_output2 to fix crashing at netif_skb_features(). ++ */ ++ skb->protocol = htons(ETH_P_IPV6); ++ skb->dev = dev; ++ + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); +diff -ruN linux-3.10.27/net/ipv6/ip6_output.c.orig linux-3.10.27-imq/net/ipv6/ip6_output.c.orig +--- linux-3.10.27/net/ipv6/ip6_output.c.orig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/net/ipv6/ip6_output.c.orig 2014-01-16 00:29:14.000000000 +0100 +@@ -0,0 +1,1580 @@ ++/* ++ * IPv6 output functions ++ * Linux INET6 implementation ++ * ++ * Authors: ++ * Pedro Roque ++ * ++ * Based on linux/net/ipv4/ip_output.c ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Changes: ++ * A.N.Kuznetsov : airthmetics in fragmentation. ++ * extension headers are implemented. ++ * route changes now work. ++ * ip6_forward does not confuse sniffers. ++ * etc. ++ * ++ * H. 
von Brand : Added missing #include ++ * Imran Patel : frag id should be in NBO ++ * Kazunori MIYAZAWA @USAGI ++ * : add ip6_append_data and related functions ++ * for datagram xmit ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int __ip6_local_out(struct sk_buff *skb) ++{ ++ int len; ++ ++ len = skb->len - sizeof(struct ipv6hdr); ++ if (len > IPV6_MAXPLEN) ++ len = 0; ++ ipv6_hdr(skb)->payload_len = htons(len); ++ ++ return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, ++ skb_dst(skb)->dev, dst_output); ++} ++ ++int ip6_local_out(struct sk_buff *skb) ++{ ++ int err; ++ ++ err = __ip6_local_out(skb); ++ if (likely(err == 1)) ++ err = dst_output(skb); ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(ip6_local_out); ++ ++static int ip6_finish_output2(struct sk_buff *skb) ++{ ++ struct dst_entry *dst = skb_dst(skb); ++ struct net_device *dev = dst->dev; ++ struct neighbour *neigh; ++ struct in6_addr *nexthop; ++ int ret; ++ ++ skb->protocol = htons(ETH_P_IPV6); ++ skb->dev = dev; ++ ++ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { ++ struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); ++ ++ if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) && ++ ((mroute6_socket(dev_net(dev), skb) && ++ !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ++ ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, ++ &ipv6_hdr(skb)->saddr))) { ++ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); ++ ++ /* Do not check for IFF_ALLMULTI; multicast routing ++ is not supported in any case. ++ */ ++ if (newskb) ++ NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, ++ newskb, NULL, newskb->dev, ++ dev_loopback_xmit); ++ ++ if (ipv6_hdr(skb)->hop_limit == 0) { ++ IP6_INC_STATS(dev_net(dev), idev, ++ IPSTATS_MIB_OUTDISCARDS); ++ kfree_skb(skb); ++ return 0; ++ } ++ } ++ ++ IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, ++ skb->len); ++ ++ if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= ++ IPV6_ADDR_SCOPE_NODELOCAL && ++ !(dev->flags & IFF_LOOPBACK)) { ++ kfree_skb(skb); ++ return 0; ++ } ++ } ++ ++ rcu_read_lock_bh(); ++ nexthop = rt6_nexthop((struct rt6_info *)dst); ++ neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); ++ if (unlikely(!neigh)) ++ neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); ++ if (!IS_ERR(neigh)) { ++ ret = dst_neigh_output(dst, neigh, skb); ++ rcu_read_unlock_bh(); ++ return ret; ++ } ++ rcu_read_unlock_bh(); ++ ++ IP6_INC_STATS(dev_net(dst->dev), ++ ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); ++ kfree_skb(skb); ++ return -EINVAL; ++} ++ ++static int ip6_finish_output(struct sk_buff *skb) ++{ ++ if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || ++ dst_allfrag(skb_dst(skb)) || ++ (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) ++ return ip6_fragment(skb, ip6_finish_output2); ++ else ++ return ip6_finish_output2(skb); ++} ++ ++int ip6_output(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb_dst(skb)->dev; ++ struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); ++ if (unlikely(idev->cnf.disable_ipv6)) { ++ IP6_INC_STATS(dev_net(dev), idev, ++ IPSTATS_MIB_OUTDISCARDS); ++ kfree_skb(skb); ++ return 0; ++ } ++ ++ return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, ++ ip6_finish_output, ++ !(IP6CB(skb)->flags & IP6SKB_REROUTED)); ++} ++ ++/* ++ * xmit an sk_buff (used by 
TCP, SCTP and DCCP) ++ */ ++ ++int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, ++ struct ipv6_txoptions *opt, int tclass) ++{ ++ struct net *net = sock_net(sk); ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ struct in6_addr *first_hop = &fl6->daddr; ++ struct dst_entry *dst = skb_dst(skb); ++ struct ipv6hdr *hdr; ++ u8 proto = fl6->flowi6_proto; ++ int seg_len = skb->len; ++ int hlimit = -1; ++ u32 mtu; ++ ++ if (opt) { ++ unsigned int head_room; ++ ++ /* First: exthdrs may take lots of space (~8K for now) ++ MAX_HEADER is not enough. ++ */ ++ head_room = opt->opt_nflen + opt->opt_flen; ++ seg_len += head_room; ++ head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); ++ ++ if (skb_headroom(skb) < head_room) { ++ struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); ++ if (skb2 == NULL) { ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_OUTDISCARDS); ++ kfree_skb(skb); ++ return -ENOBUFS; ++ } ++ consume_skb(skb); ++ skb = skb2; ++ skb_set_owner_w(skb, sk); ++ } ++ if (opt->opt_flen) ++ ipv6_push_frag_opts(skb, opt, &proto); ++ if (opt->opt_nflen) ++ ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); ++ } ++ ++ skb_push(skb, sizeof(struct ipv6hdr)); ++ skb_reset_network_header(skb); ++ hdr = ipv6_hdr(skb); ++ ++ /* ++ * Fill in the IPv6 header ++ */ ++ if (np) ++ hlimit = np->hop_limit; ++ if (hlimit < 0) ++ hlimit = ip6_dst_hoplimit(dst); ++ ++ ip6_flow_hdr(hdr, tclass, fl6->flowlabel); ++ ++ hdr->payload_len = htons(seg_len); ++ hdr->nexthdr = proto; ++ hdr->hop_limit = hlimit; ++ ++ hdr->saddr = fl6->saddr; ++ hdr->daddr = *first_hop; ++ ++ skb->priority = sk->sk_priority; ++ skb->mark = sk->sk_mark; ++ ++ mtu = dst_mtu(dst); ++ if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { ++ IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_OUT, skb->len); ++ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, ++ dst->dev, dst_output); ++ } ++ ++ skb->dev = dst->dev; ++ ipv6_local_error(sk, EMSGSIZE, fl6, mtu); ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); ++ kfree_skb(skb); ++ return -EMSGSIZE; ++} ++ ++EXPORT_SYMBOL(ip6_xmit); ++ ++static int ip6_call_ra_chain(struct sk_buff *skb, int sel) ++{ ++ struct ip6_ra_chain *ra; ++ struct sock *last = NULL; ++ ++ read_lock(&ip6_ra_lock); ++ for (ra = ip6_ra_chain; ra; ra = ra->next) { ++ struct sock *sk = ra->sk; ++ if (sk && ra->sel == sel && ++ (!sk->sk_bound_dev_if || ++ sk->sk_bound_dev_if == skb->dev->ifindex)) { ++ if (last) { ++ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); ++ if (skb2) ++ rawv6_rcv(last, skb2); ++ } ++ last = sk; ++ } ++ } ++ ++ if (last) { ++ rawv6_rcv(last, skb); ++ read_unlock(&ip6_ra_lock); ++ return 1; ++ } ++ read_unlock(&ip6_ra_lock); ++ return 0; ++} ++ ++static int ip6_forward_proxy_check(struct sk_buff *skb) ++{ ++ struct ipv6hdr *hdr = ipv6_hdr(skb); ++ u8 nexthdr = hdr->nexthdr; ++ __be16 frag_off; ++ int offset; ++ ++ if (ipv6_ext_hdr(nexthdr)) { ++ offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); ++ if (offset < 0) ++ return 0; ++ } else ++ offset = sizeof(struct ipv6hdr); ++ ++ if (nexthdr == IPPROTO_ICMPV6) { ++ struct icmp6hdr *icmp6; ++ ++ if (!pskb_may_pull(skb, (skb_network_header(skb) + ++ offset + 1 - skb->data))) ++ return 0; ++ ++ icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); ++ ++ switch (icmp6->icmp6_type) { ++ case NDISC_ROUTER_SOLICITATION: ++ case NDISC_ROUTER_ADVERTISEMENT: ++ case NDISC_NEIGHBOUR_SOLICITATION: ++ case 
NDISC_NEIGHBOUR_ADVERTISEMENT: ++ case NDISC_REDIRECT: ++ /* For reaction involving unicast neighbor discovery ++ * message destined to the proxied address, pass it to ++ * input function. ++ */ ++ return 1; ++ default: ++ break; ++ } ++ } ++ ++ /* ++ * The proxying router can't forward traffic sent to a link-local ++ * address, so signal the sender and discard the packet. This ++ * behavior is clarified by the MIPv6 specification. ++ */ ++ if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { ++ dst_link_failure(skb); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static inline int ip6_forward_finish(struct sk_buff *skb) ++{ ++ return dst_output(skb); ++} ++ ++int ip6_forward(struct sk_buff *skb) ++{ ++ struct dst_entry *dst = skb_dst(skb); ++ struct ipv6hdr *hdr = ipv6_hdr(skb); ++ struct inet6_skb_parm *opt = IP6CB(skb); ++ struct net *net = dev_net(dst->dev); ++ u32 mtu; ++ ++ if (net->ipv6.devconf_all->forwarding == 0) ++ goto error; ++ ++ if (skb_warn_if_lro(skb)) ++ goto drop; ++ ++ if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { ++ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); ++ goto drop; ++ } ++ ++ if (skb->pkt_type != PACKET_HOST) ++ goto drop; ++ ++ skb_forward_csum(skb); ++ ++ /* ++ * We DO NOT make any processing on ++ * RA packets, pushing them to user level AS IS ++ * without ane WARRANTY that application will be able ++ * to interpret them. The reason is that we ++ * cannot make anything clever here. ++ * ++ * We are not end-node, so that if packet contains ++ * AH/ESP, we cannot make anything. ++ * Defragmentation also would be mistake, RA packets ++ * cannot be fragmented, because there is no warranty ++ * that different fragments will go along one path. --ANK ++ */ ++ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { ++ if (ip6_call_ra_chain(skb, ntohs(opt->ra))) ++ return 0; ++ } ++ ++ /* ++ * check and decrement ttl ++ */ ++ if (hdr->hop_limit <= 1) { ++ /* Force OUTPUT device used as source address */ ++ skb->dev = dst->dev; ++ icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); ++ IP6_INC_STATS_BH(net, ++ ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); ++ ++ kfree_skb(skb); ++ return -ETIMEDOUT; ++ } ++ ++ /* XXX: idev->cnf.proxy_ndp? */ ++ if (net->ipv6.devconf_all->proxy_ndp && ++ pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { ++ int proxied = ip6_forward_proxy_check(skb); ++ if (proxied > 0) ++ return ip6_input(skb); ++ else if (proxied < 0) { ++ IP6_INC_STATS(net, ip6_dst_idev(dst), ++ IPSTATS_MIB_INDISCARDS); ++ goto drop; ++ } ++ } ++ ++ if (!xfrm6_route_forward(skb)) { ++ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); ++ goto drop; ++ } ++ dst = skb_dst(skb); ++ ++ /* IPv6 specs say nothing about it, but it is clear that we cannot ++ send redirects to source routed frames. ++ We don't send redirects to frames decapsulated from IPsec. ++ */ ++ if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { ++ struct in6_addr *target = NULL; ++ struct inet_peer *peer; ++ struct rt6_info *rt; ++ ++ /* ++ * incoming and outgoing devices are the same ++ * send a redirect. 
++ */ ++ ++ rt = (struct rt6_info *) dst; ++ if (rt->rt6i_flags & RTF_GATEWAY) ++ target = &rt->rt6i_gateway; ++ else ++ target = &hdr->daddr; ++ ++ peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); ++ ++ /* Limit redirects both by destination (here) ++ and by source (inside ndisc_send_redirect) ++ */ ++ if (inet_peer_xrlim_allow(peer, 1*HZ)) ++ ndisc_send_redirect(skb, target); ++ if (peer) ++ inet_putpeer(peer); ++ } else { ++ int addrtype = ipv6_addr_type(&hdr->saddr); ++ ++ /* This check is security critical. */ ++ if (addrtype == IPV6_ADDR_ANY || ++ addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) ++ goto error; ++ if (addrtype & IPV6_ADDR_LINKLOCAL) { ++ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ++ ICMPV6_NOT_NEIGHBOUR, 0); ++ goto error; ++ } ++ } ++ ++ mtu = dst_mtu(dst); ++ if (mtu < IPV6_MIN_MTU) ++ mtu = IPV6_MIN_MTU; ++ ++ if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) || ++ (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { ++ /* Again, force OUTPUT device used as source address */ ++ skb->dev = dst->dev; ++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ++ IP6_INC_STATS_BH(net, ++ ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); ++ IP6_INC_STATS_BH(net, ++ ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); ++ kfree_skb(skb); ++ return -EMSGSIZE; ++ } ++ ++ if (skb_cow(skb, dst->dev->hard_header_len)) { ++ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); ++ goto drop; ++ } ++ ++ hdr = ipv6_hdr(skb); ++ ++ /* Mangling hops number delayed to point after skb COW */ ++ ++ hdr->hop_limit--; ++ ++ IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); ++ IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); ++ return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, ++ ip6_forward_finish); ++ ++error: ++ IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); ++drop: ++ kfree_skb(skb); ++ return -EINVAL; ++} ++ ++static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) ++{ ++ to->pkt_type = from->pkt_type; ++ to->priority = from->priority; ++ to->protocol = from->protocol; ++ skb_dst_drop(to); ++ skb_dst_set(to, dst_clone(skb_dst(from))); ++ to->dev = from->dev; ++ to->mark = from->mark; ++ ++#ifdef CONFIG_NET_SCHED ++ to->tc_index = from->tc_index; ++#endif ++ nf_copy(to, from); ++#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) ++ to->nf_trace = from->nf_trace; ++#endif ++ skb_copy_secmark(to, from); ++} ++ ++int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) ++{ ++ struct sk_buff *frag; ++ struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); ++ struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; ++ struct ipv6hdr *tmp_hdr; ++ struct frag_hdr *fh; ++ unsigned int mtu, hlen, left, len; ++ int hroom, troom; ++ __be32 frag_id = 0; ++ int ptr, offset = 0, err=0; ++ u8 *prevhdr, nexthdr = 0; ++ struct net *net = dev_net(skb_dst(skb)->dev); ++ ++ hlen = ip6_find_1stfragopt(skb, &prevhdr); ++ nexthdr = *prevhdr; ++ ++ mtu = ip6_skb_dst_mtu(skb); ++ ++ /* We must not fragment if the socket is set to force MTU discovery ++ * or if the skb it not generated by a local socket. 
++ */ ++ if (unlikely(!skb->local_df && skb->len > mtu) || ++ (IP6CB(skb)->frag_max_size && ++ IP6CB(skb)->frag_max_size > mtu)) { ++ if (skb->sk && dst_allfrag(skb_dst(skb))) ++ sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); ++ ++ skb->dev = skb_dst(skb)->dev; ++ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_FRAGFAILS); ++ kfree_skb(skb); ++ return -EMSGSIZE; ++ } ++ ++ if (np && np->frag_size < mtu) { ++ if (np->frag_size) ++ mtu = np->frag_size; ++ } ++ mtu -= hlen + sizeof(struct frag_hdr); ++ ++ if (skb_has_frag_list(skb)) { ++ int first_len = skb_pagelen(skb); ++ struct sk_buff *frag2; ++ ++ if (first_len - hlen > mtu || ++ ((first_len - hlen) & 7) || ++ skb_cloned(skb)) ++ goto slow_path; ++ ++ skb_walk_frags(skb, frag) { ++ /* Correct geometry. */ ++ if (frag->len > mtu || ++ ((frag->len & 7) && frag->next) || ++ skb_headroom(frag) < hlen) ++ goto slow_path_clean; ++ ++ /* Partially cloned skb? */ ++ if (skb_shared(frag)) ++ goto slow_path_clean; ++ ++ BUG_ON(frag->sk); ++ if (skb->sk) { ++ frag->sk = skb->sk; ++ frag->destructor = sock_wfree; ++ } ++ skb->truesize -= frag->truesize; ++ } ++ ++ err = 0; ++ offset = 0; ++ frag = skb_shinfo(skb)->frag_list; ++ skb_frag_list_init(skb); ++ /* BUILD HEADER */ ++ ++ *prevhdr = NEXTHDR_FRAGMENT; ++ tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); ++ if (!tmp_hdr) { ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_FRAGFAILS); ++ return -ENOMEM; ++ } ++ ++ __skb_pull(skb, hlen); ++ fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); ++ __skb_push(skb, hlen); ++ skb_reset_network_header(skb); ++ memcpy(skb_network_header(skb), tmp_hdr, hlen); ++ ++ ipv6_select_ident(fh, rt); ++ fh->nexthdr = nexthdr; ++ fh->reserved = 0; ++ fh->frag_off = htons(IP6_MF); ++ frag_id = fh->identification; ++ ++ first_len = skb_pagelen(skb); ++ skb->data_len = first_len - skb_headlen(skb); ++ skb->len = first_len; ++ ipv6_hdr(skb)->payload_len = htons(first_len - ++ sizeof(struct ipv6hdr)); ++ ++ dst_hold(&rt->dst); ++ ++ for (;;) { ++ /* Prepare header of the next frame, ++ * before previous one went down. 
*/ ++ if (frag) { ++ frag->ip_summed = CHECKSUM_NONE; ++ skb_reset_transport_header(frag); ++ fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); ++ __skb_push(frag, hlen); ++ skb_reset_network_header(frag); ++ memcpy(skb_network_header(frag), tmp_hdr, ++ hlen); ++ offset += skb->len - hlen - sizeof(struct frag_hdr); ++ fh->nexthdr = nexthdr; ++ fh->reserved = 0; ++ fh->frag_off = htons(offset); ++ if (frag->next != NULL) ++ fh->frag_off |= htons(IP6_MF); ++ fh->identification = frag_id; ++ ipv6_hdr(frag)->payload_len = ++ htons(frag->len - ++ sizeof(struct ipv6hdr)); ++ ip6_copy_metadata(frag, skb); ++ } ++ ++ err = output(skb); ++ if(!err) ++ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), ++ IPSTATS_MIB_FRAGCREATES); ++ ++ if (err || !frag) ++ break; ++ ++ skb = frag; ++ frag = skb->next; ++ skb->next = NULL; ++ } ++ ++ kfree(tmp_hdr); ++ ++ if (err == 0) { ++ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), ++ IPSTATS_MIB_FRAGOKS); ++ ip6_rt_put(rt); ++ return 0; ++ } ++ ++ while (frag) { ++ skb = frag->next; ++ kfree_skb(frag); ++ frag = skb; ++ } ++ ++ IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), ++ IPSTATS_MIB_FRAGFAILS); ++ ip6_rt_put(rt); ++ return err; ++ ++slow_path_clean: ++ skb_walk_frags(skb, frag2) { ++ if (frag2 == frag) ++ break; ++ frag2->sk = NULL; ++ frag2->destructor = NULL; ++ skb->truesize += frag2->truesize; ++ } ++ } ++ ++slow_path: ++ if ((skb->ip_summed == CHECKSUM_PARTIAL) && ++ skb_checksum_help(skb)) ++ goto fail; ++ ++ left = skb->len - hlen; /* Space per frame */ ++ ptr = hlen; /* Where to start from */ ++ ++ /* ++ * Fragment the datagram. ++ */ ++ ++ *prevhdr = NEXTHDR_FRAGMENT; ++ hroom = LL_RESERVED_SPACE(rt->dst.dev); ++ troom = rt->dst.dev->needed_tailroom; ++ ++ /* ++ * Keep copying data until we run out. ++ */ ++ while(left > 0) { ++ len = left; ++ /* IF: it doesn't fit, use 'mtu' - the data space left */ ++ if (len > mtu) ++ len = mtu; ++ /* IF: we are not sending up to and including the packet end ++ then align the next start on an eight byte boundary */ ++ if (len < left) { ++ len &= ~7; ++ } ++ /* ++ * Allocate buffer. ++ */ ++ ++ if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + ++ hroom + troom, GFP_ATOMIC)) == NULL) { ++ NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_FRAGFAILS); ++ err = -ENOMEM; ++ goto fail; ++ } ++ ++ /* ++ * Set up data on packet ++ */ ++ ++ ip6_copy_metadata(frag, skb); ++ skb_reserve(frag, hroom); ++ skb_put(frag, len + hlen + sizeof(struct frag_hdr)); ++ skb_reset_network_header(frag); ++ fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); ++ frag->transport_header = (frag->network_header + hlen + ++ sizeof(struct frag_hdr)); ++ ++ /* ++ * Charge the memory for the fragment to any owner ++ * it might possess ++ */ ++ if (skb->sk) ++ skb_set_owner_w(frag, skb->sk); ++ ++ /* ++ * Copy the packet header into the new buffer. ++ */ ++ skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); ++ ++ /* ++ * Build fragment header. ++ */ ++ fh->nexthdr = nexthdr; ++ fh->reserved = 0; ++ if (!frag_id) { ++ ipv6_select_ident(fh, rt); ++ frag_id = fh->identification; ++ } else ++ fh->identification = frag_id; ++ ++ /* ++ * Copy a block of the IP datagram. 
++ */ ++ if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len)) ++ BUG(); ++ left -= len; ++ ++ fh->frag_off = htons(offset); ++ if (left > 0) ++ fh->frag_off |= htons(IP6_MF); ++ ipv6_hdr(frag)->payload_len = htons(frag->len - ++ sizeof(struct ipv6hdr)); ++ ++ ptr += len; ++ offset += len; ++ ++ /* ++ * Put this fragment into the sending queue. ++ */ ++ err = output(frag); ++ if (err) ++ goto fail; ++ ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_FRAGCREATES); ++ } ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_FRAGOKS); ++ consume_skb(skb); ++ return err; ++ ++fail: ++ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_FRAGFAILS); ++ kfree_skb(skb); ++ return err; ++} ++ ++static inline int ip6_rt_check(const struct rt6key *rt_key, ++ const struct in6_addr *fl_addr, ++ const struct in6_addr *addr_cache) ++{ ++ return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && ++ (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); ++} ++ ++static struct dst_entry *ip6_sk_dst_check(struct sock *sk, ++ struct dst_entry *dst, ++ const struct flowi6 *fl6) ++{ ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ struct rt6_info *rt; ++ ++ if (!dst) ++ goto out; ++ ++ if (dst->ops->family != AF_INET6) { ++ dst_release(dst); ++ return NULL; ++ } ++ ++ rt = (struct rt6_info *)dst; ++ /* Yes, checking route validity in not connected ++ * case is not very simple. Take into account, ++ * that we do not support routing by source, TOS, ++ * and MSG_DONTROUTE --ANK (980726) ++ * ++ * 1. ip6_rt_check(): If route was host route, ++ * check that cached destination is current. ++ * If it is network route, we still may ++ * check its validity using saved pointer ++ * to the last used address: daddr_cache. ++ * We do not want to save whole address now, ++ * (because main consumer of this service ++ * is tcp, which has not this problem), ++ * so that the last trick works only on connected ++ * sockets. ++ * 2. oif also should be the same. ++ */ ++ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || ++#ifdef CONFIG_IPV6_SUBTREES ++ ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || ++#endif ++ (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { ++ dst_release(dst); ++ dst = NULL; ++ } ++ ++out: ++ return dst; ++} ++ ++static int ip6_dst_lookup_tail(struct sock *sk, ++ struct dst_entry **dst, struct flowi6 *fl6) ++{ ++ struct net *net = sock_net(sk); ++#ifdef CONFIG_IPV6_OPTIMISTIC_DAD ++ struct neighbour *n; ++ struct rt6_info *rt; ++#endif ++ int err; ++ ++ if (*dst == NULL) ++ *dst = ip6_route_output(net, sk, fl6); ++ ++ if ((err = (*dst)->error)) ++ goto out_err_release; ++ ++ if (ipv6_addr_any(&fl6->saddr)) { ++ struct rt6_info *rt = (struct rt6_info *) *dst; ++ err = ip6_route_get_saddr(net, rt, &fl6->daddr, ++ sk ? inet6_sk(sk)->srcprefs : 0, ++ &fl6->saddr); ++ if (err) ++ goto out_err_release; ++ } ++ ++#ifdef CONFIG_IPV6_OPTIMISTIC_DAD ++ /* ++ * Here if the dst entry we've looked up ++ * has a neighbour entry that is in the INCOMPLETE ++ * state and the src address from the flow is ++ * marked as OPTIMISTIC, we release the found ++ * dst entry and replace it instead with the ++ * dst entry of the nexthop router ++ */ ++ rt = (struct rt6_info *) *dst; ++ rcu_read_lock_bh(); ++ n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt)); ++ err = n && !(n->nud_state & NUD_VALID) ? 
-EINVAL : 0; ++ rcu_read_unlock_bh(); ++ ++ if (err) { ++ struct inet6_ifaddr *ifp; ++ struct flowi6 fl_gw6; ++ int redirect; ++ ++ ifp = ipv6_get_ifaddr(net, &fl6->saddr, ++ (*dst)->dev, 1); ++ ++ redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); ++ if (ifp) ++ in6_ifa_put(ifp); ++ ++ if (redirect) { ++ /* ++ * We need to get the dst entry for the ++ * default router instead ++ */ ++ dst_release(*dst); ++ memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); ++ memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); ++ *dst = ip6_route_output(net, sk, &fl_gw6); ++ if ((err = (*dst)->error)) ++ goto out_err_release; ++ } ++ } ++#endif ++ ++ return 0; ++ ++out_err_release: ++ if (err == -ENETUNREACH) ++ IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES); ++ dst_release(*dst); ++ *dst = NULL; ++ return err; ++} ++ ++/** ++ * ip6_dst_lookup - perform route lookup on flow ++ * @sk: socket which provides route info ++ * @dst: pointer to dst_entry * for result ++ * @fl6: flow to lookup ++ * ++ * This function performs a route lookup on the given flow. ++ * ++ * It returns zero on success, or a standard errno code on error. ++ */ ++int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) ++{ ++ *dst = NULL; ++ return ip6_dst_lookup_tail(sk, dst, fl6); ++} ++EXPORT_SYMBOL_GPL(ip6_dst_lookup); ++ ++/** ++ * ip6_dst_lookup_flow - perform route lookup on flow with ipsec ++ * @sk: socket which provides route info ++ * @fl6: flow to lookup ++ * @final_dst: final destination address for ipsec lookup ++ * @can_sleep: we are in a sleepable context ++ * ++ * This function performs a route lookup on the given flow. ++ * ++ * It returns a valid dst pointer on success, or a pointer encoded ++ * error code. ++ */ ++struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, ++ const struct in6_addr *final_dst, ++ bool can_sleep) ++{ ++ struct dst_entry *dst = NULL; ++ int err; ++ ++ err = ip6_dst_lookup_tail(sk, &dst, fl6); ++ if (err) ++ return ERR_PTR(err); ++ if (final_dst) ++ fl6->daddr = *final_dst; ++ if (can_sleep) ++ fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; ++ ++ return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); ++} ++EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); ++ ++/** ++ * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow ++ * @sk: socket which provides the dst cache and route info ++ * @fl6: flow to lookup ++ * @final_dst: final destination address for ipsec lookup ++ * @can_sleep: we are in a sleepable context ++ * ++ * This function performs a route lookup on the given flow with the ++ * possibility of using the cached route in the socket if it is valid. ++ * It will take the socket dst lock when operating on the dst cache. ++ * As a result, this function can only be used in process context. ++ * ++ * It returns a valid dst pointer on success, or a pointer encoded ++ * error code. 
++ */ ++struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, ++ const struct in6_addr *final_dst, ++ bool can_sleep) ++{ ++ struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); ++ int err; ++ ++ dst = ip6_sk_dst_check(sk, dst, fl6); ++ ++ err = ip6_dst_lookup_tail(sk, &dst, fl6); ++ if (err) ++ return ERR_PTR(err); ++ if (final_dst) ++ fl6->daddr = *final_dst; ++ if (can_sleep) ++ fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; ++ ++ return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); ++} ++EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); ++ ++static inline int ip6_ufo_append_data(struct sock *sk, ++ int getfrag(void *from, char *to, int offset, int len, ++ int odd, struct sk_buff *skb), ++ void *from, int length, int hh_len, int fragheaderlen, ++ int transhdrlen, int mtu,unsigned int flags, ++ struct rt6_info *rt) ++ ++{ ++ struct sk_buff *skb; ++ int err; ++ ++ /* There is support for UDP large send offload by network ++ * device, so create one single skb packet containing complete ++ * udp datagram ++ */ ++ if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { ++ struct frag_hdr fhdr; ++ ++ skb = sock_alloc_send_skb(sk, ++ hh_len + fragheaderlen + transhdrlen + 20, ++ (flags & MSG_DONTWAIT), &err); ++ if (skb == NULL) ++ return err; ++ ++ /* reserve space for Hardware header */ ++ skb_reserve(skb, hh_len); ++ ++ /* create space for UDP/IP header */ ++ skb_put(skb,fragheaderlen + transhdrlen); ++ ++ /* initialize network header pointer */ ++ skb_reset_network_header(skb); ++ ++ /* initialize protocol header pointer */ ++ skb->transport_header = skb->network_header + fragheaderlen; ++ ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ skb->csum = 0; ++ ++ /* Specify the length of each IPv6 datagram fragment. ++ * It has to be a multiple of 8. ++ */ ++ skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - ++ sizeof(struct frag_hdr)) & ~7; ++ skb_shinfo(skb)->gso_type = SKB_GSO_UDP; ++ ipv6_select_ident(&fhdr, rt); ++ skb_shinfo(skb)->ip6_frag_id = fhdr.identification; ++ __skb_queue_tail(&sk->sk_write_queue, skb); ++ } ++ ++ return skb_append_datato_frags(sk, skb, getfrag, from, ++ (length - transhdrlen)); ++} ++ ++static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, ++ gfp_t gfp) ++{ ++ return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; ++} ++ ++static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, ++ gfp_t gfp) ++{ ++ return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; ++} ++ ++static void ip6_append_data_mtu(unsigned int *mtu, ++ int *maxfraglen, ++ unsigned int fragheaderlen, ++ struct sk_buff *skb, ++ struct rt6_info *rt, ++ bool pmtuprobe) ++{ ++ if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { ++ if (skb == NULL) { ++ /* first fragment, reserve header_len */ ++ *mtu = *mtu - rt->dst.header_len; ++ ++ } else { ++ /* ++ * this fragment is not first, the headers ++ * space is regarded as data space. ++ */ ++ *mtu = min(*mtu, pmtuprobe ? 
++ rt->dst.dev->mtu : ++ dst_mtu(rt->dst.path)); ++ } ++ *maxfraglen = ((*mtu - fragheaderlen) & ~7) ++ + fragheaderlen - sizeof(struct frag_hdr); ++ } ++} ++ ++int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, ++ int offset, int len, int odd, struct sk_buff *skb), ++ void *from, int length, int transhdrlen, ++ int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, ++ struct rt6_info *rt, unsigned int flags, int dontfrag) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ struct inet_cork *cork; ++ struct sk_buff *skb, *skb_prev = NULL; ++ unsigned int maxfraglen, fragheaderlen, mtu; ++ int exthdrlen; ++ int dst_exthdrlen; ++ int hh_len; ++ int copy; ++ int err; ++ int offset = 0; ++ __u8 tx_flags = 0; ++ ++ if (flags&MSG_PROBE) ++ return 0; ++ cork = &inet->cork.base; ++ if (skb_queue_empty(&sk->sk_write_queue)) { ++ /* ++ * setup for corking ++ */ ++ if (opt) { ++ if (WARN_ON(np->cork.opt)) ++ return -EINVAL; ++ ++ np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); ++ if (unlikely(np->cork.opt == NULL)) ++ return -ENOBUFS; ++ ++ np->cork.opt->tot_len = opt->tot_len; ++ np->cork.opt->opt_flen = opt->opt_flen; ++ np->cork.opt->opt_nflen = opt->opt_nflen; ++ ++ np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, ++ sk->sk_allocation); ++ if (opt->dst0opt && !np->cork.opt->dst0opt) ++ return -ENOBUFS; ++ ++ np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, ++ sk->sk_allocation); ++ if (opt->dst1opt && !np->cork.opt->dst1opt) ++ return -ENOBUFS; ++ ++ np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, ++ sk->sk_allocation); ++ if (opt->hopopt && !np->cork.opt->hopopt) ++ return -ENOBUFS; ++ ++ np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, ++ sk->sk_allocation); ++ if (opt->srcrt && !np->cork.opt->srcrt) ++ return -ENOBUFS; ++ ++ /* need source address above miyazawa*/ ++ } ++ dst_hold(&rt->dst); ++ cork->dst = &rt->dst; ++ inet->cork.fl.u.ip6 = *fl6; ++ np->cork.hop_limit = hlimit; ++ np->cork.tclass = tclass; ++ if (rt->dst.flags & DST_XFRM_TUNNEL) ++ mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? ++ rt->dst.dev->mtu : dst_mtu(&rt->dst); ++ else ++ mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? ++ rt->dst.dev->mtu : dst_mtu(rt->dst.path); ++ if (np->frag_size < mtu) { ++ if (np->frag_size) ++ mtu = np->frag_size; ++ } ++ cork->fragsize = mtu; ++ if (dst_allfrag(rt->dst.path)) ++ cork->flags |= IPCORK_ALLFRAG; ++ cork->length = 0; ++ exthdrlen = (opt ? opt->opt_flen : 0); ++ length += exthdrlen; ++ transhdrlen += exthdrlen; ++ dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; ++ } else { ++ rt = (struct rt6_info *)cork->dst; ++ fl6 = &inet->cork.fl.u.ip6; ++ opt = np->cork.opt; ++ transhdrlen = 0; ++ exthdrlen = 0; ++ dst_exthdrlen = 0; ++ mtu = cork->fragsize; ++ } ++ ++ hh_len = LL_RESERVED_SPACE(rt->dst.dev); ++ ++ fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + ++ (opt ? opt->opt_nflen : 0); ++ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); ++ ++ if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { ++ if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { ++ ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen); ++ return -EMSGSIZE; ++ } ++ } ++ ++ /* For UDP, check if TX timestamp is enabled */ ++ if (sk->sk_type == SOCK_DGRAM) ++ sock_tx_timestamp(sk, &tx_flags); ++ ++ /* ++ * Let's try using as much space as possible. ++ * Use MTU if total length of the message fits into the MTU. 
++ * Otherwise, we need to reserve fragment header and ++ * fragment alignment (= 8-15 octects, in total). ++ * ++ * Note that we may need to "move" the data from the tail of ++ * of the buffer to the new fragment when we split ++ * the message. ++ * ++ * FIXME: It may be fragmented into multiple chunks ++ * at once if non-fragmentable extension headers ++ * are too large. ++ * --yoshfuji ++ */ ++ ++ if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP || ++ sk->sk_protocol == IPPROTO_RAW)) { ++ ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen); ++ return -EMSGSIZE; ++ } ++ ++ skb = skb_peek_tail(&sk->sk_write_queue); ++ cork->length += length; ++ if (((length > mtu) || ++ (skb && skb_has_frags(skb))) && ++ (sk->sk_protocol == IPPROTO_UDP) && ++ (rt->dst.dev->features & NETIF_F_UFO)) { ++ err = ip6_ufo_append_data(sk, getfrag, from, length, ++ hh_len, fragheaderlen, ++ transhdrlen, mtu, flags, rt); ++ if (err) ++ goto error; ++ return 0; ++ } ++ ++ if (!skb) ++ goto alloc_new_skb; ++ ++ while (length > 0) { ++ /* Check if the remaining data fits into current packet. */ ++ copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; ++ if (copy < length) ++ copy = maxfraglen - skb->len; ++ ++ if (copy <= 0) { ++ char *data; ++ unsigned int datalen; ++ unsigned int fraglen; ++ unsigned int fraggap; ++ unsigned int alloclen; ++alloc_new_skb: ++ /* There's no room in the current skb */ ++ if (skb) ++ fraggap = skb->len - maxfraglen; ++ else ++ fraggap = 0; ++ /* update mtu and maxfraglen if necessary */ ++ if (skb == NULL || skb_prev == NULL) ++ ip6_append_data_mtu(&mtu, &maxfraglen, ++ fragheaderlen, skb, rt, ++ np->pmtudisc == ++ IPV6_PMTUDISC_PROBE); ++ ++ skb_prev = skb; ++ ++ /* ++ * If remaining data exceeds the mtu, ++ * we know we need more fragment(s). ++ */ ++ datalen = length + fraggap; ++ ++ if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) ++ datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; ++ if ((flags & MSG_MORE) && ++ !(rt->dst.dev->features&NETIF_F_SG)) ++ alloclen = mtu; ++ else ++ alloclen = datalen + fragheaderlen; ++ ++ alloclen += dst_exthdrlen; ++ ++ if (datalen != length + fraggap) { ++ /* ++ * this is not the last fragment, the trailer ++ * space is regarded as data space. ++ */ ++ datalen += rt->dst.trailer_len; ++ } ++ ++ alloclen += rt->dst.trailer_len; ++ fraglen = datalen + fragheaderlen; ++ ++ /* ++ * We just reserve space for fragment header. ++ * Note: this may be overallocation if the message ++ * (without MSG_MORE) fits into the MTU. ++ */ ++ alloclen += sizeof(struct frag_hdr); ++ ++ if (transhdrlen) { ++ skb = sock_alloc_send_skb(sk, ++ alloclen + hh_len, ++ (flags & MSG_DONTWAIT), &err); ++ } else { ++ skb = NULL; ++ if (atomic_read(&sk->sk_wmem_alloc) <= ++ 2 * sk->sk_sndbuf) ++ skb = sock_wmalloc(sk, ++ alloclen + hh_len, 1, ++ sk->sk_allocation); ++ if (unlikely(skb == NULL)) ++ err = -ENOBUFS; ++ else { ++ /* Only the initial fragment ++ * is time stamped. 
++ */ ++ tx_flags = 0; ++ } ++ } ++ if (skb == NULL) ++ goto error; ++ /* ++ * Fill in the control structures ++ */ ++ skb->ip_summed = CHECKSUM_NONE; ++ skb->csum = 0; ++ /* reserve for fragmentation and ipsec header */ ++ skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + ++ dst_exthdrlen); ++ ++ if (sk->sk_type == SOCK_DGRAM) ++ skb_shinfo(skb)->tx_flags = tx_flags; ++ ++ /* ++ * Find where to start putting bytes ++ */ ++ data = skb_put(skb, fraglen); ++ skb_set_network_header(skb, exthdrlen); ++ data += fragheaderlen; ++ skb->transport_header = (skb->network_header + ++ fragheaderlen); ++ if (fraggap) { ++ skb->csum = skb_copy_and_csum_bits( ++ skb_prev, maxfraglen, ++ data + transhdrlen, fraggap, 0); ++ skb_prev->csum = csum_sub(skb_prev->csum, ++ skb->csum); ++ data += fraggap; ++ pskb_trim_unique(skb_prev, maxfraglen); ++ } ++ copy = datalen - transhdrlen - fraggap; ++ ++ if (copy < 0) { ++ err = -EINVAL; ++ kfree_skb(skb); ++ goto error; ++ } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { ++ err = -EFAULT; ++ kfree_skb(skb); ++ goto error; ++ } ++ ++ offset += copy; ++ length -= datalen - fraggap; ++ transhdrlen = 0; ++ exthdrlen = 0; ++ dst_exthdrlen = 0; ++ ++ /* ++ * Put the packet on the pending queue ++ */ ++ __skb_queue_tail(&sk->sk_write_queue, skb); ++ continue; ++ } ++ ++ if (copy > length) ++ copy = length; ++ ++ if (!(rt->dst.dev->features&NETIF_F_SG)) { ++ unsigned int off; ++ ++ off = skb->len; ++ if (getfrag(from, skb_put(skb, copy), ++ offset, copy, off, skb) < 0) { ++ __skb_trim(skb, off); ++ err = -EFAULT; ++ goto error; ++ } ++ } else { ++ int i = skb_shinfo(skb)->nr_frags; ++ struct page_frag *pfrag = sk_page_frag(sk); ++ ++ err = -ENOMEM; ++ if (!sk_page_frag_refill(sk, pfrag)) ++ goto error; ++ ++ if (!skb_can_coalesce(skb, i, pfrag->page, ++ pfrag->offset)) { ++ err = -EMSGSIZE; ++ if (i == MAX_SKB_FRAGS) ++ goto error; ++ ++ __skb_fill_page_desc(skb, i, pfrag->page, ++ pfrag->offset, 0); ++ skb_shinfo(skb)->nr_frags = ++i; ++ get_page(pfrag->page); ++ } ++ copy = min_t(int, copy, pfrag->size - pfrag->offset); ++ if (getfrag(from, ++ page_address(pfrag->page) + pfrag->offset, ++ offset, copy, skb->len, skb) < 0) ++ goto error_efault; ++ ++ pfrag->offset += copy; ++ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); ++ skb->len += copy; ++ skb->data_len += copy; ++ skb->truesize += copy; ++ atomic_add(copy, &sk->sk_wmem_alloc); ++ } ++ offset += copy; ++ length -= copy; ++ } ++ ++ return 0; ++ ++error_efault: ++ err = -EFAULT; ++error: ++ cork->length -= length; ++ IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); ++ return err; ++} ++EXPORT_SYMBOL_GPL(ip6_append_data); ++ ++static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) ++{ ++ if (np->cork.opt) { ++ kfree(np->cork.opt->dst0opt); ++ kfree(np->cork.opt->dst1opt); ++ kfree(np->cork.opt->hopopt); ++ kfree(np->cork.opt->srcrt); ++ kfree(np->cork.opt); ++ np->cork.opt = NULL; ++ } ++ ++ if (inet->cork.base.dst) { ++ dst_release(inet->cork.base.dst); ++ inet->cork.base.dst = NULL; ++ inet->cork.base.flags &= ~IPCORK_ALLFRAG; ++ } ++ memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); ++} ++ ++int ip6_push_pending_frames(struct sock *sk) ++{ ++ struct sk_buff *skb, *tmp_skb; ++ struct sk_buff **tail_skb; ++ struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; ++ struct inet_sock *inet = inet_sk(sk); ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ struct net *net = sock_net(sk); ++ struct ipv6hdr *hdr; ++ struct 
ipv6_txoptions *opt = np->cork.opt; ++ struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst; ++ struct flowi6 *fl6 = &inet->cork.fl.u.ip6; ++ unsigned char proto = fl6->flowi6_proto; ++ int err = 0; ++ ++ if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) ++ goto out; ++ tail_skb = &(skb_shinfo(skb)->frag_list); ++ ++ /* move skb->data to ip header from ext header */ ++ if (skb->data < skb_network_header(skb)) ++ __skb_pull(skb, skb_network_offset(skb)); ++ while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { ++ __skb_pull(tmp_skb, skb_network_header_len(skb)); ++ *tail_skb = tmp_skb; ++ tail_skb = &(tmp_skb->next); ++ skb->len += tmp_skb->len; ++ skb->data_len += tmp_skb->len; ++ skb->truesize += tmp_skb->truesize; ++ tmp_skb->destructor = NULL; ++ tmp_skb->sk = NULL; ++ } ++ ++ /* Allow local fragmentation. */ ++ if (np->pmtudisc < IPV6_PMTUDISC_DO) ++ skb->local_df = 1; ++ ++ *final_dst = fl6->daddr; ++ __skb_pull(skb, skb_network_header_len(skb)); ++ if (opt && opt->opt_flen) ++ ipv6_push_frag_opts(skb, opt, &proto); ++ if (opt && opt->opt_nflen) ++ ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); ++ ++ skb_push(skb, sizeof(struct ipv6hdr)); ++ skb_reset_network_header(skb); ++ hdr = ipv6_hdr(skb); ++ ++ ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel); ++ hdr->hop_limit = np->cork.hop_limit; ++ hdr->nexthdr = proto; ++ hdr->saddr = fl6->saddr; ++ hdr->daddr = *final_dst; ++ ++ skb->priority = sk->sk_priority; ++ skb->mark = sk->sk_mark; ++ ++ skb_dst_set(skb, dst_clone(&rt->dst)); ++ IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); ++ if (proto == IPPROTO_ICMPV6) { ++ struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); ++ ++ ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type); ++ ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); ++ } ++ ++ err = ip6_local_out(skb); ++ if (err) { ++ if (err > 0) ++ err = net_xmit_errno(err); ++ if (err) ++ goto error; ++ } ++ ++out: ++ ip6_cork_release(inet, np); ++ return err; ++error: ++ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); ++ goto out; ++} ++EXPORT_SYMBOL_GPL(ip6_push_pending_frames); ++ ++void ip6_flush_pending_frames(struct sock *sk) ++{ ++ struct sk_buff *skb; ++ ++ while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { ++ if (skb_dst(skb)) ++ IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), ++ IPSTATS_MIB_OUTDISCARDS); ++ kfree_skb(skb); ++ } ++ ++ ip6_cork_release(inet_sk(sk), inet6_sk(sk)); ++} ++EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); +diff -ruN linux-3.10.27/net/netfilter/core.c linux-3.10.27-imq/net/netfilter/core.c +--- linux-3.10.27/net/netfilter/core.c 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/netfilter/core.c 2014-01-18 10:19:59.349342984 +0100 +@@ -191,9 +191,11 @@ + ret = NF_DROP_GETERR(verdict); + if (ret == 0) + ret = -EPERM; +- } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { ++ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE || ++ (verdict & NF_VERDICT_MASK) == NF_IMQ_QUEUE) { + int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, +- verdict >> NF_VERDICT_QBITS); ++ verdict >> NF_VERDICT_QBITS, ++ verdict & NF_VERDICT_MASK); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; +diff -ruN linux-3.10.27/net/netfilter/Kconfig linux-3.10.27-imq/net/netfilter/Kconfig +--- linux-3.10.27/net/netfilter/Kconfig 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/netfilter/Kconfig 2014-01-18 10:19:59.349342984 +0100 +@@ -641,6 +641,18 @@ + + To compile it as a module, choose M here. 
If unsure, say N. + ++config NETFILTER_XT_TARGET_IMQ ++ tristate '"IMQ" target support' ++ depends on NETFILTER_XTABLES ++ depends on IP_NF_MANGLE || IP6_NF_MANGLE ++ select IMQ ++ default m if NETFILTER_ADVANCED=n ++ help ++ This option adds a `IMQ' target which is used to specify if and ++ to which imq device packets should get enqueued/dequeued. ++ ++ To compile it as a module, choose M here. If unsure, say N. ++ + config NETFILTER_XT_TARGET_MARK + tristate '"MARK" target support' + depends on NETFILTER_ADVANCED +diff -ruN linux-3.10.27/net/netfilter/Makefile linux-3.10.27-imq/net/netfilter/Makefile +--- linux-3.10.27/net/netfilter/Makefile 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/netfilter/Makefile 2014-01-18 10:19:59.349342984 +0100 +@@ -82,6 +82,7 @@ + obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o + obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o ++obj-$(CONFIG_NETFILTER_XT_TARGET_IMQ) += xt_IMQ.o + obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o + obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o + obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o +diff -ruN linux-3.10.27/net/netfilter/nf_internals.h linux-3.10.27-imq/net/netfilter/nf_internals.h +--- linux-3.10.27/net/netfilter/nf_internals.h 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/netfilter/nf_internals.h 2014-01-18 10:19:59.349342984 +0100 +@@ -29,7 +29,7 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum); ++ unsigned int queuenum, unsigned int queuetype); + extern int __init netfilter_queue_init(void); + + /* nf_log.c */ +diff -ruN linux-3.10.27/net/netfilter/nf_queue.c linux-3.10.27-imq/net/netfilter/nf_queue.c +--- linux-3.10.27/net/netfilter/nf_queue.c 2014-01-16 00:29:14.000000000 +0100 ++++ linux-3.10.27-imq/net/netfilter/nf_queue.c 2014-01-18 10:19:59.350342998 +0100 +@@ -27,6 +27,23 @@ + */ + static const struct nf_queue_handler __rcu *queue_handler __read_mostly; + ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++static const struct nf_queue_handler __rcu *queue_imq_handler __read_mostly; ++ ++void nf_register_queue_imq_handler(const struct nf_queue_handler *qh) ++{ ++ rcu_assign_pointer(queue_imq_handler, qh); ++} ++EXPORT_SYMBOL_GPL(nf_register_queue_imq_handler); ++ ++void nf_unregister_queue_imq_handler(void) ++{ ++ RCU_INIT_POINTER(queue_imq_handler, NULL); ++ synchronize_rcu(); ++} ++EXPORT_SYMBOL_GPL(nf_unregister_queue_imq_handler); ++#endif ++ + /* return EBUSY when somebody else is registered, return EEXIST if the + * same handler is registered, return 0 in case of success. */ + void nf_register_queue_handler(const struct nf_queue_handler *qh) +@@ -105,7 +122,8 @@ + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), +- unsigned int queuenum) ++ unsigned int queuenum, ++ unsigned int queuetype) + { + int status = -ENOENT; + struct nf_queue_entry *entry = NULL; +@@ -115,7 +133,17 @@ + /* QUEUE == DROP if no one is waiting, to be safe. 
*/ + rcu_read_lock(); + +- qh = rcu_dereference(queue_handler); ++ if (queuetype == NF_IMQ_QUEUE) { ++#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) ++ qh = rcu_dereference(queue_imq_handler); ++#else ++ BUG(); ++ goto err_unlock; ++#endif ++ } else { ++ qh = rcu_dereference(queue_handler); ++ } ++ + if (!qh) { + status = -ESRCH; + goto err_unlock; +@@ -205,9 +233,11 @@ + local_bh_enable(); + break; + case NF_QUEUE: ++ case NF_IMQ_QUEUE: + err = nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, +- verdict >> NF_VERDICT_QBITS); ++ verdict >> NF_VERDICT_QBITS, ++ verdict & NF_VERDICT_MASK); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; +diff -ruN linux-3.10.27/net/netfilter/xt_IMQ.c linux-3.10.27-imq/net/netfilter/xt_IMQ.c +--- linux-3.10.27/net/netfilter/xt_IMQ.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-3.10.27-imq/net/netfilter/xt_IMQ.c 2014-01-18 10:19:59.350342998 +0100 +@@ -0,0 +1,72 @@ ++/* ++ * This target marks packets to be enqueued to an imq device ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++static unsigned int imq_target(struct sk_buff *pskb, ++ const struct xt_action_param *par) ++{ ++ const struct xt_imq_info *mr = par->targinfo; ++ ++ pskb->imq_flags = (mr->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE; ++ ++ return XT_CONTINUE; ++} ++ ++static int imq_checkentry(const struct xt_tgchk_param *par) ++{ ++ struct xt_imq_info *mr = par->targinfo; ++ ++ if (mr->todev > IMQ_MAX_DEVS - 1) { ++ pr_warn("IMQ: invalid device specified, highest is %u\n", ++ IMQ_MAX_DEVS - 1); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static struct xt_target xt_imq_reg[] __read_mostly = { ++ { ++ .name = "IMQ", ++ .family = AF_INET, ++ .checkentry = imq_checkentry, ++ .target = imq_target, ++ .targetsize = sizeof(struct xt_imq_info), ++ .table = "mangle", ++ .me = THIS_MODULE ++ }, ++ { ++ .name = "IMQ", ++ .family = AF_INET6, ++ .checkentry = imq_checkentry, ++ .target = imq_target, ++ .targetsize = sizeof(struct xt_imq_info), ++ .table = "mangle", ++ .me = THIS_MODULE ++ }, ++}; ++ ++static int __init imq_init(void) ++{ ++ return xt_register_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); ++} ++ ++static void __exit imq_fini(void) ++{ ++ xt_unregister_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg)); ++} ++ ++module_init(imq_init); ++module_exit(imq_fini); ++ ++MODULE_AUTHOR("http://www.linuximq.net"); ++MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See http://www.linuximq.net/ for more information."); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("ipt_IMQ"); ++MODULE_ALIAS("ip6t_IMQ"); ++ -- 2.39.2
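
Editor's sketch (not part of the patch): the crash fixed here (#10474) is an ordering problem. With IMQ in use, the NF_INET_POST_ROUTING hook can divert the skb into an IMQ device's transmit path, which ends up in netif_skb_features() and dereferences skb->dev (and reads skb->protocol) before ip6_finish_output2() has run. The function below only condenses the patched ip6_output() from the ip6_output.c hunk above to make that ordering visible; the name is changed for illustration and it assumes the includes already present in net/ipv6/ip6_output.c.

/*
 * Condensed from the patched net/ipv6/ip6_output.c: skb->protocol and
 * skb->dev must be valid *before* the POST_ROUTING hook fires, because
 * an IMQ hook may hand the skb to a device xmit path that calls
 * netif_skb_features(skb).
 */
static int ip6_output_sketch(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	/* moved up from ip6_finish_output2() by this patch */
	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}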
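
For readers tracing the netfilter plumbing: the xt_IMQ target above only tags the packet (skb->imq_flags); it does not queue anything itself. The diversion happens when the IMQ device's own PRE-/POSTROUTING hooks return the NF_IMQ_QUEUE verdict, which the patched nf_queue() routes to the IMQ handler instead of the regular NFQUEUE handler. The hook below is a minimal sketch under that assumption, not the verbatim body of drivers/net/imq.c; the rule in the comment needs the separate IMQ iptables userspace extension and is shown only as an example.

/* Sketch of an IMQ-style hook (3.10 nf_hookfn signature): packets that
 * xt_IMQ tagged with IMQ_F_ENQUEUE get the NF_IMQ_QUEUE verdict, all
 * others pass through.  Relies on skb->imq_flags, IMQ_F_ENQUEUE and
 * NF_IMQ_QUEUE as introduced elsewhere in this patch.
 *
 * Userspace side (with the IMQ iptables extension installed), e.g.:
 *   iptables -t mangle -A PREROUTING -i ppp0 -j IMQ --todev 0
 */
static unsigned int imq_hook_sketch(unsigned int hooknum, struct sk_buff *skb,
				    const struct net_device *in,
				    const struct net_device *out,
				    int (*okfn)(struct sk_buff *))
{
	return (skb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT;
}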
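
The nf_queue.c hunk adds a second handler slot (queue_imq_handler) together with nf_register_queue_imq_handler()/nf_unregister_queue_imq_handler(), so NF_IMQ_QUEUE verdicts can coexist with ordinary NFQUEUE users. Below is a minimal sketch of claiming that slot, assuming the 3.10 struct nf_queue_handler layout (a single .outfn callback) and the declarations added elsewhere in this patch; the real imq.c enqueues the entry on the chosen imqN device and only calls nf_reinject() from its transmit path.

#include <linux/module.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_queue.h>

/* Sketch only: accept every packet handed to the IMQ slot immediately. */
static int imq_outfn_sketch(struct nf_queue_entry *entry, unsigned int queuenum)
{
	nf_reinject(entry, NF_ACCEPT);
	return 0;
}

static const struct nf_queue_handler imq_handler_sketch = {
	.outfn	= imq_outfn_sketch,
};

static int __init imq_sketch_init(void)
{
	nf_register_queue_imq_handler(&imq_handler_sketch);
	return 0;
}

static void __exit imq_sketch_exit(void)
{
	nf_unregister_queue_imq_handler();
}

module_init(imq_sketch_init);
module_exit(imq_sketch_exit);
MODULE_LICENSE("GPL");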