git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
3.4-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Wed, 5 Mar 2014 19:50:15 +0000 (11:50 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Wed, 5 Mar 2014 19:50:15 +0000 (11:50 -0800)
added patches:
xen-blkback-check-for-insane-amounts-of-request-on-the-ring-v6.patch
xen-boot-disable-bios-smp-mp-table-search.patch
xen-events-mask-events-when-changing-their-vcpu-binding.patch
xen-io-ring.h-new-macro-to-detect-whether-there-are-too-many-requests-on-the-ring.patch
xen-netback-coalesce-slots-in-tx-path-and-fix-regressions.patch
xen-netback-don-t-disconnect-frontend-when-seeing-oversize-packet.patch
xen-netback-fix-sparse-warning.patch
xen-smp-fix-leakage-of-timer-interrupt-line-for-every-cpu-online-offline.patch
xen-smp-spinlock-fix-leakage-of-the-spinlock-interrupt-line-for-every-cpu-online-offline.patch

queue-3.4/series
queue-3.4/xen-blkback-check-for-insane-amounts-of-request-on-the-ring-v6.patch [new file with mode: 0644]
queue-3.4/xen-boot-disable-bios-smp-mp-table-search.patch [new file with mode: 0644]
queue-3.4/xen-events-mask-events-when-changing-their-vcpu-binding.patch [new file with mode: 0644]
queue-3.4/xen-io-ring.h-new-macro-to-detect-whether-there-are-too-many-requests-on-the-ring.patch [new file with mode: 0644]
queue-3.4/xen-netback-coalesce-slots-in-tx-path-and-fix-regressions.patch [new file with mode: 0644]
queue-3.4/xen-netback-don-t-disconnect-frontend-when-seeing-oversize-packet.patch [new file with mode: 0644]
queue-3.4/xen-netback-fix-sparse-warning.patch [new file with mode: 0644]
queue-3.4/xen-smp-fix-leakage-of-timer-interrupt-line-for-every-cpu-online-offline.patch [new file with mode: 0644]
queue-3.4/xen-smp-spinlock-fix-leakage-of-the-spinlock-interrupt-line-for-every-cpu-online-offline.patch [new file with mode: 0644]

index 4bd231e7e022589fabf05bc6cf5a6de4ae96683b..be8abb533e5d8938ef08c2df5a36524f1ff476c7 100644 (file)
@@ -65,3 +65,12 @@ nbd-correct-disconnect-behavior.patch
 block-don-t-access-request-after-it-might-be-freed.patch
 ext4-return-enomem-if-sb_getblk-fails.patch
 saa7134-fix-unlocked-snd_pcm_stop-call.patch
+xen-boot-disable-bios-smp-mp-table-search.patch
+xen-smp-fix-leakage-of-timer-interrupt-line-for-every-cpu-online-offline.patch
+xen-smp-spinlock-fix-leakage-of-the-spinlock-interrupt-line-for-every-cpu-online-offline.patch
+xen-netback-fix-sparse-warning.patch
+xen-netback-coalesce-slots-in-tx-path-and-fix-regressions.patch
+xen-netback-don-t-disconnect-frontend-when-seeing-oversize-packet.patch
+xen-io-ring.h-new-macro-to-detect-whether-there-are-too-many-requests-on-the-ring.patch
+xen-blkback-check-for-insane-amounts-of-request-on-the-ring-v6.patch
+xen-events-mask-events-when-changing-their-vcpu-binding.patch
diff --git a/queue-3.4/xen-blkback-check-for-insane-amounts-of-request-on-the-ring-v6.patch b/queue-3.4/xen-blkback-check-for-insane-amounts-of-request-on-the-ring-v6.patch
new file mode 100644 (file)
index 0000000..421723b
--- /dev/null
@@ -0,0 +1,207 @@
+From 9371cadbbcc7c00c81753b9727b19fb3bc74d458 Mon Sep 17 00:00:00 2001
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Date: Wed, 23 Jan 2013 16:54:32 -0500
+Subject: xen/blkback: Check for insane amounts of request on the ring (v6).
+
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+
+commit 9371cadbbcc7c00c81753b9727b19fb3bc74d458 upstream.
+
+commit 8e3f8755545cc4a7f4da8e9ef76d6d32e0dca576 upstream.
+
+Check that the ring does not have an insane amount of requests
+(more than could fit on the ring).
+
+If we detect this case we will stop processing the requests
+and wait until the XenBus disconnects the ring.
+
+The existing check, RING_REQUEST_CONS_OVERFLOW, which compares how
+many responses we have created in the past (rsp_prod_pvt) against the
+requests consumed (req_cons) and tests whether said difference is
+greater than or equal to the size of the ring, does not catch this case.
+
+What that condition does check is whether there is a need to process
+more, as we still have a backlog of responses to finish. Note that both
+of those values (rsp_prod_pvt and req_cons) are not exposed on the
+shared ring.
+
+To understand this problem a mini crash course in ring protocol
+response/request updates is in order.
+
+There are four entries: req_prod and rsp_prod; req_event and rsp_event
+to track the ring entries. We are only concerned about the first two -
+which set the tone of this bug.
+
+The req_prod is a value incremented by frontend for each request put
+on the ring. Conversely the rsp_prod is a value incremented by the backend
+for each response put on the ring (rsp_prod gets set by rsp_prod_pvt when
+pushing the responses on the ring).  Both values can
+wrap and are modulo the size of the ring (in block case that is 32).
+Please see RING_GET_REQUEST and RING_GET_RESPONSE for the more details.
+
+The culprit here is that if the difference between the
+req_prod and req_cons is greater than the ring size we have a problem.
+Fortunately for us, the '__do_block_io_op' loop:
+
+       rc = blk_rings->common.req_cons;
+       rp = blk_rings->common.sring->req_prod;
+
+       while (rc != rp) {
+
+               ..
+               blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+       }
+
+will loop up to the point when rc == rp. The macro inside the
+loop (RING_GET_REQUEST) is smart and indexes based on the modulo
+of the ring size. If the frontend has provided a bogus req_prod value
+we will loop until rc == rp - which means we could repeatedly be
+processing already processed requests (or responses).
+
+The reason the RING_REQUEST_CONS_OVERFLOW is not helping here is
+b/c it only tracks how many responses we have internally produced
+and whether we should process more. The astute reader will
+notice that the macro RING_REQUEST_CONS_OVERFLOW provides two
+arguments - more on this later.
+
+For example, if we were to enter this function with these values:
+
+               blk_rings->common.sring->req_prod =  X+31415 (X is the value from
+               the last time __do_block_io_op was called).
+        blk_rings->common.req_cons = X
+        blk_rings->common.rsp_prod_pvt = X
+
+The RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, blk_rings->common.req_cons)
+is doing:
+
+       req_cons - rsp_prod_pvt >= 32
+
+Which is,
+       X - X >= 32 or 0 >= 32
+
+And that is false, so we continue on looping (this bug).
+
+If we re-use said macro RING_REQUEST_CONS_OVERFLOW and pass in rp
+(sring->req_prod) instead of rc, then this macro can do the check:
+
+     req_prod - rsp_prod_pvt >= 32
+
+Which is,
+       X + 31415 - X >= 32 , or 31415 >= 32
+
+which is true, so we can error out and break out of the function.
+
+Unfortunately the difference between rsp_prod_pvt and req_prod can
+legitimately be 32 (which would error out in the macro). This condition
+exists when the backend is lagging behind with the responses and still
+has not finished responding to all of them (so make_response has not
+been called), and rsp_prod_pvt + 32 == req_cons. This ends up with us
+not being able to use said macro.
+
+Hence we introduce a new macro called RING_REQUEST_PROD_OVERFLOW which does
+a simple check of:
+
+    req_prod - rsp_prod_pvt > RING_SIZE
+
+And with the X values from above:
+
+   X + 31415 - X > 32
+
+Returns true. Also note that if the ring is full (which is when
+RING_REQUEST_CONS_OVERFLOW would trigger), we would not hit the
+same condition:
+
+   X + 32 - X > 32
+
+Which is false.
+
+Let's use that macro.
+Note that in v5 of this patchset the macro was different - we used an
+earlier version.
+
+[v1: Move the check outside the loop]
+[v2: Add a pr_warn as suggested by David]
+[v3: Use RING_REQUEST_CONS_OVERFLOW as suggested by Jan]
+[v4: Move wake_up after kthread_stop as suggested by Jan]
+[v5: Use RING_REQUEST_PROD_OVERFLOW instead]
+[v6: Use RING_REQUEST_PROD_OVERFLOW - Jan's version]
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/xen-blkback/blkback.c |   13 ++++++++++++-
+ drivers/block/xen-blkback/common.h  |    2 ++
+ drivers/block/xen-blkback/xenbus.c  |    2 ++
+ 3 files changed, 16 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/xen-blkback/blkback.c
++++ b/drivers/block/xen-blkback/blkback.c
+@@ -274,6 +274,7 @@ int xen_blkif_schedule(void *arg)
+ {
+       struct xen_blkif *blkif = arg;
+       struct xen_vbd *vbd = &blkif->vbd;
++      int ret;
+       xen_blkif_get(blkif);
+@@ -294,8 +295,12 @@ int xen_blkif_schedule(void *arg)
+               blkif->waiting_reqs = 0;
+               smp_mb(); /* clear flag *before* checking for work */
+-              if (do_block_io_op(blkif))
++              ret = do_block_io_op(blkif);
++              if (ret > 0)
+                       blkif->waiting_reqs = 1;
++              if (ret == -EACCES)
++                      wait_event_interruptible(blkif->shutdown_wq,
++                                               kthread_should_stop());
+               if (log_stats && time_after(jiffies, blkif->st_print))
+                       print_stats(blkif);
+@@ -531,6 +536,12 @@ __do_block_io_op(struct xen_blkif *blkif
+       rp = blk_rings->common.sring->req_prod;
+       rmb(); /* Ensure we see queued requests up to 'rp'. */
++      if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
++              rc = blk_rings->common.rsp_prod_pvt;
++              pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
++                      rp, rc, rp - rc, blkif->vbd.pdevice);
++              return -EACCES;
++      }
+       while (rc != rp) {
+               if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+--- a/drivers/block/xen-blkback/common.h
++++ b/drivers/block/xen-blkback/common.h
+@@ -216,6 +216,8 @@ struct xen_blkif {
+       int                     st_wr_sect;
+       wait_queue_head_t       waiting_to_free;
++      /* Thread shutdown wait queue. */
++      wait_queue_head_t       shutdown_wq;
+ };
+--- a/drivers/block/xen-blkback/xenbus.c
++++ b/drivers/block/xen-blkback/xenbus.c
+@@ -118,6 +118,7 @@ static struct xen_blkif *xen_blkif_alloc
+       atomic_set(&blkif->drain, 0);
+       blkif->st_print = jiffies;
+       init_waitqueue_head(&blkif->waiting_to_free);
++      init_waitqueue_head(&blkif->shutdown_wq);
+       return blkif;
+ }
+@@ -178,6 +179,7 @@ static void xen_blkif_disconnect(struct
+ {
+       if (blkif->xenblkd) {
+               kthread_stop(blkif->xenblkd);
++              wake_up(&blkif->shutdown_wq);
+               blkif->xenblkd = NULL;
+       }
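
The macro arithmetic walked through in the commit message above is easy to try
in isolation. The following is a minimal standalone sketch, not the kernel
code: RING_SIZE, the helper names and the concrete value chosen for X are
assumptions for illustration, mirroring include/xen/interface/io/ring.h only
in spirit. It shows why the old consumer-side check misses a bogus req_prod,
why the new producer-side check catches it, and why a legitimately full ring
still passes.

/*
 * Standalone sketch of the two ring checks; compile with any C compiler.
 * Prints 0, 1, 0 for the three checks below.
 */
#include <stdio.h>

#define RING_SIZE 32u

/* Existing check: is there a backlog of responses still to be pushed? */
static int cons_overflow(unsigned int cons, unsigned int rsp_prod_pvt)
{
	return (cons - rsp_prod_pvt) >= RING_SIZE;
}

/* New check: has the frontend advertised more requests than can fit? */
static int prod_overflow(unsigned int prod, unsigned int rsp_prod_pvt)
{
	return (prod - rsp_prod_pvt) > RING_SIZE;
}

int main(void)
{
	unsigned int x = 1000;                  /* "X" from the example  */
	unsigned int req_cons = x;
	unsigned int rsp_prod_pvt = x;
	unsigned int req_prod = x + 31415;      /* bogus frontend value  */

	/* 0 >= 32 is false: the old check keeps looping (the bug).      */
	printf("CONS_OVERFLOW(req_cons):  %d\n",
	       cons_overflow(req_cons, rsp_prod_pvt));
	/* 31415 > 32 is true: the new check halts ring processing.      */
	printf("PROD_OVERFLOW(req_prod):  %d\n",
	       prod_overflow(req_prod, rsp_prod_pvt));
	/* A legitimately full ring, 32 > 32, is false and still allowed. */
	printf("PROD_OVERFLOW(full ring): %d\n",
	       prod_overflow(x + RING_SIZE, rsp_prod_pvt));
	return 0;
}
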
diff --git a/queue-3.4/xen-boot-disable-bios-smp-mp-table-search.patch b/queue-3.4/xen-boot-disable-bios-smp-mp-table-search.patch
new file mode 100644 (file)
index 0000000..ab243f7
--- /dev/null
@@ -0,0 +1,92 @@
+From 0a193b148d6d0ed05ea82f466b6d7eac50b87ac5 Mon Sep 17 00:00:00 2001
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Date: Wed, 19 Sep 2012 08:30:55 -0400
+Subject: xen/boot: Disable BIOS SMP MP table search.
+
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+
+commit bd49940a35ec7d488ae63bd625639893b3385b97 upstream.
+
+As the initial domain we are able to search/map certain regions
+of memory to harvest configuration data. For all low-level we
+use ACPI tables - for interrupts we use exclusively ACPI _PRT
+(so DSDT) and MADT for INT_SRC_OVR.
+
+The SMP MP table is not used at all. As a matter of fact we do
+not even support machines that only have SMP MP but no ACPI tables.
+
+Let's follow how Moorestown does it and just disable searching
+for BIOS SMP tables.
+
+This also fixes an issue on HP Proliant BL680c G5 and DL380 G6:
+
+9f->100 for 1:1 PTE
+Freeing 9f-100 pfn range: 97 pages freed
+1-1 mapping on 9f->100
+.. snip..
+e820: BIOS-provided physical RAM map:
+Xen: [mem 0x0000000000000000-0x000000000009efff] usable
+Xen: [mem 0x000000000009f400-0x00000000000fffff] reserved
+Xen: [mem 0x0000000000100000-0x00000000cfd1dfff] usable
+.. snip..
+Scan for SMP in [mem 0x00000000-0x000003ff]
+Scan for SMP in [mem 0x0009fc00-0x0009ffff]
+Scan for SMP in [mem 0x000f0000-0x000fffff]
+found SMP MP-table at [mem 0x000f4fa0-0x000f4faf] mapped at [ffff8800000f4fa0]
+(XEN) mm.c:908:d0 Error getting mfn 100 (pfn 5555555555555555) from L1 entry 0000000000100461 for l1e_owner=0, pg_owner=0
+(XEN) mm.c:4995:d0 ptwr_emulate: could not get_page_from_l1e()
+BUG: unable to handle kernel NULL pointer dereference at           (null)
+IP: [<ffffffff81ac07e2>] xen_set_pte_init+0x66/0x71
+. snip..
+Pid: 0, comm: swapper Not tainted 3.6.0-rc6upstream-00188-gb6fb969-dirty #2 HP ProLiant BL680c G5
+.. snip..
+Call Trace:
+ [<ffffffff81ad31c6>] __early_ioremap+0x18a/0x248
+ [<ffffffff81624731>] ? printk+0x48/0x4a
+ [<ffffffff81ad32ac>] early_ioremap+0x13/0x15
+ [<ffffffff81acc140>] get_mpc_size+0x2f/0x67
+ [<ffffffff81acc284>] smp_scan_config+0x10c/0x136
+ [<ffffffff81acc2e4>] default_find_smp_config+0x36/0x5a
+ [<ffffffff81ac3085>] setup_arch+0x5b3/0xb5b
+ [<ffffffff81624731>] ? printk+0x48/0x4a
+ [<ffffffff81abca7f>] start_kernel+0x90/0x390
+ [<ffffffff81abc356>] x86_64_start_reservations+0x131/0x136
+ [<ffffffff81abfa83>] xen_start_kernel+0x65f/0x661
+(XEN) Domain 0 crashed: 'noreboot' set - not rebooting.
+
+which is that ioremap would end up mapping 0xff using _PAGE_IOMAP
+(which is what early_ioremap sticks as a flag) - which meant
+we would get MFN 0xFF (pte ff461, which is OK), and then it would
+also map 0x100 (b/c ioremap tries to get page aligned request, and
+it was trying to map 0xf4fa0 + PAGE_SIZE - so it mapped the next page)
+as _PAGE_IOMAP. Since 0x100 is actually a RAM page, and the _PAGE_IOMAP
+bypasses the P2M lookup we would happily set the PTE to 1000461.
+Xen would deny the request since we do not have access to the
+Machine Frame Number (MFN) of 0x100. The P2M[0x100] is for example
+0x80140.
+
+Fixes-Oracle-Bugzilla: https://bugzilla.oracle.com/bugzilla/show_bug.cgi?id=13665
+Acked-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/xen/enlighten.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/x86/xen/enlighten.c
++++ b/arch/x86/xen/enlighten.c
+@@ -1448,6 +1448,10 @@ asmlinkage void __init xen_start_kernel(
+               /* Make sure ACS will be enabled */
+               pci_request_acs();
++
++              /* Avoid searching for BIOS MP tables */
++              x86_init.mpparse.find_smp_config = x86_init_noop;
++              x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+       }
+ #ifdef CONFIG_PCI
+       /* PCI BIOS service won't work from a PV guest. */
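
As a rough illustration of the mechanism the one-hunk fix above relies on: x86
boot code reaches firmware probing only through a table of overridable hooks,
so a paravirtualized platform can replace the MP-table probe hooks with no-ops
and the legacy BIOS areas are never touched. The struct, field and function
names below are invented for this sketch; only the idea of swapping in no-ops
mirrors the two x86_init.mpparse assignments in the patch.

/*
 * Illustrative sketch (not kernel code) of overriding firmware-probe hooks.
 */
#include <stdio.h>

struct mpparse_hooks {
	void (*find_smp_config)(void);
	void (*get_smp_config)(unsigned int early);
};

static void default_find_smp_config(void)
{
	puts("scanning low memory for the MP table");
}

static void default_get_smp_config(unsigned int early)
{
	printf("parsing MP table (early=%u)\n", early);
}

static void noop(void)                { }
static void uint_noop(unsigned int u) { (void)u; }

static struct mpparse_hooks mpparse = {
	.find_smp_config = default_find_smp_config,
	.get_smp_config  = default_get_smp_config,
};

int main(void)
{
	int running_as_xen_dom0 = 1;    /* pretend we booted as the initial domain */

	if (running_as_xen_dom0) {
		/* Equivalent in spirit to the two x86_init.mpparse assignments. */
		mpparse.find_smp_config = noop;
		mpparse.get_smp_config  = uint_noop;
	}

	mpparse.find_smp_config();      /* now does nothing */
	mpparse.get_smp_config(0);      /* now does nothing */
	return 0;
}
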
diff --git a/queue-3.4/xen-events-mask-events-when-changing-their-vcpu-binding.patch b/queue-3.4/xen-events-mask-events-when-changing-their-vcpu-binding.patch
new file mode 100644 (file)
index 0000000..5a8548c
--- /dev/null
@@ -0,0 +1,72 @@
+From 5e72fdb8d827560893642e85a251d339109a00f4 Mon Sep 17 00:00:00 2001
+From: David Vrabel <david.vrabel@citrix.com>
+Date: Thu, 15 Aug 2013 13:21:07 +0100
+Subject: xen/events: mask events when changing their VCPU binding
+
+From: David Vrabel <david.vrabel@citrix.com>
+
+commit 5e72fdb8d827560893642e85a251d339109a00f4 upstream.
+
+commit 4704fe4f03a5ab27e3c36184af85d5000e0f8a48 upstream.
+
+When an event is being bound to a VCPU there is a window between the
+EVTCHNOP_bind_vcpu call and the adjustment of the local per-cpu masks
+where an event may be lost.  The hypervisor upcalls the new VCPU but
+the kernel thinks that event is still bound to the old VCPU and
+ignores it.
+
+There is even a problem when the event is being bound to the same VCPU
+as there is a small window between the clear_bit() and set_bit() calls
+in bind_evtchn_to_cpu().  When scanning for pending events, the kernel
+may read the bit when it is momentarily clear and ignore the event.
+
+Avoid this by masking the event during the whole bind operation.
+
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+[bwh: Backported to 3.2: remove the BM() cast]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/events.c |   11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/drivers/xen/events.c
++++ b/drivers/xen/events.c
+@@ -1422,8 +1422,10 @@ void rebind_evtchn_irq(int evtchn, int i
+ /* Rebind an evtchn so that it gets delivered to a specific cpu */
+ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+ {
++      struct shared_info *s = HYPERVISOR_shared_info;
+       struct evtchn_bind_vcpu bind_vcpu;
+       int evtchn = evtchn_from_irq(irq);
++      int masked;
+       if (!VALID_EVTCHN(evtchn))
+               return -1;
+@@ -1440,6 +1442,12 @@ static int rebind_irq_to_cpu(unsigned ir
+       bind_vcpu.vcpu = tcpu;
+       /*
++       * Mask the event while changing the VCPU binding to prevent
++       * it being delivered on an unexpected VCPU.
++       */
++      masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
++
++      /*
+        * If this fails, it usually just indicates that we're dealing with a
+        * virq or IPI channel, which don't actually need to be rebound. Ignore
+        * it, but don't do the xenlinux-level rebind in that case.
+@@ -1447,6 +1455,9 @@ static int rebind_irq_to_cpu(unsigned ir
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+               bind_evtchn_to_cpu(evtchn, tcpu);
++      if (!masked)
++              unmask_evtchn(evtchn);
++
+       return 0;
+ }
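
The fix above is essentially "remember the mask state, mask across the race
window, then restore". Below is a standalone sketch of that pattern with a toy
bit array and invented helper names; the kernel variant operates on the
shared_info evtchn_mask via sync_test_and_set_bit(), as the hunk shows.

/*
 * Standalone sketch of mask-around-rebind; not the Xen event channel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_EVTCHN 64
static bool evtchn_mask[NR_EVTCHN];

static bool test_and_set_mask(int port)
{
	bool was_masked = evtchn_mask[port];

	evtchn_mask[port] = true;
	return was_masked;
}

static void unmask(int port)
{
	evtchn_mask[port] = false;
}

static void bind_port_to_cpu(int port, int cpu)
{
	/* The window where an upcall on the new VCPU could otherwise be lost. */
	printf("binding port %d to cpu %d\n", port, cpu);
}

int main(void)
{
	int port = 5;
	bool masked = test_and_set_mask(port);  /* mask across the window */

	bind_port_to_cpu(port, 1);
	if (!masked)                            /* restore the previous state */
		unmask(port);
	return 0;
}
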
diff --git a/queue-3.4/xen-io-ring.h-new-macro-to-detect-whether-there-are-too-many-requests-on-the-ring.patch b/queue-3.4/xen-io-ring.h-new-macro-to-detect-whether-there-are-too-many-requests-on-the-ring.patch
new file mode 100644 (file)
index 0000000..16e389a
--- /dev/null
@@ -0,0 +1,42 @@
+From 79c4d036e08cdcd9403047a37cbc9e37b5ee86b4 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Mon, 17 Jun 2013 15:16:33 -0400
+Subject: xen/io/ring.h: new macro to detect whether there are too many requests on the ring
+
+From: Jan Beulich <jbeulich@suse.com>
+
+commit 8d9256906a97c24e97e016482b9be06ea2532b05 upstream.
+
+Backends may need to protect themselves against an insane number of
+produced requests stored by a frontend, in case they iterate over
+requests until reaching the req_prod value. There can't be more
+requests on the ring than the difference between produced requests
+and produced (but possibly not yet published) responses.
+
+This is a more strict alternative to a patch previously posted by
+Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/xen/interface/io/ring.h |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/include/xen/interface/io/ring.h
++++ b/include/xen/interface/io/ring.h
+@@ -188,6 +188,11 @@ struct __name##_back_ring {                                               \
+ #define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                         \
+     (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
++/* Ill-behaved frontend determination: Can there be this many requests? */
++#define RING_REQUEST_PROD_OVERFLOW(_r, _prod)               \
++    (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))
++
++
+ #define RING_PUSH_REQUESTS(_r) do {                                   \
+     wmb(); /* back sees requests /before/ updated producer index */   \
+     (_r)->sring->req_prod = (_r)->req_prod_pvt;                               \
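
One subtlety behind the new macro: the producer/consumer indices are
free-running 32-bit counters (only the array indexing is reduced modulo the
ring size), so the macro compares a difference rather than absolute positions
and keeps working across integer wrap-around. A tiny standalone demonstration,
with a RING_SIZE of 32 assumed to match the block ring:

/*
 * Difference-based overflow check across index wrap-around (illustration).
 */
#include <stdio.h>

#define RING_SIZE 32u

int main(void)
{
	unsigned int rsp_prod_pvt = 0xFFFFFFF0u;    /* about to wrap     */
	unsigned int req_prod = rsp_prod_pvt + 20u; /* wraps to 0x4      */

	/* 20 requests outstanding despite the wrap: the difference is exact. */
	printf("outstanding = %u\n", req_prod - rsp_prod_pvt);
	printf("overflow?   = %d\n", (req_prod - rsp_prod_pvt) > RING_SIZE);
	return 0;
}
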
diff --git a/queue-3.4/xen-netback-coalesce-slots-in-tx-path-and-fix-regressions.patch b/queue-3.4/xen-netback-coalesce-slots-in-tx-path-and-fix-regressions.patch
new file mode 100644 (file)
index 0000000..5cdcdf0
--- /dev/null
@@ -0,0 +1,545 @@
+From bccc108d67244797827c61870e12d84f66a212fe Mon Sep 17 00:00:00 2001
+From: Wei Liu <wei.liu2@citrix.com>
+Date: Mon, 22 Apr 2013 02:20:42 +0000
+Subject: xen-netback: coalesce slots in TX path and fix regressions
+
+From: Wei Liu <wei.liu2@citrix.com>
+
+commit 2810e5b9a7731ca5fce22bfbe12c96e16ac44b6f upstream.
+
+This patch tries to coalesce tx requests when constructing grant copy
+structures. It enables netback to deal with the situation when the
+frontend's MAX_SKB_FRAGS is larger than the backend's MAX_SKB_FRAGS.
+
+With the help of coalescing, this patch tries to address two regressions
+while avoiding reopening the security hole in XSA-39.
+
+Regression 1. The reduction of the number of supported ring entries (slots)
+per packet (from 18 to 17). This regression has been around for some time but
+remained unnoticed until the XSA-39 security fix. This is fixed by coalescing
+slots.
+
+Regression 2. The XSA-39 security fix turned "too many frags" errors from
+just dropping the packet into a fatal error that disables the VIF. This is
+fixed by coalescing slots (handling 18 slots when the backend's MAX_SKB_FRAGS
+is 17), which rules out false positives (using 18 slots is legitimate), and
+by dropping packets that use 19 to max_skb_slots slots.
+
+To avoid reopening the security hole in XSA-39, a frontend sending a packet
+using more than max_skb_slots slots is considered malicious.
+
+The behavior of netback for a packet is thus:
+
+    1-18            slots: valid
+   19-max_skb_slots slots: drop and respond with an error
+   max_skb_slots+   slots: fatal error
+
+max_skb_slots is configurable by admin, default value is 20.
+
+Also change variable name from "frags" to "slots" in netbk_count_requests.
+
+Please note that the RX path still has a dependency on MAX_SKB_FRAGS. This
+will be fixed with a separate patch.
+
+Signed-off-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: Ian Campbell <ian.campbell@citrix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/xen-netback/netback.c |  273 +++++++++++++++++++++++++++++++-------
+ include/xen/interface/io/netif.h  |   18 ++
+ 2 files changed, 241 insertions(+), 50 deletions(-)
+
+--- a/drivers/net/xen-netback/netback.c
++++ b/drivers/net/xen-netback/netback.c
+@@ -46,11 +46,25 @@
+ #include <asm/xen/hypercall.h>
+ #include <asm/xen/page.h>
++/*
++ * This is the maximum slots a skb can have. If a guest sends a skb
++ * which exceeds this limit it is considered malicious.
++ */
++#define MAX_SKB_SLOTS_DEFAULT 20
++static unsigned int max_skb_slots = MAX_SKB_SLOTS_DEFAULT;
++module_param(max_skb_slots, uint, 0444);
++
++typedef unsigned int pending_ring_idx_t;
++#define INVALID_PENDING_RING_IDX (~0U)
++
+ struct pending_tx_info {
+-      struct xen_netif_tx_request req;
++      struct xen_netif_tx_request req; /* coalesced tx request */
+       struct xenvif *vif;
++      pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
++                                * if it is head of one or more tx
++                                * reqs
++                                */
+ };
+-typedef unsigned int pending_ring_idx_t;
+ struct netbk_rx_meta {
+       int id;
+@@ -101,7 +115,11 @@ struct xen_netbk {
+       atomic_t netfront_count;
+       struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+-      struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
++      /* Coalescing tx requests before copying makes number of grant
++       * copy ops greater or equal to number of slots required. In
++       * worst case a tx request consumes 2 gnttab_copy.
++       */
++      struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
+       u16 pending_ring[MAX_PENDING_REQS];
+@@ -117,6 +135,16 @@ struct xen_netbk {
+ static struct xen_netbk *xen_netbk;
+ static int xen_netbk_group_nr;
++/*
++ * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
++ * one or more merged tx requests, otherwise it is the continuation of
++ * previous tx request.
++ */
++static inline int pending_tx_is_head(struct xen_netbk *netbk, RING_IDX idx)
++{
++      return netbk->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
++}
++
+ void xen_netbk_add_xenvif(struct xenvif *vif)
+ {
+       int i;
+@@ -249,6 +277,7 @@ static int max_required_rx_slots(struct
+ {
+       int max = DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
++      /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */
+       if (vif->can_sg || vif->gso || vif->gso_prefix)
+               max += MAX_SKB_FRAGS + 1; /* extra_info + frags */
+@@ -627,6 +656,7 @@ static void xen_netbk_rx_action(struct x
+               __skb_queue_tail(&rxq, skb);
+               /* Filled the batch queue? */
++              /* XXX FIXME: RX path dependent on MAX_SKB_FRAGS */
+               if (count + MAX_SKB_FRAGS >= XEN_NETIF_RX_RING_SIZE)
+                       break;
+       }
+@@ -870,47 +900,78 @@ static void netbk_fatal_tx_err(struct xe
+ static int netbk_count_requests(struct xenvif *vif,
+                               struct xen_netif_tx_request *first,
++                              RING_IDX first_idx,
+                               struct xen_netif_tx_request *txp,
+                               int work_to_do)
+ {
+       RING_IDX cons = vif->tx.req_cons;
+-      int frags = 0;
++      int slots = 0;
++      int drop_err = 0;
+       if (!(first->flags & XEN_NETTXF_more_data))
+               return 0;
+       do {
+-              if (frags >= work_to_do) {
+-                      netdev_err(vif->dev, "Need more frags\n");
++              if (slots >= work_to_do) {
++                      netdev_err(vif->dev,
++                                 "Asked for %d slots but exceeds this limit\n",
++                                 work_to_do);
+                       netbk_fatal_tx_err(vif);
+                       return -ENODATA;
+               }
+-              if (unlikely(frags >= MAX_SKB_FRAGS)) {
+-                      netdev_err(vif->dev, "Too many frags\n");
++              /* This guest is really using too many slots and
++               * considered malicious.
++               */
++              if (unlikely(slots >= max_skb_slots)) {
++                      netdev_err(vif->dev,
++                                 "Malicious frontend using %d slots, threshold %u\n",
++                                 slots, max_skb_slots);
+                       netbk_fatal_tx_err(vif);
+                       return -E2BIG;
+               }
+-              memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
++              /* Xen network protocol had implicit dependency on
++               * MAX_SKB_FRAGS. XEN_NETIF_NR_SLOTS_MIN is set to the
++               * historical MAX_SKB_FRAGS value 18 to honor the same
++               * behavior as before. Any packet using more than 18
++               * slots but less than max_skb_slots slots is dropped
++               */
++              if (!drop_err && slots >= XEN_NETIF_NR_SLOTS_MIN) {
++                      if (net_ratelimit())
++                              netdev_dbg(vif->dev,
++                                         "Too many slots (%d) exceeding limit (%d), dropping packet\n",
++                                         slots, XEN_NETIF_NR_SLOTS_MIN);
++                      drop_err = -E2BIG;
++              }
++
++              memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + slots),
+                      sizeof(*txp));
+               if (txp->size > first->size) {
+-                      netdev_err(vif->dev, "Frag is bigger than frame.\n");
++                      netdev_err(vif->dev,
++                                 "Invalid tx request, slot size %u > remaining size %u\n",
++                                 txp->size, first->size);
+                       netbk_fatal_tx_err(vif);
+                       return -EIO;
+               }
+               first->size -= txp->size;
+-              frags++;
++              slots++;
+               if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+-                      netdev_err(vif->dev, "txp->offset: %x, size: %u\n",
++                      netdev_err(vif->dev, "Cross page boundary, txp->offset: %x, size: %u\n",
+                                txp->offset, txp->size);
+                       netbk_fatal_tx_err(vif);
+                       return -EINVAL;
+               }
+       } while ((txp++)->flags & XEN_NETTXF_more_data);
+-      return frags;
++
++      if (drop_err) {
++              netbk_tx_err(vif, first, first_idx + slots);
++              return drop_err;
++      }
++
++      return slots;
+ }
+ static struct page *xen_netbk_alloc_page(struct xen_netbk *netbk,
+@@ -934,48 +995,114 @@ static struct gnttab_copy *xen_netbk_get
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       skb_frag_t *frags = shinfo->frags;
+       u16 pending_idx = *((u16 *)skb->data);
+-      int i, start;
++      u16 head_idx = 0;
++      int slot, start;
++      struct page *page;
++      pending_ring_idx_t index, start_idx = 0;
++      uint16_t dst_offset;
++      unsigned int nr_slots;
++      struct pending_tx_info *first = NULL;
++
++      /* At this point shinfo->nr_frags is in fact the number of
++       * slots, which can be as large as XEN_NETIF_NR_SLOTS_MIN.
++       */
++      nr_slots = shinfo->nr_frags;
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+-      for (i = start; i < shinfo->nr_frags; i++, txp++) {
+-              struct page *page;
+-              pending_ring_idx_t index;
++      /* Coalesce tx requests, at this point the packet passed in
++       * should be <= 64K. Any packets larger than 64K have been
++       * handled in netbk_count_requests().
++       */
++      for (shinfo->nr_frags = slot = start; slot < nr_slots;
++           shinfo->nr_frags++) {
+               struct pending_tx_info *pending_tx_info =
+                       netbk->pending_tx_info;
+-              index = pending_index(netbk->pending_cons++);
+-              pending_idx = netbk->pending_ring[index];
+-              page = xen_netbk_alloc_page(netbk, pending_idx);
++              page = alloc_page(GFP_KERNEL|__GFP_COLD);
+               if (!page)
+                       goto err;
+-              gop->source.u.ref = txp->gref;
+-              gop->source.domid = vif->domid;
+-              gop->source.offset = txp->offset;
++              dst_offset = 0;
++              first = NULL;
++              while (dst_offset < PAGE_SIZE && slot < nr_slots) {
++                      gop->flags = GNTCOPY_source_gref;
++
++                      gop->source.u.ref = txp->gref;
++                      gop->source.domid = vif->domid;
++                      gop->source.offset = txp->offset;
++
++                      gop->dest.domid = DOMID_SELF;
++
++                      gop->dest.offset = dst_offset;
++                      gop->dest.u.gmfn = virt_to_mfn(page_address(page));
++
++                      if (dst_offset + txp->size > PAGE_SIZE) {
++                              /* This page can only merge a portion
++                               * of tx request. Do not increment any
++                               * pointer / counter here. The txp
++                               * will be dealt with in future
++                               * rounds, eventually hitting the
++                               * `else` branch.
++                               */
++                              gop->len = PAGE_SIZE - dst_offset;
++                              txp->offset += gop->len;
++                              txp->size -= gop->len;
++                              dst_offset += gop->len; /* quit loop */
++                      } else {
++                              /* This tx request can be merged in the page */
++                              gop->len = txp->size;
++                              dst_offset += gop->len;
++
++                              index = pending_index(netbk->pending_cons++);
++
++                              pending_idx = netbk->pending_ring[index];
++
++                              memcpy(&pending_tx_info[pending_idx].req, txp,
++                                     sizeof(*txp));
++                              xenvif_get(vif);
++
++                              pending_tx_info[pending_idx].vif = vif;
++
++                              /* Poison these fields, corresponding
++                               * fields for head tx req will be set
++                               * to correct values after the loop.
++                               */
++                              netbk->mmap_pages[pending_idx] = (void *)(~0UL);
++                              pending_tx_info[pending_idx].head =
++                                      INVALID_PENDING_RING_IDX;
++
++                              if (!first) {
++                                      first = &pending_tx_info[pending_idx];
++                                      start_idx = index;
++                                      head_idx = pending_idx;
++                              }
+-              gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+-              gop->dest.domid = DOMID_SELF;
+-              gop->dest.offset = txp->offset;
+-
+-              gop->len = txp->size;
+-              gop->flags = GNTCOPY_source_gref;
++                              txp++;
++                              slot++;
++                      }
+-              gop++;
++                      gop++;
++              }
+-              memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+-              xenvif_get(vif);
+-              pending_tx_info[pending_idx].vif = vif;
+-              frag_set_pending_idx(&frags[i], pending_idx);
++              first->req.offset = 0;
++              first->req.size = dst_offset;
++              first->head = start_idx;
++              set_page_ext(page, netbk, head_idx);
++              netbk->mmap_pages[head_idx] = page;
++              frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
+       }
++      BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
++
+       return gop;
+ err:
+       /* Unwind, freeing all pages and sending error responses. */
+-      while (i-- > start) {
+-              xen_netbk_idx_release(netbk, frag_get_pending_idx(&frags[i]),
+-                                    XEN_NETIF_RSP_ERROR);
++      while (shinfo->nr_frags-- > start) {
++              xen_netbk_idx_release(netbk,
++                              frag_get_pending_idx(&frags[shinfo->nr_frags]),
++                              XEN_NETIF_RSP_ERROR);
+       }
+       /* The head too, if necessary. */
+       if (start)
+@@ -991,8 +1118,10 @@ static int xen_netbk_tx_check_gop(struct
+       struct gnttab_copy *gop = *gopp;
+       u16 pending_idx = *((u16 *)skb->data);
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
++      struct pending_tx_info *tx_info;
+       int nr_frags = shinfo->nr_frags;
+       int i, err, start;
++      u16 peek; /* peek into next tx request */
+       /* Check status of header. */
+       err = gop->status;
+@@ -1004,11 +1133,20 @@ static int xen_netbk_tx_check_gop(struct
+       for (i = start; i < nr_frags; i++) {
+               int j, newerr;
++              pending_ring_idx_t head;
+               pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
++              tx_info = &netbk->pending_tx_info[pending_idx];
++              head = tx_info->head;
+               /* Check error status: if okay then remember grant handle. */
+-              newerr = (++gop)->status;
++              do {
++                      newerr = (++gop)->status;
++                      if (newerr)
++                              break;
++                      peek = netbk->pending_ring[pending_index(++head)];
++              } while (!pending_tx_is_head(netbk, peek));
++
+               if (likely(!newerr)) {
+                       /* Had a previous error? Invalidate this fragment. */
+                       if (unlikely(err))
+@@ -1233,11 +1371,12 @@ static unsigned xen_netbk_tx_build_gops(
+       struct sk_buff *skb;
+       int ret;
+-      while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
++      while ((nr_pending_reqs(netbk) + XEN_NETIF_NR_SLOTS_MIN
++              < MAX_PENDING_REQS) &&
+               !list_empty(&netbk->net_schedule_list)) {
+               struct xenvif *vif;
+               struct xen_netif_tx_request txreq;
+-              struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
++              struct xen_netif_tx_request txfrags[max_skb_slots];
+               struct page *page;
+               struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
+               u16 pending_idx;
+@@ -1298,7 +1437,8 @@ static unsigned xen_netbk_tx_build_gops(
+                               continue;
+               }
+-              ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do);
++              ret = netbk_count_requests(vif, &txreq, idx,
++                                         txfrags, work_to_do);
+               if (unlikely(ret < 0))
+                       continue;
+@@ -1325,7 +1465,7 @@ static unsigned xen_netbk_tx_build_gops(
+               pending_idx = netbk->pending_ring[index];
+               data_len = (txreq.size > PKT_PROT_LEN &&
+-                          ret < MAX_SKB_FRAGS) ?
++                          ret < XEN_NETIF_NR_SLOTS_MIN) ?
+                       PKT_PROT_LEN : txreq.size;
+               skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
+@@ -1375,6 +1515,7 @@ static unsigned xen_netbk_tx_build_gops(
+               memcpy(&netbk->pending_tx_info[pending_idx].req,
+                      &txreq, sizeof(txreq));
+               netbk->pending_tx_info[pending_idx].vif = vif;
++              netbk->pending_tx_info[pending_idx].head = index;
+               *((u16 *)skb->data) = pending_idx;
+               __skb_put(skb, data_len);
+@@ -1505,7 +1646,10 @@ static void xen_netbk_idx_release(struct
+ {
+       struct xenvif *vif;
+       struct pending_tx_info *pending_tx_info;
+-      pending_ring_idx_t index;
++      pending_ring_idx_t head;
++      u16 peek; /* peek into next tx request */
++
++      BUG_ON(netbk->mmap_pages[pending_idx] == (void *)(~0UL));
+       /* Already complete? */
+       if (netbk->mmap_pages[pending_idx] == NULL)
+@@ -1514,19 +1658,40 @@ static void xen_netbk_idx_release(struct
+       pending_tx_info = &netbk->pending_tx_info[pending_idx];
+       vif = pending_tx_info->vif;
++      head = pending_tx_info->head;
+-      make_tx_response(vif, &pending_tx_info->req, status);
++      BUG_ON(!pending_tx_is_head(netbk, head));
++      BUG_ON(netbk->pending_ring[pending_index(head)] != pending_idx);
+-      index = pending_index(netbk->pending_prod++);
+-      netbk->pending_ring[index] = pending_idx;
++      do {
++              pending_ring_idx_t index;
++              pending_ring_idx_t idx = pending_index(head);
++              u16 info_idx = netbk->pending_ring[idx];
+-      xenvif_put(vif);
++              pending_tx_info = &netbk->pending_tx_info[info_idx];
++              make_tx_response(vif, &pending_tx_info->req, status);
+-      netbk->mmap_pages[pending_idx]->mapping = NULL;
++              /* Setting any number other than
++               * INVALID_PENDING_RING_IDX indicates this slot is
++               * starting a new packet / ending a previous packet.
++               */
++              pending_tx_info->head = 0;
++
++              index = pending_index(netbk->pending_prod++);
++              netbk->pending_ring[index] = netbk->pending_ring[info_idx];
++
++              xenvif_put(vif);
++
++              peek = netbk->pending_ring[pending_index(++head)];
++
++      } while (!pending_tx_is_head(netbk, peek));
++
++      netbk->mmap_pages[pending_idx]->mapping = 0;
+       put_page(netbk->mmap_pages[pending_idx]);
+       netbk->mmap_pages[pending_idx] = NULL;
+ }
++
+ static void make_tx_response(struct xenvif *vif,
+                            struct xen_netif_tx_request *txp,
+                            s8       st)
+@@ -1579,8 +1744,9 @@ static inline int rx_work_todo(struct xe
+ static inline int tx_work_todo(struct xen_netbk *netbk)
+ {
+-      if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+-                      !list_empty(&netbk->net_schedule_list))
++      if ((nr_pending_reqs(netbk) + XEN_NETIF_NR_SLOTS_MIN
++           < MAX_PENDING_REQS) &&
++           !list_empty(&netbk->net_schedule_list))
+               return 1;
+       return 0;
+@@ -1663,6 +1829,13 @@ static int __init netback_init(void)
+       if (!xen_domain())
+               return -ENODEV;
++      if (max_skb_slots < XEN_NETIF_NR_SLOTS_MIN) {
++              printk(KERN_INFO
++                     "xen-netback: max_skb_slots too small (%d), bump it to XEN_NETIF_NR_SLOTS_MIN (%d)\n",
++                     max_skb_slots, XEN_NETIF_NR_SLOTS_MIN);
++              max_skb_slots = XEN_NETIF_NR_SLOTS_MIN;
++      }
++
+       xen_netbk_group_nr = num_online_cpus();
+       xen_netbk = vzalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr);
+       if (!xen_netbk)
+--- a/include/xen/interface/io/netif.h
++++ b/include/xen/interface/io/netif.h
+@@ -13,6 +13,24 @@
+ #include "../grant_table.h"
+ /*
++ * Older implementation of Xen network frontend / backend has an
++ * implicit dependency on the MAX_SKB_FRAGS as the maximum number of
++ * ring slots a skb can use. Netfront / netback may not work as
++ * expected when frontend and backend have different MAX_SKB_FRAGS.
++ *
++ * A better approach is to add mechanism for netfront / netback to
++ * negotiate this value. However we cannot fix all possible
++ * frontends, so we need to define a value which states the minimum
++ * slots backend must support.
++ *
++ * The minimum value derives from older Linux kernel's MAX_SKB_FRAGS
++ * (18), which is proved to work with most frontends. Any new backend
++ * which doesn't negotiate with frontend should expect frontend to
++ * send a valid packet using slots up to this value.
++ */
++#define XEN_NETIF_NR_SLOTS_MIN 18
++
++/*
+  * Notifications after enqueuing any type of message should be conditional on
+  * the appropriate req_event or rsp_event field in the shared ring.
+  * If the client sends notification for rx requests then it should specify
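
The slot-count policy spelled out in the commit message above (1-18 slots
valid, 19 up to max_skb_slots dropped with an error response, anything beyond
that fatal) can be summarised in a few lines. This is a hedged standalone
sketch: the classify() helper and the verdict names are invented, and only the
two thresholds come from the patch.

/*
 * Slot-count policy sketch; operates on total slots per packet.
 */
#include <stdio.h>

#define XEN_NETIF_NR_SLOTS_MIN 18
static unsigned int max_skb_slots = 20;         /* module parameter default */

enum verdict { VALID, DROP, FATAL };

static enum verdict classify(unsigned int slots)
{
	if (slots > max_skb_slots)
		return FATAL;           /* frontend considered malicious */
	if (slots > XEN_NETIF_NR_SLOTS_MIN)
		return DROP;            /* legal but unsupported: drop   */
	return VALID;
}

int main(void)
{
	static const char *name[] = { "valid", "drop", "fatal" };
	unsigned int tests[] = { 1, 17, 18, 19, 20, 21 };
	unsigned int i;

	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%2u slots -> %s\n", tests[i], name[classify(tests[i])]);
	return 0;
}
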
diff --git a/queue-3.4/xen-netback-don-t-disconnect-frontend-when-seeing-oversize-packet.patch b/queue-3.4/xen-netback-don-t-disconnect-frontend-when-seeing-oversize-packet.patch
new file mode 100644 (file)
index 0000000..405ca51
--- /dev/null
@@ -0,0 +1,63 @@
+From 276d632e34fcf94042dd4182e4398a8b6cbdc07f Mon Sep 17 00:00:00 2001
+From: Wei Liu <wei.liu2@citrix.com>
+Date: Mon, 22 Apr 2013 02:20:43 +0000
+Subject: xen-netback: don't disconnect frontend when seeing oversize packet
+
+From: Wei Liu <wei.liu2@citrix.com>
+
+commit 03393fd5cc2b6cdeec32b704ecba64dbb0feae3c upstream.
+
+Some frontend drivers are sending packets > 64 KiB in length. This length
+overflows the length field in the first slot, making the following slots
+appear to have an invalid length.
+
+Turn this error back into a non-fatal error by dropping the packet. To avoid
+the following slots triggering fatal errors, consume all slots in the
+packet.
+
+This does not reopen the security hole in XSA-39 as, if the packet has an
+invalid number of slots, it will still hit the fatal error case.
+
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Signed-off-by: Wei Liu <wei.liu2@citrix.com>
+Acked-by: Ian Campbell <ian.campbell@citrix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/xen-netback/netback.c |   22 ++++++++++++++++------
+ 1 file changed, 16 insertions(+), 6 deletions(-)
+
+--- a/drivers/net/xen-netback/netback.c
++++ b/drivers/net/xen-netback/netback.c
+@@ -947,12 +947,22 @@ static int netbk_count_requests(struct x
+               memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + slots),
+                      sizeof(*txp));
+-              if (txp->size > first->size) {
+-                      netdev_err(vif->dev,
+-                                 "Invalid tx request, slot size %u > remaining size %u\n",
+-                                 txp->size, first->size);
+-                      netbk_fatal_tx_err(vif);
+-                      return -EIO;
++
++              /* If the guest submitted a frame >= 64 KiB then
++               * first->size overflowed and following slots will
++               * appear to be larger than the frame.
++               *
++               * This cannot be fatal error as there are buggy
++               * frontends that do this.
++               *
++               * Consume all slots and drop the packet.
++               */
++              if (!drop_err && txp->size > first->size) {
++                      if (net_ratelimit())
++                              netdev_dbg(vif->dev,
++                                         "Invalid tx request, slot size %u > remaining size %u\n",
++                                         txp->size, first->size);
++                      drop_err = -EIO;
+               }
+               first->size -= txp->size;
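
The reason frames >= 64 KiB land in this path is that the per-slot size field
is 16 bits wide, so the total length stored in the first slot wraps to a small
number and later slots then look larger than the "remaining" size. A small
standalone illustration follows; the concrete values are made up, and the
16-bit width is taken to match the tx request's size field.

/*
 * 16-bit size wrap-around illustration for oversize frames.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t frame_len = 66560;                 /* a 65 KiB frame   */
	uint16_t first_size = (uint16_t)frame_len;  /* wraps to 1024    */
	uint16_t slot_size = 4096;                  /* an ordinary slot */

	printf("first->size = %u\n", (unsigned)first_size);
	if (slot_size > first_size)
		printf("slot (%u) > remaining (%u): drop the packet, don't disconnect\n",
		       (unsigned)slot_size, (unsigned)first_size);
	return 0;
}
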
diff --git a/queue-3.4/xen-netback-fix-sparse-warning.patch b/queue-3.4/xen-netback-fix-sparse-warning.patch
new file mode 100644 (file)
index 0000000..e037157
--- /dev/null
@@ -0,0 +1,32 @@
+From 4ea6949b9f6e783e3b053266d308582b8ae83c9e Mon Sep 17 00:00:00 2001
+From: stephen hemminger <stephen@networkplumber.org>
+Date: Wed, 10 Apr 2013 10:54:46 +0000
+Subject: xen-netback: fix sparse warning
+
+From: stephen hemminger <stephen@networkplumber.org>
+
+commit 9eaee8beeeb3bca0d9b14324fd9d467d48db784c upstream.
+
+Fix warning about 0 used as NULL.
+
+Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/xen-netback/netback.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/xen-netback/netback.c
++++ b/drivers/net/xen-netback/netback.c
+@@ -1522,7 +1522,7 @@ static void xen_netbk_idx_release(struct
+       xenvif_put(vif);
+-      netbk->mmap_pages[pending_idx]->mapping = 0;
++      netbk->mmap_pages[pending_idx]->mapping = NULL;
+       put_page(netbk->mmap_pages[pending_idx]);
+       netbk->mmap_pages[pending_idx] = NULL;
+ }
diff --git a/queue-3.4/xen-smp-fix-leakage-of-timer-interrupt-line-for-every-cpu-online-offline.patch b/queue-3.4/xen-smp-fix-leakage-of-timer-interrupt-line-for-every-cpu-online-offline.patch
new file mode 100644 (file)
index 0000000..f55ca97
--- /dev/null
@@ -0,0 +1,62 @@
+From 20c4ec0f41d9be235c376c4cd5f5517ca31d7874 Mon Sep 17 00:00:00 2001
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Date: Tue, 16 Apr 2013 13:49:26 -0400
+Subject: xen/smp: Fix leakage of timer interrupt line for every CPU online/offline.
+
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+
+commit 888b65b4bc5e7fcbbb967023300cd5d44dba1950 upstream.
+
+In the PVHVM CPU online/offline path we would leak the timer%d IRQ
+line every time we do an offline event. The online path
+(xen_hvm_setup_cpu_clockevents via
+x86_cpuinit.setup_percpu_clockev) would allocate a new interrupt
+line for the timer%d.
+
+But we would still use the old interrupt line, leading to:
+
+kernel BUG at /home/konrad/ssd/konrad/linux/kernel/hrtimer.c:1261!
+invalid opcode: 0000 [#1] SMP
+RIP: 0010:[<ffffffff810b9e21>]  [<ffffffff810b9e21>] hrtimer_interrupt+0x261/0x270
+.. snip..
+ <IRQ>
+ [<ffffffff810445ef>] xen_timer_interrupt+0x2f/0x1b0
+ [<ffffffff81104825>] ? stop_machine_cpu_stop+0xb5/0xf0
+ [<ffffffff8111434c>] handle_irq_event_percpu+0x7c/0x240
+ [<ffffffff811175b9>] handle_percpu_irq+0x49/0x70
+ [<ffffffff813a74a3>] __xen_evtchn_do_upcall+0x1c3/0x2f0
+ [<ffffffff813a760a>] xen_evtchn_do_upcall+0x2a/0x40
+ [<ffffffff8167c26d>] xen_hvm_callback_vector+0x6d/0x80
+ <EOI>
+ [<ffffffff81666d01>] ? start_secondary+0x193/0x1a8
+ [<ffffffff81666cfd>] ? start_secondary+0x18f/0x1a8
+
+There is also the oddity (timer1) in /proc/interrupts after
+offlining CPU1:
+
+  64:       1121          0  xen-percpu-virq      timer0
+  78:          0          0  xen-percpu-virq      timer1
+  84:          0       2483  xen-percpu-virq      timer2
+
+This patch fixes it.
+
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/xen/smp.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/xen/smp.c
++++ b/arch/x86/xen/smp.c
+@@ -576,6 +576,7 @@ static void xen_hvm_cpu_die(unsigned int
+       unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+       unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+       unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
++      xen_teardown_timer(cpu);
+       native_cpu_die(cpu);
+ }
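
The rule the one-line fix above restores is simple: whatever the per-CPU
online path allocates, the offline path must release, or every online/offline
cycle strands another interrupt line (the orphaned timer1 entry in the
/proc/interrupts excerpt). A toy standalone sketch of that pairing, with all
names invented, mirroring only the shape of the fix:

/*
 * Online/offline pairing sketch; not the Xen hotplug code.
 */
#include <stdio.h>

#define NR_CPUS 4
static int timer_irq[NR_CPUS] = { -1, -1, -1, -1 };
static int next_irq = 64;

static void cpu_online(int cpu)
{
	timer_irq[cpu] = next_irq++;    /* bind a fresh timer IRQ line */
	printf("cpu%d online,  timer irq %d\n", cpu, timer_irq[cpu]);
}

static void cpu_offline(int cpu)
{
	/* Without this teardown the old line would never be freed. */
	printf("cpu%d offline, freeing timer irq %d\n", cpu, timer_irq[cpu]);
	timer_irq[cpu] = -1;
}

int main(void)
{
	cpu_online(1);
	cpu_offline(1);
	cpu_online(1);          /* gets a new line; the old one was released */
	return 0;
}
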
diff --git a/queue-3.4/xen-smp-spinlock-fix-leakage-of-the-spinlock-interrupt-line-for-every-cpu-online-offline.patch b/queue-3.4/xen-smp-spinlock-fix-leakage-of-the-spinlock-interrupt-line-for-every-cpu-online-offline.patch
new file mode 100644 (file)
index 0000000..289ae2f
--- /dev/null
@@ -0,0 +1,75 @@
+From abfcdd7ef364708e54276e97041fe4dc1dd8dc12 Mon Sep 17 00:00:00 2001
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Date: Tue, 16 Apr 2013 14:08:50 -0400
+Subject: xen/smp/spinlock: Fix leakage of the spinlock interrupt line for every CPU online/offline
+
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+
+commit 66ff0fe9e7bda8aec99985b24daad03652f7304e upstream.
+
+While we don't use the spinlock interrupt line (see commit
+f10cd522c5fbfec9ae3cc01967868c9c2401ed23 -
+"xen: disable PV spinlocks on HVM" - for details) we should still do
+the proper init / deinit sequence. We did not do that correctly: on
+CPU init for a PVHVM guest we would allocate a new interrupt line but
+failed to deallocate the old interrupt line.
+
+This resulted in leakage of an irq_desc but more importantly this splat
+as we online an offlined CPU:
+
+genirq: Flags mismatch irq 71. 0002cc20 (spinlock1) vs. 0002cc20 (spinlock1)
+Pid: 2542, comm: init.late Not tainted 3.9.0-rc6upstream #1
+Call Trace:
+ [<ffffffff811156de>] __setup_irq+0x23e/0x4a0
+ [<ffffffff81194191>] ? kmem_cache_alloc_trace+0x221/0x250
+ [<ffffffff811161bb>] request_threaded_irq+0xfb/0x160
+ [<ffffffff8104c6f0>] ? xen_spin_trylock+0x20/0x20
+ [<ffffffff813a8423>] bind_ipi_to_irqhandler+0xa3/0x160
+ [<ffffffff81303758>] ? kasprintf+0x38/0x40
+ [<ffffffff8104c6f0>] ? xen_spin_trylock+0x20/0x20
+ [<ffffffff810cad35>] ? update_max_interval+0x15/0x40
+ [<ffffffff816605db>] xen_init_lock_cpu+0x3c/0x78
+ [<ffffffff81660029>] xen_hvm_cpu_notify+0x29/0x33
+ [<ffffffff81676bdd>] notifier_call_chain+0x4d/0x70
+ [<ffffffff810bb2a9>] __raw_notifier_call_chain+0x9/0x10
+ [<ffffffff8109402b>] __cpu_notify+0x1b/0x30
+ [<ffffffff8166834a>] _cpu_up+0xa0/0x14b
+ [<ffffffff816684ce>] cpu_up+0xd9/0xec
+ [<ffffffff8165f754>] store_online+0x94/0xd0
+ [<ffffffff8141d15b>] dev_attr_store+0x1b/0x20
+ [<ffffffff81218f44>] sysfs_write_file+0xf4/0x170
+ [<ffffffff811a2864>] vfs_write+0xb4/0x130
+ [<ffffffff811a302a>] sys_write+0x5a/0xa0
+ [<ffffffff8167ada9>] system_call_fastpath+0x16/0x1b
+cpu 1 spinlock event irq -16
+smpboot: Booting Node 0 Processor 1 APIC 0x2
+
+And if one looks at /proc/interrupts right after
+offlining CPU1:
+
+  70:          0          0  xen-percpu-ipi       spinlock0
+  71:          0          0  xen-percpu-ipi       spinlock1
+  77:          0          0  xen-percpu-ipi       spinlock2
+
+There is the oddity of the 'spinlock1' still being present.
+
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Yijing Wang <wangyijing@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/xen/smp.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/xen/smp.c
++++ b/arch/x86/xen/smp.c
+@@ -576,6 +576,7 @@ static void xen_hvm_cpu_die(unsigned int
+       unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
+       unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
+       unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
++      xen_uninit_lock_cpu(cpu);
+       xen_teardown_timer(cpu);
+       native_cpu_die(cpu);
+ }
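
The same pairing rule, seen from the failure side: if the spinlock IPI line is
never freed on offline, the next online tries to claim a line that is still
owned and is refused, which is roughly what the "Flags mismatch irq 71 ...
(spinlock1)" splat amounts to. A rough standalone sketch with an invented
registry (not the genirq API):

/*
 * Double-request failure sketch; the registry is invented for illustration.
 */
#include <stdio.h>

#define NR_IRQS 8
static const char *owner[NR_IRQS];

static int request_line(int irq, const char *name)
{
	if (owner[irq]) {
		printf("mismatch on irq %d: %s vs. %s\n", irq, owner[irq], name);
		return -1;
	}
	owner[irq] = name;
	return 0;
}

static void free_line(int irq)
{
	owner[irq] = NULL;
}

int main(void)
{
	request_line(3, "spinlock1");   /* CPU1 comes online                   */
	/* CPU1 goes offline without freeing the line (the bug) ...            */
	request_line(3, "spinlock1");   /* next online: the request is refused */

	free_line(3);                   /* with the fix the line is released   */
	request_line(3, "spinlock1");   /* ... and re-onlining succeeds again  */
	return 0;
}
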