From: Greg Kroah-Hartman Date: Mon, 23 Apr 2007 23:29:15 +0000 (-0700) Subject: start up 2.6.20 queue again... X-Git-Tag: v2.6.20.8~3 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a74cad59e74a4cfa4921e0e1fec77199b935fa32;p=thirdparty%2Fkernel%2Fstable-queue.git start up 2.6.20 queue again... --- diff --git a/queue-2.6.20/fix-sparc64-sbus-iommu-allocator.patch b/queue-2.6.20/fix-sparc64-sbus-iommu-allocator.patch new file mode 100644 index 00000000000..4d187dcf864 --- /dev/null +++ b/queue-2.6.20/fix-sparc64-sbus-iommu-allocator.patch @@ -0,0 +1,826 @@ +From stable-bounces@linux.kernel.org Tue Apr 17 14:38:46 2007 +From: David Miller +Date: Tue, 17 Apr 2007 14:37:25 -0700 (PDT) +Subject: Fix sparc64 SBUS IOMMU allocator +To: stable@kernel.org +Cc: bunk@stusta.de +Message-ID: <20070417.143725.72712787.davem@davemloft.net> + +From: David Miller + +[SPARC64]: Fix SBUS IOMMU allocation code. + +There are several IOMMU allocator bugs. Instead of trying to fix this +overly complicated code, just mirror the PCI IOMMU arena allocator +which is very stable and well stress tested. + +I tried to make the code as identical as possible so we can switch +sun4u PCI and SBUS over to a common piece of IOMMU code. All that +will be need are two callbacks, one to do a full IOMMU flush and one +to do a streaming buffer flush. + +This patch gets rid of a lot of hangs and mysterious crashes on SBUS +sparc64 systems, at least for me. + +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + arch/sparc64/kernel/sbus.c | 566 ++++++++++++++++++--------------------------- + 1 file changed, 235 insertions(+), 331 deletions(-) + +--- a/arch/sparc64/kernel/sbus.c ++++ b/arch/sparc64/kernel/sbus.c +@@ -24,48 +24,25 @@ + + #include "iommu_common.h" + +-/* These should be allocated on an SMP_CACHE_BYTES +- * aligned boundary for optimal performance. +- * +- * On SYSIO, using an 8K page size we have 1GB of SBUS +- * DMA space mapped. We divide this space into equally +- * sized clusters. We allocate a DMA mapping from the +- * cluster that matches the order of the allocation, or +- * if the order is greater than the number of clusters, +- * we try to allocate from the last cluster. +- */ +- +-#define NCLUSTERS 8UL +-#define ONE_GIG (1UL * 1024UL * 1024UL * 1024UL) +-#define CLUSTER_SIZE (ONE_GIG / NCLUSTERS) +-#define CLUSTER_MASK (CLUSTER_SIZE - 1) +-#define CLUSTER_NPAGES (CLUSTER_SIZE >> IO_PAGE_SHIFT) + #define MAP_BASE ((u32)0xc0000000) + ++struct sbus_iommu_arena { ++ unsigned long *map; ++ unsigned int hint; ++ unsigned int limit; ++}; ++ + struct sbus_iommu { +-/*0x00*/spinlock_t lock; ++ spinlock_t lock; + +-/*0x08*/iopte_t *page_table; +-/*0x10*/unsigned long strbuf_regs; +-/*0x18*/unsigned long iommu_regs; +-/*0x20*/unsigned long sbus_control_reg; +- +-/*0x28*/volatile unsigned long strbuf_flushflag; +- +- /* If NCLUSTERS is ever decresed to 4 or lower, +- * you must increase the size of the type of +- * these counters. You have been duly warned. -DaveM +- */ +-/*0x30*/struct { +- u16 next; +- u16 flush; +- } alloc_info[NCLUSTERS]; +- +- /* The lowest used consistent mapping entry. Since +- * we allocate consistent maps out of cluster 0 this +- * is relative to the beginning of closter 0. 
+- */ +-/*0x50*/u32 lowest_consistent_map; ++ struct sbus_iommu_arena arena; ++ ++ iopte_t *page_table; ++ unsigned long strbuf_regs; ++ unsigned long iommu_regs; ++ unsigned long sbus_control_reg; ++ ++ volatile unsigned long strbuf_flushflag; + }; + + /* Offsets from iommu_regs */ +@@ -91,19 +68,6 @@ static void __iommu_flushall(struct sbus + tag += 8UL; + } + upa_readq(iommu->sbus_control_reg); +- +- for (entry = 0; entry < NCLUSTERS; entry++) { +- iommu->alloc_info[entry].flush = +- iommu->alloc_info[entry].next; +- } +-} +- +-static void iommu_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages) +-{ +- while (npages--) +- upa_writeq(base + (npages << IO_PAGE_SHIFT), +- iommu->iommu_regs + IOMMU_FLUSH); +- upa_readq(iommu->sbus_control_reg); + } + + /* Offsets from strbuf_regs */ +@@ -156,178 +120,115 @@ static void sbus_strbuf_flush(struct sbu + base, npages); + } + +-static iopte_t *alloc_streaming_cluster(struct sbus_iommu *iommu, unsigned long npages) ++/* Based largely upon the ppc64 iommu allocator. */ ++static long sbus_arena_alloc(struct sbus_iommu *iommu, unsigned long npages) + { +- iopte_t *iopte, *limit, *first, *cluster; +- unsigned long cnum, ent, nent, flush_point, found; +- +- cnum = 0; +- nent = 1; +- while ((1UL << cnum) < npages) +- cnum++; +- if(cnum >= NCLUSTERS) { +- nent = 1UL << (cnum - NCLUSTERS); +- cnum = NCLUSTERS - 1; +- } +- iopte = iommu->page_table + (cnum * CLUSTER_NPAGES); +- +- if (cnum == 0) +- limit = (iommu->page_table + +- iommu->lowest_consistent_map); +- else +- limit = (iopte + CLUSTER_NPAGES); +- +- iopte += ((ent = iommu->alloc_info[cnum].next) << cnum); +- flush_point = iommu->alloc_info[cnum].flush; +- +- first = iopte; +- cluster = NULL; +- found = 0; +- for (;;) { +- if (iopte_val(*iopte) == 0UL) { +- found++; +- if (!cluster) +- cluster = iopte; ++ struct sbus_iommu_arena *arena = &iommu->arena; ++ unsigned long n, i, start, end, limit; ++ int pass; ++ ++ limit = arena->limit; ++ start = arena->hint; ++ pass = 0; ++ ++again: ++ n = find_next_zero_bit(arena->map, limit, start); ++ end = n + npages; ++ if (unlikely(end >= limit)) { ++ if (likely(pass < 1)) { ++ limit = start; ++ start = 0; ++ __iommu_flushall(iommu); ++ pass++; ++ goto again; + } else { +- /* Used cluster in the way */ +- cluster = NULL; +- found = 0; ++ /* Scanned the whole thing, give up. */ ++ return -1; + } ++ } + +- if (found == nent) +- break; +- +- iopte += (1 << cnum); +- ent++; +- if (iopte >= limit) { +- iopte = (iommu->page_table + (cnum * CLUSTER_NPAGES)); +- ent = 0; +- +- /* Multiple cluster allocations must not wrap */ +- cluster = NULL; +- found = 0; ++ for (i = n; i < end; i++) { ++ if (test_bit(i, arena->map)) { ++ start = i + 1; ++ goto again; + } +- if (ent == flush_point) +- __iommu_flushall(iommu); +- if (iopte == first) +- goto bad; + } + +- /* ent/iopte points to the last cluster entry we're going to use, +- * so save our place for the next allocation. +- */ +- if ((iopte + (1 << cnum)) >= limit) +- ent = 0; +- else +- ent = ent + 1; +- iommu->alloc_info[cnum].next = ent; +- if (ent == flush_point) +- __iommu_flushall(iommu); +- +- /* I've got your streaming cluster right here buddy boy... 
*/ +- return cluster; +- +-bad: +- printk(KERN_EMERG "sbus: alloc_streaming_cluster of npages(%ld) failed!\n", +- npages); +- return NULL; ++ for (i = n; i < end; i++) ++ __set_bit(i, arena->map); ++ ++ arena->hint = end; ++ ++ return n; + } + +-static void free_streaming_cluster(struct sbus_iommu *iommu, u32 base, unsigned long npages) ++static void sbus_arena_free(struct sbus_iommu_arena *arena, unsigned long base, unsigned long npages) + { +- unsigned long cnum, ent, nent; +- iopte_t *iopte; ++ unsigned long i; + +- cnum = 0; +- nent = 1; +- while ((1UL << cnum) < npages) +- cnum++; +- if(cnum >= NCLUSTERS) { +- nent = 1UL << (cnum - NCLUSTERS); +- cnum = NCLUSTERS - 1; +- } +- ent = (base & CLUSTER_MASK) >> (IO_PAGE_SHIFT + cnum); +- iopte = iommu->page_table + ((base - MAP_BASE) >> IO_PAGE_SHIFT); +- do { +- iopte_val(*iopte) = 0UL; +- iopte += 1 << cnum; +- } while(--nent); +- +- /* If the global flush might not have caught this entry, +- * adjust the flush point such that we will flush before +- * ever trying to reuse it. +- */ +-#define between(X,Y,Z) (((Z) - (Y)) >= ((X) - (Y))) +- if (between(ent, iommu->alloc_info[cnum].next, iommu->alloc_info[cnum].flush)) +- iommu->alloc_info[cnum].flush = ent; +-#undef between ++ for (i = base; i < (base + npages); i++) ++ __clear_bit(i, arena->map); + } + +-/* We allocate consistent mappings from the end of cluster zero. */ +-static iopte_t *alloc_consistent_cluster(struct sbus_iommu *iommu, unsigned long npages) ++static void sbus_iommu_table_init(struct sbus_iommu *iommu, unsigned int tsbsize) + { +- iopte_t *iopte; ++ unsigned long tsbbase, order, sz, num_tsb_entries; + +- iopte = iommu->page_table + (1 * CLUSTER_NPAGES); +- while (iopte > iommu->page_table) { +- iopte--; +- if (!(iopte_val(*iopte) & IOPTE_VALID)) { +- unsigned long tmp = npages; +- +- while (--tmp) { +- iopte--; +- if (iopte_val(*iopte) & IOPTE_VALID) +- break; +- } +- if (tmp == 0) { +- u32 entry = (iopte - iommu->page_table); ++ num_tsb_entries = tsbsize / sizeof(iopte_t); + +- if (entry < iommu->lowest_consistent_map) +- iommu->lowest_consistent_map = entry; +- return iopte; +- } +- } ++ /* Setup initial software IOMMU state. */ ++ spin_lock_init(&iommu->lock); ++ ++ /* Allocate and initialize the free area map. */ ++ sz = num_tsb_entries / 8; ++ sz = (sz + 7UL) & ~7UL; ++ iommu->arena.map = kzalloc(sz, GFP_KERNEL); ++ if (!iommu->arena.map) { ++ prom_printf("PCI_IOMMU: Error, kmalloc(arena.map) failed.\n"); ++ prom_halt(); ++ } ++ iommu->arena.limit = num_tsb_entries; ++ ++ /* Now allocate and setup the IOMMU page table itself. 
*/ ++ order = get_order(tsbsize); ++ tsbbase = __get_free_pages(GFP_KERNEL, order); ++ if (!tsbbase) { ++ prom_printf("IOMMU: Error, gfp(tsb) failed.\n"); ++ prom_halt(); + } +- return NULL; ++ iommu->page_table = (iopte_t *)tsbbase; ++ memset(iommu->page_table, 0, tsbsize); + } + +-static void free_consistent_cluster(struct sbus_iommu *iommu, u32 base, unsigned long npages) ++static inline iopte_t *alloc_npages(struct sbus_iommu *iommu, unsigned long npages) + { +- iopte_t *iopte = iommu->page_table + ((base - MAP_BASE) >> IO_PAGE_SHIFT); ++ long entry; + +- if ((iopte - iommu->page_table) == iommu->lowest_consistent_map) { +- iopte_t *walk = iopte + npages; +- iopte_t *limit; ++ entry = sbus_arena_alloc(iommu, npages); ++ if (unlikely(entry < 0)) ++ return NULL; + +- limit = iommu->page_table + CLUSTER_NPAGES; +- while (walk < limit) { +- if (iopte_val(*walk) != 0UL) +- break; +- walk++; +- } +- iommu->lowest_consistent_map = +- (walk - iommu->page_table); +- } ++ return iommu->page_table + entry; ++} + +- while (npages--) +- *iopte++ = __iopte(0UL); ++static inline void free_npages(struct sbus_iommu *iommu, dma_addr_t base, unsigned long npages) ++{ ++ sbus_arena_free(&iommu->arena, base >> IO_PAGE_SHIFT, npages); + } + + void *sbus_alloc_consistent(struct sbus_dev *sdev, size_t size, dma_addr_t *dvma_addr) + { +- unsigned long order, first_page, flags; + struct sbus_iommu *iommu; + iopte_t *iopte; ++ unsigned long flags, order, first_page; + void *ret; + int npages; + +- if (size <= 0 || sdev == NULL || dvma_addr == NULL) +- return NULL; +- + size = IO_PAGE_ALIGN(size); + order = get_order(size); + if (order >= 10) + return NULL; ++ + first_page = __get_free_pages(GFP_KERNEL|__GFP_COMP, order); + if (first_page == 0UL) + return NULL; +@@ -336,108 +237,121 @@ void *sbus_alloc_consistent(struct sbus_ + iommu = sdev->bus->iommu; + + spin_lock_irqsave(&iommu->lock, flags); +- iopte = alloc_consistent_cluster(iommu, size >> IO_PAGE_SHIFT); +- if (iopte == NULL) { +- spin_unlock_irqrestore(&iommu->lock, flags); ++ iopte = alloc_npages(iommu, size >> IO_PAGE_SHIFT); ++ spin_unlock_irqrestore(&iommu->lock, flags); ++ ++ if (unlikely(iopte == NULL)) { + free_pages(first_page, order); + return NULL; + } + +- /* Ok, we're committed at this point. 
*/ +- *dvma_addr = MAP_BASE + ((iopte - iommu->page_table) << IO_PAGE_SHIFT); ++ *dvma_addr = (MAP_BASE + ++ ((iopte - iommu->page_table) << IO_PAGE_SHIFT)); + ret = (void *) first_page; + npages = size >> IO_PAGE_SHIFT; ++ first_page = __pa(first_page); + while (npages--) { +- *iopte++ = __iopte(IOPTE_VALID | IOPTE_CACHE | IOPTE_WRITE | +- (__pa(first_page) & IOPTE_PAGE)); ++ iopte_val(*iopte) = (IOPTE_VALID | IOPTE_CACHE | ++ IOPTE_WRITE | ++ (first_page & IOPTE_PAGE)); ++ iopte++; + first_page += IO_PAGE_SIZE; + } +- iommu_flush(iommu, *dvma_addr, size >> IO_PAGE_SHIFT); +- spin_unlock_irqrestore(&iommu->lock, flags); + + return ret; + } + + void sbus_free_consistent(struct sbus_dev *sdev, size_t size, void *cpu, dma_addr_t dvma) + { +- unsigned long order, npages; + struct sbus_iommu *iommu; +- +- if (size <= 0 || sdev == NULL || cpu == NULL) +- return; ++ iopte_t *iopte; ++ unsigned long flags, order, npages; + + npages = IO_PAGE_ALIGN(size) >> IO_PAGE_SHIFT; + iommu = sdev->bus->iommu; ++ iopte = iommu->page_table + ++ ((dvma - MAP_BASE) >> IO_PAGE_SHIFT); ++ ++ spin_lock_irqsave(&iommu->lock, flags); ++ ++ free_npages(iommu, dvma - MAP_BASE, npages); + +- spin_lock_irq(&iommu->lock); +- free_consistent_cluster(iommu, dvma, npages); +- iommu_flush(iommu, dvma, npages); +- spin_unlock_irq(&iommu->lock); ++ spin_unlock_irqrestore(&iommu->lock, flags); + + order = get_order(size); + if (order < 10) + free_pages((unsigned long)cpu, order); + } + +-dma_addr_t sbus_map_single(struct sbus_dev *sdev, void *ptr, size_t size, int dir) ++dma_addr_t sbus_map_single(struct sbus_dev *sdev, void *ptr, size_t sz, int direction) + { +- struct sbus_iommu *iommu = sdev->bus->iommu; +- unsigned long npages, pbase, flags; +- iopte_t *iopte; +- u32 dma_base, offset; +- unsigned long iopte_bits; ++ struct sbus_iommu *iommu; ++ iopte_t *base; ++ unsigned long flags, npages, oaddr; ++ unsigned long i, base_paddr; ++ u32 bus_addr, ret; ++ unsigned long iopte_protection; ++ ++ iommu = sdev->bus->iommu; + +- if (dir == SBUS_DMA_NONE) ++ if (unlikely(direction == SBUS_DMA_NONE)) + BUG(); + +- pbase = (unsigned long) ptr; +- offset = (u32) (pbase & ~IO_PAGE_MASK); +- size = (IO_PAGE_ALIGN(pbase + size) - (pbase & IO_PAGE_MASK)); +- pbase = (unsigned long) __pa(pbase & IO_PAGE_MASK); ++ oaddr = (unsigned long)ptr; ++ npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK); ++ npages >>= IO_PAGE_SHIFT; + + spin_lock_irqsave(&iommu->lock, flags); +- npages = size >> IO_PAGE_SHIFT; +- iopte = alloc_streaming_cluster(iommu, npages); +- if (iopte == NULL) +- goto bad; +- dma_base = MAP_BASE + ((iopte - iommu->page_table) << IO_PAGE_SHIFT); +- npages = size >> IO_PAGE_SHIFT; +- iopte_bits = IOPTE_VALID | IOPTE_STBUF | IOPTE_CACHE; +- if (dir != SBUS_DMA_TODEVICE) +- iopte_bits |= IOPTE_WRITE; +- while (npages--) { +- *iopte++ = __iopte(iopte_bits | (pbase & IOPTE_PAGE)); +- pbase += IO_PAGE_SIZE; +- } +- npages = size >> IO_PAGE_SHIFT; ++ base = alloc_npages(iommu, npages); + spin_unlock_irqrestore(&iommu->lock, flags); + +- return (dma_base | offset); ++ if (unlikely(!base)) ++ BUG(); + +-bad: +- spin_unlock_irqrestore(&iommu->lock, flags); +- BUG(); +- return 0; ++ bus_addr = (MAP_BASE + ++ ((base - iommu->page_table) << IO_PAGE_SHIFT)); ++ ret = bus_addr | (oaddr & ~IO_PAGE_MASK); ++ base_paddr = __pa(oaddr & IO_PAGE_MASK); ++ ++ iopte_protection = IOPTE_VALID | IOPTE_STBUF | IOPTE_CACHE; ++ if (direction != SBUS_DMA_TODEVICE) ++ iopte_protection |= IOPTE_WRITE; ++ ++ for (i = 0; i < npages; i++, base++, 
base_paddr += IO_PAGE_SIZE) ++ iopte_val(*base) = iopte_protection | base_paddr; ++ ++ return ret; + } + +-void sbus_unmap_single(struct sbus_dev *sdev, dma_addr_t dma_addr, size_t size, int direction) ++void sbus_unmap_single(struct sbus_dev *sdev, dma_addr_t bus_addr, size_t sz, int direction) + { + struct sbus_iommu *iommu = sdev->bus->iommu; +- u32 dma_base = dma_addr & IO_PAGE_MASK; +- unsigned long flags; ++ iopte_t *base; ++ unsigned long flags, npages, i; + +- size = (IO_PAGE_ALIGN(dma_addr + size) - dma_base); ++ if (unlikely(direction == SBUS_DMA_NONE)) ++ BUG(); ++ ++ npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK); ++ npages >>= IO_PAGE_SHIFT; ++ base = iommu->page_table + ++ ((bus_addr - MAP_BASE) >> IO_PAGE_SHIFT); ++ ++ bus_addr &= IO_PAGE_MASK; + + spin_lock_irqsave(&iommu->lock, flags); +- free_streaming_cluster(iommu, dma_base, size >> IO_PAGE_SHIFT); +- sbus_strbuf_flush(iommu, dma_base, size >> IO_PAGE_SHIFT, direction); ++ sbus_strbuf_flush(iommu, bus_addr, npages, direction); ++ for (i = 0; i < npages; i++) ++ iopte_val(base[i]) = 0UL; ++ free_npages(iommu, bus_addr - MAP_BASE, npages); + spin_unlock_irqrestore(&iommu->lock, flags); + } + + #define SG_ENT_PHYS_ADDRESS(SG) \ + (__pa(page_address((SG)->page)) + (SG)->offset) + +-static inline void fill_sg(iopte_t *iopte, struct scatterlist *sg, int nused, int nelems, unsigned long iopte_bits) ++static inline void fill_sg(iopte_t *iopte, struct scatterlist *sg, ++ int nused, int nelems, unsigned long iopte_protection) + { + struct scatterlist *dma_sg = sg; + struct scatterlist *sg_end = sg + nelems; +@@ -462,7 +376,7 @@ static inline void fill_sg(iopte_t *iopt + for (;;) { + unsigned long tmp; + +- tmp = (unsigned long) SG_ENT_PHYS_ADDRESS(sg); ++ tmp = SG_ENT_PHYS_ADDRESS(sg); + len = sg->length; + if (((tmp ^ pteval) >> IO_PAGE_SHIFT) != 0UL) { + pteval = tmp & IO_PAGE_MASK; +@@ -478,7 +392,7 @@ static inline void fill_sg(iopte_t *iopt + sg++; + } + +- pteval = ((pteval & IOPTE_PAGE) | iopte_bits); ++ pteval = iopte_protection | (pteval & IOPTE_PAGE); + while (len > 0) { + *iopte++ = __iopte(pteval); + pteval += IO_PAGE_SIZE; +@@ -509,103 +423,111 @@ static inline void fill_sg(iopte_t *iopt + } + } + +-int sbus_map_sg(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int dir) ++int sbus_map_sg(struct sbus_dev *sdev, struct scatterlist *sglist, int nelems, int direction) + { +- struct sbus_iommu *iommu = sdev->bus->iommu; +- unsigned long flags, npages; +- iopte_t *iopte; ++ struct sbus_iommu *iommu; ++ unsigned long flags, npages, iopte_protection; ++ iopte_t *base; + u32 dma_base; + struct scatterlist *sgtmp; + int used; +- unsigned long iopte_bits; +- +- if (dir == SBUS_DMA_NONE) +- BUG(); + + /* Fast path single entry scatterlists. 
*/ +- if (nents == 1) { +- sg->dma_address = ++ if (nelems == 1) { ++ sglist->dma_address = + sbus_map_single(sdev, +- (page_address(sg->page) + sg->offset), +- sg->length, dir); +- sg->dma_length = sg->length; ++ (page_address(sglist->page) + sglist->offset), ++ sglist->length, direction); ++ sglist->dma_length = sglist->length; + return 1; + } + +- npages = prepare_sg(sg, nents); ++ iommu = sdev->bus->iommu; ++ ++ if (unlikely(direction == SBUS_DMA_NONE)) ++ BUG(); ++ ++ npages = prepare_sg(sglist, nelems); + + spin_lock_irqsave(&iommu->lock, flags); +- iopte = alloc_streaming_cluster(iommu, npages); +- if (iopte == NULL) +- goto bad; +- dma_base = MAP_BASE + ((iopte - iommu->page_table) << IO_PAGE_SHIFT); ++ base = alloc_npages(iommu, npages); ++ spin_unlock_irqrestore(&iommu->lock, flags); ++ ++ if (unlikely(base == NULL)) ++ BUG(); ++ ++ dma_base = MAP_BASE + ++ ((base - iommu->page_table) << IO_PAGE_SHIFT); + + /* Normalize DVMA addresses. */ +- sgtmp = sg; +- used = nents; ++ used = nelems; + ++ sgtmp = sglist; + while (used && sgtmp->dma_length) { + sgtmp->dma_address += dma_base; + sgtmp++; + used--; + } +- used = nents - used; ++ used = nelems - used; ++ ++ iopte_protection = IOPTE_VALID | IOPTE_STBUF | IOPTE_CACHE; ++ if (direction != SBUS_DMA_TODEVICE) ++ iopte_protection |= IOPTE_WRITE; + +- iopte_bits = IOPTE_VALID | IOPTE_STBUF | IOPTE_CACHE; +- if (dir != SBUS_DMA_TODEVICE) +- iopte_bits |= IOPTE_WRITE; ++ fill_sg(base, sglist, used, nelems, iopte_protection); + +- fill_sg(iopte, sg, used, nents, iopte_bits); + #ifdef VERIFY_SG +- verify_sglist(sg, nents, iopte, npages); ++ verify_sglist(sglist, nelems, base, npages); + #endif +- spin_unlock_irqrestore(&iommu->lock, flags); + + return used; +- +-bad: +- spin_unlock_irqrestore(&iommu->lock, flags); +- BUG(); +- return 0; + } + +-void sbus_unmap_sg(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int direction) ++void sbus_unmap_sg(struct sbus_dev *sdev, struct scatterlist *sglist, int nelems, int direction) + { +- unsigned long size, flags; + struct sbus_iommu *iommu; +- u32 dvma_base; +- int i; ++ iopte_t *base; ++ unsigned long flags, i, npages; ++ u32 bus_addr; + +- /* Fast path single entry scatterlists. 
*/ +- if (nents == 1) { +- sbus_unmap_single(sdev, sg->dma_address, sg->dma_length, direction); +- return; +- } ++ if (unlikely(direction == SBUS_DMA_NONE)) ++ BUG(); + +- dvma_base = sg[0].dma_address & IO_PAGE_MASK; +- for (i = 0; i < nents; i++) { +- if (sg[i].dma_length == 0) ++ iommu = sdev->bus->iommu; ++ ++ bus_addr = sglist->dma_address & IO_PAGE_MASK; ++ ++ for (i = 1; i < nelems; i++) ++ if (sglist[i].dma_length == 0) + break; +- } + i--; +- size = IO_PAGE_ALIGN(sg[i].dma_address + sg[i].dma_length) - dvma_base; ++ npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) - ++ bus_addr) >> IO_PAGE_SHIFT; ++ ++ base = iommu->page_table + ++ ((bus_addr - MAP_BASE) >> IO_PAGE_SHIFT); + +- iommu = sdev->bus->iommu; + spin_lock_irqsave(&iommu->lock, flags); +- free_streaming_cluster(iommu, dvma_base, size >> IO_PAGE_SHIFT); +- sbus_strbuf_flush(iommu, dvma_base, size >> IO_PAGE_SHIFT, direction); ++ sbus_strbuf_flush(iommu, bus_addr, npages, direction); ++ for (i = 0; i < npages; i++) ++ iopte_val(base[i]) = 0UL; ++ free_npages(iommu, bus_addr - MAP_BASE, npages); + spin_unlock_irqrestore(&iommu->lock, flags); + } + +-void sbus_dma_sync_single_for_cpu(struct sbus_dev *sdev, dma_addr_t base, size_t size, int direction) ++void sbus_dma_sync_single_for_cpu(struct sbus_dev *sdev, dma_addr_t bus_addr, size_t sz, int direction) + { +- struct sbus_iommu *iommu = sdev->bus->iommu; +- unsigned long flags; ++ struct sbus_iommu *iommu; ++ unsigned long flags, npages; ++ ++ iommu = sdev->bus->iommu; + +- size = (IO_PAGE_ALIGN(base + size) - (base & IO_PAGE_MASK)); ++ npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK); ++ npages >>= IO_PAGE_SHIFT; ++ bus_addr &= IO_PAGE_MASK; + + spin_lock_irqsave(&iommu->lock, flags); +- sbus_strbuf_flush(iommu, base & IO_PAGE_MASK, size >> IO_PAGE_SHIFT, direction); ++ sbus_strbuf_flush(iommu, bus_addr, npages, direction); + spin_unlock_irqrestore(&iommu->lock, flags); + } + +@@ -613,23 +535,25 @@ void sbus_dma_sync_single_for_device(str + { + } + +-void sbus_dma_sync_sg_for_cpu(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int direction) ++void sbus_dma_sync_sg_for_cpu(struct sbus_dev *sdev, struct scatterlist *sglist, int nelems, int direction) + { +- struct sbus_iommu *iommu = sdev->bus->iommu; +- unsigned long flags, size; +- u32 base; +- int i; ++ struct sbus_iommu *iommu; ++ unsigned long flags, npages, i; ++ u32 bus_addr; ++ ++ iommu = sdev->bus->iommu; + +- base = sg[0].dma_address & IO_PAGE_MASK; +- for (i = 0; i < nents; i++) { +- if (sg[i].dma_length == 0) ++ bus_addr = sglist[0].dma_address & IO_PAGE_MASK; ++ for (i = 0; i < nelems; i++) { ++ if (!sglist[i].dma_length) + break; + } + i--; +- size = IO_PAGE_ALIGN(sg[i].dma_address + sg[i].dma_length) - base; ++ npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) ++ - bus_addr) >> IO_PAGE_SHIFT; + + spin_lock_irqsave(&iommu->lock, flags); +- sbus_strbuf_flush(iommu, base, size >> IO_PAGE_SHIFT, direction); ++ sbus_strbuf_flush(iommu, bus_addr, npages, direction); + spin_unlock_irqrestore(&iommu->lock, flags); + } + +@@ -1104,7 +1028,7 @@ static void __init sbus_iommu_init(int _ + struct linux_prom64_registers *pr; + struct device_node *dp; + struct sbus_iommu *iommu; +- unsigned long regs, tsb_base; ++ unsigned long regs; + u64 control; + int i; + +@@ -1132,14 +1056,6 @@ static void __init sbus_iommu_init(int _ + + memset(iommu, 0, sizeof(*iommu)); + +- /* We start with no consistent mappings. 
*/ +- iommu->lowest_consistent_map = CLUSTER_NPAGES; +- +- for (i = 0; i < NCLUSTERS; i++) { +- iommu->alloc_info[i].flush = 0; +- iommu->alloc_info[i].next = 0; +- } +- + /* Setup spinlock. */ + spin_lock_init(&iommu->lock); + +@@ -1159,25 +1075,13 @@ static void __init sbus_iommu_init(int _ + sbus->portid, regs); + + /* Setup for TSB_SIZE=7, TBW_SIZE=0, MMU_DE=1, MMU_EN=1 */ ++ sbus_iommu_table_init(iommu, IO_TSB_SIZE); ++ + control = upa_readq(iommu->iommu_regs + IOMMU_CONTROL); + control = ((7UL << 16UL) | + (0UL << 2UL) | + (1UL << 1UL) | + (1UL << 0UL)); +- +- /* Using the above configuration we need 1MB iommu page +- * table (128K ioptes * 8 bytes per iopte). This is +- * page order 7 on UltraSparc. +- */ +- tsb_base = __get_free_pages(GFP_ATOMIC, get_order(IO_TSB_SIZE)); +- if (tsb_base == 0UL) { +- prom_printf("sbus_iommu_init: Fatal error, cannot alloc TSB table.\n"); +- prom_halt(); +- } +- +- iommu->page_table = (iopte_t *) tsb_base; +- memset(iommu->page_table, 0, IO_TSB_SIZE); +- + upa_writeq(control, iommu->iommu_regs + IOMMU_CONTROL); + + /* Clean out any cruft in the IOMMU using +@@ -1195,7 +1099,7 @@ static void __init sbus_iommu_init(int _ + upa_readq(iommu->sbus_control_reg); + + /* Give the TSB to SYSIO. */ +- upa_writeq(__pa(tsb_base), iommu->iommu_regs + IOMMU_TSBBASE); ++ upa_writeq(__pa(iommu->page_table), iommu->iommu_regs + IOMMU_TSBBASE); + + /* Setup streaming buffer, DE=1 SB_EN=1 */ + control = (1UL << 1UL) | (1UL << 0UL); diff --git a/queue-2.6.20/hid-zeroing-of-bytes-in-output-fields-is-bogus.patch b/queue-2.6.20/hid-zeroing-of-bytes-in-output-fields-is-bogus.patch new file mode 100644 index 00000000000..00399e37171 --- /dev/null +++ b/queue-2.6.20/hid-zeroing-of-bytes-in-output-fields-is-bogus.patch @@ -0,0 +1,48 @@ +From stable-bounces@linux.kernel.org Sun Apr 15 13:24:49 2007 +From: Jiri Kosina +Date: Sun, 15 Apr 2007 22:30:15 +0200 (CEST) +Subject: HID: zeroing of bytes in output fields is bogus +To: stable@kernel.org +Message-ID: + +From: Jiri Kosina + +HID: zeroing of bytes in output fields is bogus + +This patch removes bogus zeroing of unused bits in output reports, +introduced in Simon's patch in commit d4ae650a. +According to the specification, any sane device should not care +about values of unused bits. + +What is worse, the zeroing is done in a way which is broken and +might clear certain bits in output reports which are actually +_used_ - a device that has multiple fields with one value of +the size 1 bit each might serve as an example of why this is +bogus - the second call of hid_output_report() would clear the +first bit of report, which has already been set up previously. + +This patch will break LEDs on SpaceNavigator, because this device +is broken and takes into account the bits which it shouldn't touch. +The quirk for this particular device will be provided in a separate +patch. 
+ +Signed-off-by: Jiri Kosina +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/hid/hid-core.c | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/drivers/hid/hid-core.c ++++ b/drivers/hid/hid-core.c +@@ -876,10 +876,6 @@ static void hid_output_field(struct hid_ + unsigned size = field->report_size; + unsigned n; + +- /* make sure the unused bits in the last byte are zeros */ +- if (count > 0 && size > 0) +- data[(count*size-1)/8] = 0; +- + for (n = 0; n < count; n++) { + if (field->logical_minimum < 0) /* signed values */ + implement(data, offset + n * size, size, s32ton(field->value[n], size)); diff --git a/queue-2.6.20/holepunch-fix-disconnected-pages-after-second-truncate.patch b/queue-2.6.20/holepunch-fix-disconnected-pages-after-second-truncate.patch new file mode 100644 index 00000000000..b3403199acf --- /dev/null +++ b/queue-2.6.20/holepunch-fix-disconnected-pages-after-second-truncate.patch @@ -0,0 +1,51 @@ +From hugh_dickins@symantec.com Fri Apr 13 10:27:15 2007 +From: Hugh Dickins +Date: Fri, 13 Apr 2007 18:27:10 +0100 (BST) +Subject: [PATCH 3/4] holepunch: fix disconnected pages after second truncate +To: Greg KH , Adrian Bunk +Cc: Miklos Szeredi , stable@kernel.org +Message-ID: + +From: Hugh Dickins + +shmem_truncate_range has its own truncate_inode_pages_range, to free any +pages racily instantiated while it was in progress: a SHMEM_PAGEIN flag +is set when this might have happened. But holepunching gets no chance +to clear that flag at the start of vmtruncate_range, so it's always set +(unless a truncate came just before), so holepunch almost always does +this second truncate_inode_pages_range. + +shmem holepunch has unlikely swap<->file races hereabouts whatever we do +(without a fuller rework than is fit for this release): I was going to +skip the second truncate in the punch_hole case, but Miklos points out +that would make holepunch correctness more vulnerable to swapoff. So +keep the second truncate, but follow it by an unmap_mapping_range to +eliminate the disconnected pages (freed from pagecache while still +mapped in userspace) that it might have left behind. + +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman + +--- + mm/shmem.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -674,8 +674,16 @@ done2: + * generic_delete_inode did it, before we lowered next_index. + * Also, though shmem_getpage checks i_size before adding to + * cache, no recheck after: so fix the narrow window there too. ++ * ++ * Recalling truncate_inode_pages_range and unmap_mapping_range ++ * every time for punch_hole (which never got a chance to clear ++ * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive, ++ * yet hardly ever necessary: try to optimize them out later. 
+ */ + truncate_inode_pages_range(inode->i_mapping, start, end); ++ if (punch_hole) ++ unmap_mapping_range(inode->i_mapping, start, ++ end - start, 1); + } + + spin_lock(&info->lock); diff --git a/queue-2.6.20/holepunch-fix-mmap_sem-i_mutex-deadlock.patch b/queue-2.6.20/holepunch-fix-mmap_sem-i_mutex-deadlock.patch new file mode 100644 index 00000000000..115dd0cf7a2 --- /dev/null +++ b/queue-2.6.20/holepunch-fix-mmap_sem-i_mutex-deadlock.patch @@ -0,0 +1,74 @@ +From hugh_dickins@symantec.com Fri Apr 13 10:28:00 2007 +From: Hugh Dickins +Date: Fri, 13 Apr 2007 18:27:55 +0100 (BST) +Subject: holepunch: fix mmap_sem i_mutex deadlock +To: Greg KH , Adrian Bunk +Cc: Miklos Szeredi , stable@kernel.org +Message-ID: + +From: Hugh Dickins + +sys_madvise has down_write of mmap_sem, then madvise_remove calls +vmtruncate_range which takes i_mutex and i_alloc_sem: no, we can +easily devise deadlocks from that ordering. + +madvise_remove drop mmap_sem while calling vmtruncate_range: luckily, +since madvise_remove doesn't split or merge vmas, it's easy to handle +this case with a NULL prev, without restructuring sys_madvise. (Though +sad to retake mmap_sem when it's unlikely to be needed, and certainly +down_read is sufficient for MADV_REMOVE, unlike the other madvices.) + +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman + +--- + mm/madvise.c | 19 ++++++++++++++----- + 1 file changed, 14 insertions(+), 5 deletions(-) + +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -159,9 +159,10 @@ static long madvise_remove(struct vm_are + unsigned long start, unsigned long end) + { + struct address_space *mapping; +- loff_t offset, endoff; ++ loff_t offset, endoff; ++ int error; + +- *prev = vma; ++ *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; +@@ -180,7 +181,12 @@ static long madvise_remove(struct vm_are + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); +- return vmtruncate_range(mapping->host, offset, endoff); ++ ++ /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ ++ up_write(¤t->mm->mmap_sem); ++ error = vmtruncate_range(mapping->host, offset, endoff); ++ down_write(¤t->mm->mmap_sem); ++ return error; + } + + static long +@@ -315,12 +321,15 @@ asmlinkage long sys_madvise(unsigned lon + if (error) + goto out; + start = tmp; +- if (start < prev->vm_end) ++ if (prev && start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + goto out; +- vma = prev->vm_next; ++ if (prev) ++ vma = prev->vm_next; ++ else /* madvise_remove dropped mmap_sem */ ++ vma = find_vma(current->mm, start); + } + out: + up_write(¤t->mm->mmap_sem); diff --git a/queue-2.6.20/holepunch-fix-shmem_truncate_range-punch-locking.patch b/queue-2.6.20/holepunch-fix-shmem_truncate_range-punch-locking.patch new file mode 100644 index 00000000000..c74f4c725f8 --- /dev/null +++ b/queue-2.6.20/holepunch-fix-shmem_truncate_range-punch-locking.patch @@ -0,0 +1,225 @@ +From hugh_dickins@symantec.com Fri Apr 13 10:26:22 2007 +From: Hugh Dickins +Date: Fri, 13 Apr 2007 18:26:13 +0100 (BST) +Subject: [PATCH 2/4] holepunch: fix shmem_truncate_range punch locking +To: Greg KH , Adrian Bunk +Cc: Miklos Szeredi , stable@kernel.org +Message-ID: + +From: Hugh Dickins + +Miklos Szeredi observes that during truncation of shmem page directories, +info->lock is released to improve latency (after lowering i_size and +next_index to exclude races); but 
this is quite wrong for holepunching, +which receives no such protection from i_size or next_index, and is left +vulnerable to races with shmem_unuse, shmem_getpage and shmem_writepage. + +Hold info->lock throughout when holepunching? No, any user could prevent +rescheduling for far too long. Instead take info->lock just when needed: +in shmem_free_swp when removing the swap entries, and whenever removing +a directory page from the level above. But so long as we remove before +scanning, we can safely skip taking the lock at the lower levels, except +at misaligned start and end of the hole. + +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman + +--- + mm/shmem.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++--------------- + 1 file changed, 73 insertions(+), 23 deletions(-) + +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -402,26 +402,38 @@ static swp_entry_t *shmem_swp_alloc(stru + /* + * shmem_free_swp - free some swap entries in a directory + * +- * @dir: pointer to the directory +- * @edir: pointer after last entry of the directory ++ * @dir: pointer to the directory ++ * @edir: pointer after last entry of the directory ++ * @punch_lock: pointer to spinlock when needed for the holepunch case + */ +-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir) ++static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, ++ spinlock_t *punch_lock) + { ++ spinlock_t *punch_unlock = NULL; + swp_entry_t *ptr; + int freed = 0; + + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val) { ++ if (unlikely(punch_lock)) { ++ punch_unlock = punch_lock; ++ punch_lock = NULL; ++ spin_lock(punch_unlock); ++ if (!ptr->val) ++ continue; ++ } + free_swap_and_cache(*ptr); + *ptr = (swp_entry_t){0}; + freed++; + } + } ++ if (punch_unlock) ++ spin_unlock(punch_unlock); + return freed; + } + +-static int shmem_map_and_free_swp(struct page *subdir, +- int offset, int limit, struct page ***dir) ++static int shmem_map_and_free_swp(struct page *subdir, int offset, ++ int limit, struct page ***dir, spinlock_t *punch_lock) + { + swp_entry_t *ptr; + int freed = 0; +@@ -431,7 +443,8 @@ static int shmem_map_and_free_swp(struct + int size = limit - offset; + if (size > LATENCY_LIMIT) + size = LATENCY_LIMIT; +- freed += shmem_free_swp(ptr+offset, ptr+offset+size); ++ freed += shmem_free_swp(ptr+offset, ptr+offset+size, ++ punch_lock); + if (need_resched()) { + shmem_swp_unmap(ptr); + if (*dir) { +@@ -482,6 +495,8 @@ static void shmem_truncate_range(struct + int offset; + int freed; + int punch_hole; ++ spinlock_t *needs_lock; ++ spinlock_t *punch_lock; + unsigned long upper_limit; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; +@@ -495,6 +510,7 @@ static void shmem_truncate_range(struct + limit = info->next_index; + upper_limit = SHMEM_MAX_INDEX; + info->next_index = idx; ++ needs_lock = NULL; + punch_hole = 0; + } else { + if (end + 1 >= inode->i_size) { /* we may free a little more */ +@@ -505,6 +521,7 @@ static void shmem_truncate_range(struct + limit = (end + 1) >> PAGE_CACHE_SHIFT; + upper_limit = limit; + } ++ needs_lock = &info->lock; + punch_hole = 1; + } + +@@ -521,7 +538,7 @@ static void shmem_truncate_range(struct + size = limit; + if (size > SHMEM_NR_DIRECT) + size = SHMEM_NR_DIRECT; +- nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); ++ nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); + } + + /* +@@ -531,6 +548,19 @@ static void shmem_truncate_range(struct + if (!topdir || limit <= SHMEM_NR_DIRECT) + goto done2; + ++ /* ++ * The truncation case has already dropped 
info->lock, and we're safe ++ * because i_size and next_index have already been lowered, preventing ++ * access beyond. But in the punch_hole case, we still need to take ++ * the lock when updating the swap directory, because there might be ++ * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or ++ * shmem_writepage. However, whenever we find we can remove a whole ++ * directory page (not at the misaligned start or end of the range), ++ * we first NULLify its pointer in the level above, and then have no ++ * need to take the lock when updating its contents: needs_lock and ++ * punch_lock (either pointing to info->lock or NULL) manage this. ++ */ ++ + upper_limit -= SHMEM_NR_DIRECT; + limit -= SHMEM_NR_DIRECT; + idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; +@@ -552,7 +582,13 @@ static void shmem_truncate_range(struct + diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % + ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; + if (!diroff && !offset && upper_limit >= stage) { +- *dir = NULL; ++ if (needs_lock) { ++ spin_lock(needs_lock); ++ *dir = NULL; ++ spin_unlock(needs_lock); ++ needs_lock = NULL; ++ } else ++ *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); + } +@@ -578,8 +614,16 @@ static void shmem_truncate_range(struct + } + stage = idx + ENTRIES_PER_PAGEPAGE; + middir = *dir; ++ if (punch_hole) ++ needs_lock = &info->lock; + if (upper_limit >= stage) { +- *dir = NULL; ++ if (needs_lock) { ++ spin_lock(needs_lock); ++ *dir = NULL; ++ spin_unlock(needs_lock); ++ needs_lock = NULL; ++ } else ++ *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); + } +@@ -588,31 +632,37 @@ static void shmem_truncate_range(struct + dir = shmem_dir_map(middir); + diroff = 0; + } ++ punch_lock = needs_lock; + subdir = dir[diroff]; +- if (subdir && page_private(subdir)) { ++ if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { ++ if (needs_lock) { ++ spin_lock(needs_lock); ++ dir[diroff] = NULL; ++ spin_unlock(needs_lock); ++ punch_lock = NULL; ++ } else ++ dir[diroff] = NULL; ++ nr_pages_to_free++; ++ list_add(&subdir->lru, &pages_to_free); ++ } ++ if (subdir && page_private(subdir) /* has swap entries */) { + size = limit - idx; + if (size > ENTRIES_PER_PAGE) + size = ENTRIES_PER_PAGE; + freed = shmem_map_and_free_swp(subdir, +- offset, size, &dir); ++ offset, size, &dir, punch_lock); + if (!dir) + dir = shmem_dir_map(middir); + nr_swaps_freed += freed; +- if (offset) ++ if (offset || punch_lock) { + spin_lock(&info->lock); +- set_page_private(subdir, page_private(subdir) - freed); +- if (offset) ++ set_page_private(subdir, ++ page_private(subdir) - freed); + spin_unlock(&info->lock); +- if (!punch_hole) +- BUG_ON(page_private(subdir) > offset); +- } +- if (offset) +- offset = 0; +- else if (subdir && upper_limit - idx >= ENTRIES_PER_PAGE) { +- dir[diroff] = NULL; +- nr_pages_to_free++; +- list_add(&subdir->lru, &pages_to_free); ++ } else ++ BUG_ON(page_private(subdir) != freed); + } ++ offset = 0; + } + done1: + shmem_dir_unmap(dir); diff --git a/queue-2.6.20/holepunch-fix-shmem_truncate_range-punching-too-far.patch b/queue-2.6.20/holepunch-fix-shmem_truncate_range-punching-too-far.patch new file mode 100644 index 00000000000..500fc047d8f --- /dev/null +++ b/queue-2.6.20/holepunch-fix-shmem_truncate_range-punching-too-far.patch @@ -0,0 +1,112 @@ +From hugh_dickins@symantec.com Fri Apr 13 10:25:06 2007 +From: Hugh Dickins +Date: Fri, 13 Apr 2007 18:25:00 +0100 (BST) +Subject: holepunch: fix shmem_truncate_range punching too far +To: 
Greg KH , Adrian Bunk +Cc: Miklos Szeredi , stable@kernel.org +Message-ID: + +From: Hugh Dickins + +Miklos Szeredi observes BUG_ON(!entry) in shmem_writepage() triggered +in rare circumstances, because shmem_truncate_range() erroneously +removes partially truncated directory pages at the end of the range: +later reclaim on pages pointing to these removed directories triggers +the BUG. Indeed, and it can also cause data loss beyond the hole. + +Fix this as in the patch proposed by Miklos, but distinguish between +"limit" (how far we need to search: ignore truncation's next_index +optimization in the holepunch case - if there are races it's more +consistent to act on the whole range specified) and "upper_limit" +(how far we can free directory pages: generally we must be careful +to keep partially punched pages, but can relax at end of file - +i_size being held stable by i_mutex). + +Signed-off-by: Hugh Dickins +Signed-off-by: Greg Kroah-Hartman + + +--- + mm/shmem.c | 32 +++++++++++++++++++++----------- + 1 file changed, 21 insertions(+), 11 deletions(-) + +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -481,7 +481,8 @@ static void shmem_truncate_range(struct + long nr_swaps_freed = 0; + int offset; + int freed; +- int punch_hole = 0; ++ int punch_hole; ++ unsigned long upper_limit; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +@@ -492,11 +493,18 @@ static void shmem_truncate_range(struct + info->flags |= SHMEM_TRUNCATE; + if (likely(end == (loff_t) -1)) { + limit = info->next_index; ++ upper_limit = SHMEM_MAX_INDEX; + info->next_index = idx; ++ punch_hole = 0; + } else { +- limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +- if (limit > info->next_index) +- limit = info->next_index; ++ if (end + 1 >= inode->i_size) { /* we may free a little more */ ++ limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> ++ PAGE_CACHE_SHIFT; ++ upper_limit = SHMEM_MAX_INDEX; ++ } else { ++ limit = (end + 1) >> PAGE_CACHE_SHIFT; ++ upper_limit = limit; ++ } + punch_hole = 1; + } + +@@ -520,10 +528,10 @@ static void shmem_truncate_range(struct + * If there are no indirect blocks or we are punching a hole + * below indirect blocks, nothing to be done. + */ +- if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT))) ++ if (!topdir || limit <= SHMEM_NR_DIRECT) + goto done2; + +- BUG_ON(limit <= SHMEM_NR_DIRECT); ++ upper_limit -= SHMEM_NR_DIRECT; + limit -= SHMEM_NR_DIRECT; + idx = (idx > SHMEM_NR_DIRECT)? 
(idx - SHMEM_NR_DIRECT): 0; + offset = idx % ENTRIES_PER_PAGE; +@@ -543,7 +551,7 @@ static void shmem_truncate_range(struct + if (*dir) { + diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % + ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; +- if (!diroff && !offset) { ++ if (!diroff && !offset && upper_limit >= stage) { + *dir = NULL; + nr_pages_to_free++; + list_add(&middir->lru, &pages_to_free); +@@ -570,9 +578,11 @@ static void shmem_truncate_range(struct + } + stage = idx + ENTRIES_PER_PAGEPAGE; + middir = *dir; +- *dir = NULL; +- nr_pages_to_free++; +- list_add(&middir->lru, &pages_to_free); ++ if (upper_limit >= stage) { ++ *dir = NULL; ++ nr_pages_to_free++; ++ list_add(&middir->lru, &pages_to_free); ++ } + shmem_dir_unmap(dir); + cond_resched(); + dir = shmem_dir_map(middir); +@@ -598,7 +608,7 @@ static void shmem_truncate_range(struct + } + if (offset) + offset = 0; +- else if (subdir && !page_private(subdir)) { ++ else if (subdir && upper_limit - idx >= ENTRIES_PER_PAGE) { + dir[diroff] = NULL; + nr_pages_to_free++; + list_add(&subdir->lru, &pages_to_free); diff --git a/queue-2.6.20/ib-mthca-fix-data-corruption-after-fmr-unmap-on-sinai.patch b/queue-2.6.20/ib-mthca-fix-data-corruption-after-fmr-unmap-on-sinai.patch new file mode 100644 index 00000000000..67f1ccab2c8 --- /dev/null +++ b/queue-2.6.20/ib-mthca-fix-data-corruption-after-fmr-unmap-on-sinai.patch @@ -0,0 +1,42 @@ +From stable-bounces@linux.kernel.org Mon Apr 16 14:19:25 2007 +From: Roland Dreier +Date: Mon, 16 Apr 2007 14:17:42 -0700 +Subject: IB/mthca: Fix data corruption after FMR unmap on Sinai +To: stable@kernel.org +Cc: mst@mellanox.co.il, general@lists.openfabrics.org +Message-ID: + +From: Michael S. Tsirkin + +In mthca_arbel_fmr_unmap(), the high bits of the key are masked off. +This gets rid of the effect of adjust_key(), which makes sure that +bits 3 and 23 of the key are equal when the Sinai throughput +optimization is enabled, and so it may happen that an FMR will end up +with bits 3 and 23 in the key being different. This causes data +corruption, because when enabling the throughput optimization, the +driver promises the HCA firmware that bits 3 and 23 of all memory keys +will always be equal. + +Fix by re-applying adjust_key() after masking the key. + +Thanks to Or Gerlitz for reproducing the problem, and Ariel Shahar for +help in debug. + +Signed-off-by: Michael S. 
Tsirkin +Signed-off-by: Roland Dreier +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/mthca/mthca_mr.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/infiniband/hw/mthca/mthca_mr.c ++++ b/drivers/infiniband/hw/mthca/mthca_mr.c +@@ -751,6 +751,7 @@ void mthca_arbel_fmr_unmap(struct mthca_ + + key = arbel_key_to_hw_index(fmr->ibmr.lkey); + key &= dev->limits.num_mpts - 1; ++ key = adjust_key(dev, key); + fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key); + + fmr->maps = 0; diff --git a/queue-2.6.20/knfsd-use-a-spinlock-to-protect-sk_info_authunix.patch b/queue-2.6.20/knfsd-use-a-spinlock-to-protect-sk_info_authunix.patch new file mode 100644 index 00000000000..24776186bd7 --- /dev/null +++ b/queue-2.6.20/knfsd-use-a-spinlock-to-protect-sk_info_authunix.patch @@ -0,0 +1,76 @@ +From stable-bounces@linux.kernel.org Mon Apr 16 19:03:16 2007 +From: NeilBrown +Date: Tue, 17 Apr 2007 12:01:41 +1000 +Subject: knfsd: Use a spinlock to protect sk_info_authunix +To: Andrew Morton +Cc: stable@kernel.org, Gabriel Barazer , nfs@lists.sourceforge.net, linux-kernel@vger.kernel.org, Greg Banks +Message-ID: <1070417020141.28483@suse.de> + +From: NeilBrown + +sk_info_authunix is not being protected properly so the object that +it points to can be cache_put twice, leading to corruption. + +We borrow svsk->sk_defer_lock to provide the protection. We should probably +rename that lock to have a more generic name - later. + +Thanks to Gabriel for reporting this. + +Cc: Greg Banks +Cc: Gabriel Barazer +Signed-off-by: Neil Brown +Signed-off-by: Greg Kroah-Hartman + +--- + net/sunrpc/svcauth_unix.c | 21 ++++++++++++++++----- + 1 file changed, 16 insertions(+), 5 deletions(-) + +--- a/net/sunrpc/svcauth_unix.c ++++ b/net/sunrpc/svcauth_unix.c +@@ -383,7 +383,10 @@ void svcauth_unix_purge(void) + static inline struct ip_map * + ip_map_cached_get(struct svc_rqst *rqstp) + { +- struct ip_map *ipm = rqstp->rq_sock->sk_info_authunix; ++ struct ip_map *ipm; ++ struct svc_sock *svsk = rqstp->rq_sock; ++ spin_lock_bh(&svsk->sk_defer_lock); ++ ipm = svsk->sk_info_authunix; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* +@@ -391,12 +394,14 @@ ip_map_cached_get(struct svc_rqst *rqstp + * remembered, e.g. by a second mount from the + * same IP address. 
+ */ +- rqstp->rq_sock->sk_info_authunix = NULL; ++ svsk->sk_info_authunix = NULL; ++ spin_unlock_bh(&svsk->sk_defer_lock); + cache_put(&ipm->h, &ip_map_cache); + return NULL; + } + cache_get(&ipm->h); + } ++ spin_unlock_bh(&svsk->sk_defer_lock); + return ipm; + } + +@@ -405,9 +410,15 @@ ip_map_cached_put(struct svc_rqst *rqstp + { + struct svc_sock *svsk = rqstp->rq_sock; + +- if (svsk->sk_sock->type == SOCK_STREAM && svsk->sk_info_authunix == NULL) +- svsk->sk_info_authunix = ipm; /* newly cached, keep the reference */ +- else ++ spin_lock_bh(&svsk->sk_defer_lock); ++ if (svsk->sk_sock->type == SOCK_STREAM && ++ svsk->sk_info_authunix == NULL) { ++ /* newly cached, keep the reference */ ++ svsk->sk_info_authunix = ipm; ++ ipm = NULL; ++ } ++ spin_unlock_bh(&svsk->sk_defer_lock); ++ if (ipm) + cache_put(&ipm->h, &ip_map_cache); + } + diff --git a/queue-2.6.20/kvm-mmu-fix-guest-writes-to-nonpae-pde.patch b/queue-2.6.20/kvm-mmu-fix-guest-writes-to-nonpae-pde.patch new file mode 100644 index 00000000000..a4ff23e7d43 --- /dev/null +++ b/queue-2.6.20/kvm-mmu-fix-guest-writes-to-nonpae-pde.patch @@ -0,0 +1,115 @@ +From stable-bounces@linux.kernel.org Sun Apr 22 02:29:31 2007 +From: Avi Kivity +Date: Sun, 22 Apr 2007 12:28:05 +0300 +Subject: KVM: MMU: Fix guest writes to nonpae pde +To: stable@kernel.org +Cc: kvm-devel@lists.sourceforge.net, linux-kernel@vger.kernel.org, Avi Kivity +Message-ID: <11772340852200-git-send-email-avi@qumranet.com> + +From: Avi Kivity + +KVM shadow page tables are always in pae mode, regardless of the guest +setting. This means that a guest pde (mapping 4MB of memory) is mapped +to two shadow pdes (mapping 2MB each). + +When the guest writes to a pte or pde, we intercept the write and emulate it. +We also remove any shadowed mappings corresponding to the write. Since the +mmu did not account for the doubling in the number of pdes, it removed the +wrong entry, resulting in a mismatch between shadow page tables and guest +page tables, followed shortly by guest memory corruption. + +This patch fixes the problem by detecting the special case of writing to +a non-pae pde and adjusting the address and number of shadow pdes zapped +accordingly. 
+ +Acked-by: Ingo Molnar +Signed-off-by: Avi Kivity +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/kvm/mmu.c | 47 +++++++++++++++++++++++++++++++++++------------ + 1 file changed, 35 insertions(+), 12 deletions(-) + +--- a/drivers/kvm/mmu.c ++++ b/drivers/kvm/mmu.c +@@ -1093,22 +1093,40 @@ out: + return r; + } + ++static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, ++ struct kvm_mmu_page *page, ++ u64 *spte) ++{ ++ u64 pte; ++ struct kvm_mmu_page *child; ++ ++ pte = *spte; ++ if (is_present_pte(pte)) { ++ if (page->role.level == PT_PAGE_TABLE_LEVEL) ++ rmap_remove(vcpu, spte); ++ else { ++ child = page_header(pte & PT64_BASE_ADDR_MASK); ++ mmu_page_remove_parent_pte(vcpu, child, spte); ++ } ++ } ++ *spte = 0; ++} ++ + void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) + { + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_mmu_page *page; +- struct kvm_mmu_page *child; + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; + u64 *spte; +- u64 pte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + int level; + int flooded = 0; ++ int npte; + + pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + if (gfn == vcpu->last_pt_write_gfn) { +@@ -1144,22 +1162,27 @@ void kvm_mmu_pre_write(struct kvm_vcpu * + } + page_offset = offset; + level = page->role.level; ++ npte = 1; + if (page->role.glevels == PT32_ROOT_LEVEL) { +- page_offset <<= 1; /* 32->64 */ ++ page_offset <<= 1; /* 32->64 */ ++ /* ++ * A 32-bit pde maps 4MB while the shadow pdes map ++ * only 2MB. So we need to double the offset again ++ * and zap two pdes instead of one. ++ */ ++ if (level == PT32_ROOT_LEVEL) { ++ page_offset &= ~7; /* kill rounding error */ ++ page_offset <<= 1; ++ npte = 2; ++ } + page_offset &= ~PAGE_MASK; + } + spte = __va(page->page_hpa); + spte += page_offset / sizeof(*spte); +- pte = *spte; +- if (is_present_pte(pte)) { +- if (level == PT_PAGE_TABLE_LEVEL) +- rmap_remove(vcpu, spte); +- else { +- child = page_header(pte & PT64_BASE_ADDR_MASK); +- mmu_page_remove_parent_pte(vcpu, child, spte); +- } ++ while (npte--) { ++ mmu_pre_write_zap_pte(vcpu, page, spte); ++ ++spte; + } +- *spte = 0; + } + } + diff --git a/queue-2.6.20/kvm-mmu-fix-host-memory-corruption-on-i386-with-4gb-ram.patch b/queue-2.6.20/kvm-mmu-fix-host-memory-corruption-on-i386-with-4gb-ram.patch new file mode 100644 index 00000000000..9b0cb2073b1 --- /dev/null +++ b/queue-2.6.20/kvm-mmu-fix-host-memory-corruption-on-i386-with-4gb-ram.patch @@ -0,0 +1,48 @@ +From stable-bounces@linux.kernel.org Sun Apr 22 02:30:01 2007 +From: Avi Kivity +Date: Sun, 22 Apr 2007 12:28:49 +0300 +Subject: KVM: MMU: Fix host memory corruption on i386 with >= 4GB ram +To: stable@kernel.org +Cc: kvm-devel@lists.sourceforge.net, linux-kernel@vger.kernel.org, Avi Kivity +Message-ID: <11772341294121-git-send-email-avi@qumranet.com> + +From: Avi Kivity + +PAGE_MASK is an unsigned long, so using it to mask physical addresses on +i386 (which are 64-bit wide) leads to truncation. This can result in +page->private of unrelated memory pages being modified, with disasterous +results. + +Fix by not using PAGE_MASK for physical addresses; instead calculate +the correct value directly from PAGE_SIZE. Also fix a similar BUG_ON(). 
+ +Acked-by: Ingo Molnar +Signed-off-by: Avi Kivity +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/kvm/mmu.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/kvm/mmu.c ++++ b/drivers/kvm/mmu.c +@@ -131,7 +131,7 @@ static int dbg = 1; + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK) ++#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) + #define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +@@ -406,8 +406,8 @@ static void rmap_write_protect(struct kv + spte = desc->shadow_ptes[0]; + } + BUG_ON(!spte); +- BUG_ON((*spte & PT64_BASE_ADDR_MASK) != +- page_to_pfn(page) << PAGE_SHIFT); ++ BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT ++ != page_to_pfn(page)); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON(!(*spte & PT_WRITABLE_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); diff --git a/queue-2.6.20/series b/queue-2.6.20/series new file mode 100644 index 00000000000..27177a7f199 --- /dev/null +++ b/queue-2.6.20/series @@ -0,0 +1,10 @@ +knfsd-use-a-spinlock-to-protect-sk_info_authunix.patch +ib-mthca-fix-data-corruption-after-fmr-unmap-on-sinai.patch +hid-zeroing-of-bytes-in-output-fields-is-bogus.patch +kvm-mmu-fix-guest-writes-to-nonpae-pde.patch +kvm-mmu-fix-host-memory-corruption-on-i386-with-4gb-ram.patch +holepunch-fix-shmem_truncate_range-punching-too-far.patch +holepunch-fix-shmem_truncate_range-punch-locking.patch +holepunch-fix-disconnected-pages-after-second-truncate.patch +holepunch-fix-mmap_sem-i_mutex-deadlock.patch +fix-sparc64-sbus-iommu-allocator.patch
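
As an illustrative aside (not part of any patch in this queue), the allocator strategy described in the sparc64 SBUS commit message above -- scan a bitmap for a free run starting at a rotating hint, and on the first wrap-around do one full IOMMU flush before rescanning -- can be sketched in a few lines of standalone C. Everything below is hypothetical (struct arena, arena_alloc, fake_flush); the real kernel code uses find_next_zero_bit(), test_bit()/__set_bit() on a bitmap and __iommu_flushall() as the flush callback.

/*
 * Userspace sketch of the two-pass arena allocator pattern used by
 * sbus_arena_alloc(): search from the hint, and if the search hits the
 * end of the arena, flush once and retry from slot 0 up to the old hint.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct arena {
    unsigned char *map;        /* one byte per page slot, 0 = free   */
    unsigned long hint;        /* where the next search starts       */
    unsigned long limit;       /* number of slots in the arena       */
    void (*flushall)(void *);  /* stand-in for the full-IOMMU flush  */
    void *flush_arg;
};

/* Return the first slot of a free run of npages, or -1 on failure. */
static long arena_alloc(struct arena *a, unsigned long npages)
{
    unsigned long start = a->hint, limit = a->limit, n, i;
    int pass = 0;

again:
    for (n = start; n + npages <= limit; n++) {
        for (i = 0; i < npages; i++)
            if (a->map[n + i])
                break;
        if (i == npages) {              /* found a free run */
            memset(a->map + n, 1, npages);
            a->hint = n + npages;
            return (long)n;
        }
        n += i;                         /* loop's n++ then steps past the used slot */
    }
    if (pass++ == 0) {                  /* wrap once, after one full flush */
        limit = start;
        start = 0;
        a->flushall(a->flush_arg);
        goto again;
    }
    return -1;                          /* scanned the whole arena, give up */
}

static void arena_free(struct arena *a, unsigned long base, unsigned long npages)
{
    memset(a->map + base, 0, npages);
}

static void fake_flush(void *arg)
{
    (void)arg;
    printf("full IOMMU flush\n");
}

int main(void)
{
    struct arena a = { calloc(64, 1), 0, 64, fake_flush, NULL };

    printf("alloc 4  -> %ld\n", arena_alloc(&a, 4));   /* 0  */
    printf("alloc 8  -> %ld\n", arena_alloc(&a, 8));   /* 4  */
    arena_free(&a, 0, 4);
    printf("alloc 60 -> %ld\n", arena_alloc(&a, 60));  /* wraps, flushes once, fails: -1 */
    free(a.map);
    return 0;
}

Built with any C compiler, the last allocation wraps, triggers the single flush and then fails, matching the two-pass give-up behaviour of sbus_arena_alloc() in the patch above.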