2 * Compressed RAM based swap device
4 * Copyright (C) 2008, 2009, 2010 Nitin Gupta
6 * This code is released using a dual license strategy: BSD/GPL
7 * You can choose the licence that better fits your requirements.
9 * Released under the terms of 3-clause BSD License
10 * Released under the terms of GNU General Public License Version 2.0
12 * Project home: http://compcache.googlecode.com
15 #define KMSG_COMPONENT "ramzswap"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bitops.h>
21 #include <linux/blkdev.h>
22 #include <linux/buffer_head.h>
23 #include <linux/device.h>
24 #include <linux/genhd.h>
25 #include <linux/highmem.h>
26 #include <linux/slab.h>
27 #include <linux/lzo.h>
28 #include <linux/string.h>
29 #include <linux/swap.h>
30 #include <linux/swapops.h>
31 #include <linux/vmalloc.h>
33 #include "ramzswap_drv.h"
36 static int ramzswap_major
;
37 static struct ramzswap
*devices
;
40 * Pages that compress to larger than this size are
41 * forwarded to backing swap, if present or stored
42 * uncompressed in memory otherwise.
44 static unsigned int max_zpage_size
;
46 /* Module params (documentation at end) */
47 static unsigned int num_devices
;
49 static int rzs_test_flag(struct ramzswap
*rzs
, u32 index
,
50 enum rzs_pageflags flag
)
52 return rzs
->table
[index
].flags
& BIT(flag
);
55 static void rzs_set_flag(struct ramzswap
*rzs
, u32 index
,
56 enum rzs_pageflags flag
)
58 rzs
->table
[index
].flags
|= BIT(flag
);
61 static void rzs_clear_flag(struct ramzswap
*rzs
, u32 index
,
62 enum rzs_pageflags flag
)
64 rzs
->table
[index
].flags
&= ~BIT(flag
);
/*
 * page_zero_filled() - scan a page-sized buffer for non-zero content.
 *
 * Walks the buffer at @ptr one unsigned long at a time, PAGE_SIZE /
 * sizeof(unsigned long) iterations in total.
 *
 * NOTE(review): the loop body and return statements were lost in
 * extraction; presumably it returns non-zero only when every word is
 * zero — confirm against the full source.
 */
67 static int page_zero_filled(void *ptr
)
72 page
= (unsigned long *)ptr
;
74 for (pos
= 0; pos
!= PAGE_SIZE
/ sizeof(*page
); pos
++) {
83 * memlimit cannot be greater than backing disk size.
85 static void ramzswap_set_memlimit(struct ramzswap
*rzs
, size_t totalram_bytes
)
87 int memlimit_valid
= 1;
90 pr_info("Memory limit not set.\n");
94 if (rzs
->memlimit
> rzs
->disksize
) {
95 pr_info("Memory limit cannot be greater than "
96 "disksize: limit=%zu, disksize=%zu\n",
97 rzs
->memlimit
, rzs
->disksize
);
101 if (!memlimit_valid
) {
102 size_t mempart
, disksize
;
103 pr_info("Using default: smaller of (%u%% of RAM) and "
104 "(backing disk size).\n",
105 default_memlimit_perc_ram
);
106 mempart
= default_memlimit_perc_ram
* (totalram_bytes
/ 100);
107 disksize
= rzs
->disksize
;
108 rzs
->memlimit
= mempart
> disksize
? disksize
: mempart
;
111 if (rzs
->memlimit
> totalram_bytes
/ 2) {
113 "Its not advisable setting limit more than half of "
114 "size of memory since we expect a 2:1 compression ratio. "
115 "Limit represents amount of *compressed* data we can keep "
117 "\tMemory Size: %zu kB\n"
118 "\tLimit you selected: %zu kB\n"
119 "Continuing anyway ...\n",
120 totalram_bytes
>> 10, rzs
->memlimit
>> 10
124 rzs
->memlimit
&= PAGE_MASK
;
125 BUG_ON(!rzs
->memlimit
);
128 static void ramzswap_set_disksize(struct ramzswap
*rzs
, size_t totalram_bytes
)
130 if (!rzs
->disksize
) {
132 "disk size not provided. You can use disksize_kb module "
133 "param to specify size.\nUsing default: (%u%% of RAM).\n",
134 default_disksize_perc_ram
136 rzs
->disksize
= default_disksize_perc_ram
*
137 (totalram_bytes
/ 100);
140 if (rzs
->disksize
> 2 * (totalram_bytes
)) {
142 "There is little point creating a ramzswap of greater than "
143 "twice the size of memory since we expect a 2:1 compression "
144 "ratio. Note that ramzswap uses about 0.1%% of the size of "
145 "the swap device when not in use so a huge ramzswap is "
147 "\tMemory Size: %zu kB\n"
148 "\tSize you selected: %zu kB\n"
149 "Continuing anyway ...\n",
150 totalram_bytes
>> 10, rzs
->disksize
154 rzs
->disksize
&= PAGE_MASK
;
158 * Swap header (1st page of swap device) contains information
159 * to identify it as a swap partition. Prepare such a header
160 * for ramzswap device (ramzswap0) so that swapon can identify
161 * it as swap partition. In case backing swap device is provided,
162 * copy its swap header.
164 static int setup_swap_header(struct ramzswap
*rzs
, union swap_header
*s
)
168 struct address_space
*mapping
;
169 union swap_header
*backing_swap_header
;
172 * There is no backing swap device. Create a swap header
173 * that is acceptable by swapon.
175 if (!rzs
->backing_swap
) {
177 s
->info
.last_page
= (rzs
->disksize
>> PAGE_SHIFT
) - 1;
178 s
->info
.nr_badpages
= 0;
179 memcpy(s
->magic
.magic
, "SWAPSPACE2", 10);
184 * We have a backing swap device. Copy its swap header
185 * to ramzswap device header. If this header contains
186 * invalid information (backing device not a swap
187 * partition, etc.), swapon will fail for ramzswap
188 * which is correct behavior - we don't want to swap
189 * over filesystem partition!
192 /* Read the backing swap header (code from sys_swapon) */
193 mapping
= rzs
->swap_file
->f_mapping
;
194 if (!mapping
->a_ops
->readpage
) {
199 page
= read_mapping_page(mapping
, 0, rzs
->swap_file
);
205 backing_swap_header
= kmap(page
);
206 memcpy(s
, backing_swap_header
, sizeof(*s
));
207 if (s
->info
.nr_badpages
) {
208 pr_info("Cannot use backing swap with bad pages (%u)\n",
209 s
->info
.nr_badpages
);
213 * ramzswap disksize equals number of usable pages in backing
214 * swap. Set last_page in swap header to match this disksize
215 * ('last_page' means 0-based index of last usable swap page).
217 s
->info
.last_page
= (rzs
->disksize
>> PAGE_SHIFT
) - 1;
224 static void ramzswap_ioctl_get_stats(struct ramzswap
*rzs
,
225 struct ramzswap_ioctl_stats
*s
)
227 strncpy(s
->backing_swap_name
, rzs
->backing_swap_name
,
228 MAX_SWAP_NAME_LEN
- 1);
229 s
->backing_swap_name
[MAX_SWAP_NAME_LEN
- 1] = '\0';
231 s
->disksize
= rzs
->disksize
;
232 s
->memlimit
= rzs
->memlimit
;
234 #if defined(CONFIG_RAMZSWAP_STATS)
236 struct ramzswap_stats
*rs
= &rzs
->stats
;
237 size_t succ_writes
, mem_used
;
238 unsigned int good_compress_perc
= 0, no_compress_perc
= 0;
240 mem_used
= xv_get_total_size_bytes(rzs
->mem_pool
)
241 + (rs
->pages_expand
<< PAGE_SHIFT
);
242 succ_writes
= rzs_stat64_read(rzs
, &rs
->num_writes
) -
243 rzs_stat64_read(rzs
, &rs
->failed_writes
);
245 if (succ_writes
&& rs
->pages_stored
) {
246 good_compress_perc
= rs
->good_compress
* 100
248 no_compress_perc
= rs
->pages_expand
* 100
252 s
->num_reads
= rzs_stat64_read(rzs
, &rs
->num_reads
);
253 s
->num_writes
= rzs_stat64_read(rzs
, &rs
->num_writes
);
254 s
->failed_reads
= rzs_stat64_read(rzs
, &rs
->failed_reads
);
255 s
->failed_writes
= rzs_stat64_read(rzs
, &rs
->failed_writes
);
256 s
->invalid_io
= rzs_stat64_read(rzs
, &rs
->invalid_io
);
257 s
->notify_free
= rzs_stat64_read(rzs
, &rs
->notify_free
);
258 s
->pages_zero
= rs
->pages_zero
;
260 s
->good_compress_pct
= good_compress_perc
;
261 s
->pages_expand_pct
= no_compress_perc
;
263 s
->pages_stored
= rs
->pages_stored
;
264 s
->pages_used
= mem_used
>> PAGE_SHIFT
;
265 s
->orig_data_size
= rs
->pages_stored
<< PAGE_SHIFT
;
266 s
->compr_data_size
= rs
->compr_size
;
267 s
->mem_used_total
= mem_used
;
269 s
->bdev_num_reads
= rzs_stat64_read(rzs
, &rs
->bdev_num_reads
);
270 s
->bdev_num_writes
= rzs_stat64_read(rzs
, &rs
->bdev_num_writes
);
272 #endif /* CONFIG_RAMZSWAP_STATS */
275 static int add_backing_swap_extent(struct ramzswap
*rzs
,
280 struct list_head
*head
;
281 struct page
*curr_page
, *new_page
;
282 unsigned int extents_per_page
= PAGE_SIZE
/
283 sizeof(struct ramzswap_backing_extent
);
285 idx
= rzs
->num_extents
% extents_per_page
;
287 new_page
= alloc_page(__GFP_ZERO
);
291 if (rzs
->num_extents
) {
292 curr_page
= virt_to_page(rzs
->curr_extent
);
293 head
= &curr_page
->lru
;
295 head
= &rzs
->backing_swap_extent_list
;
298 list_add(&new_page
->lru
, head
);
299 rzs
->curr_extent
= page_address(new_page
);
302 rzs
->curr_extent
->phy_pagenum
= phy_pagenum
;
303 rzs
->curr_extent
->num_pages
= num_pages
;
305 pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
306 "pg_last=%lu, curr_ext=%p\n", idx
, phy_pagenum
, num_pages
,
307 phy_pagenum
+ num_pages
- 1, rzs
->curr_extent
);
309 if (idx
!= extents_per_page
- 1)
315 static int setup_backing_swap_extents(struct ramzswap
*rzs
,
316 struct inode
*inode
, unsigned long *num_pages
)
320 unsigned blocks_per_page
;
321 pgoff_t contig_pages
= 0, total_pages
= 0;
322 pgoff_t pagenum
= 0, prev_pagenum
= 0;
323 sector_t probe_block
= 0;
326 blkbits
= inode
->i_blkbits
;
327 blocks_per_page
= PAGE_SIZE
>> blkbits
;
329 last_block
= i_size_read(inode
) >> blkbits
;
330 while (probe_block
+ blocks_per_page
<= last_block
) {
331 unsigned block_in_page
;
332 sector_t first_block
;
334 first_block
= bmap(inode
, probe_block
);
335 if (first_block
== 0)
338 /* It must be PAGE_SIZE aligned on-disk */
339 if (first_block
& (blocks_per_page
- 1)) {
344 /* All blocks within this page must be contiguous on disk */
345 for (block_in_page
= 1; block_in_page
< blocks_per_page
;
349 block
= bmap(inode
, probe_block
+ block_in_page
);
352 if (block
!= first_block
+ block_in_page
) {
360 * We found a PAGE_SIZE length, PAGE_SIZE aligned
363 pagenum
= first_block
>> (PAGE_SHIFT
- blkbits
);
365 if (total_pages
&& (pagenum
!= prev_pagenum
+ 1)) {
366 ret
= add_backing_swap_extent(rzs
, prev_pagenum
-
367 (contig_pages
- 1), contig_pages
);
375 prev_pagenum
= pagenum
;
376 probe_block
+= blocks_per_page
;
383 pr_debug("adding last extent: pagenum=%lu, "
384 "contig_pages=%lu\n", pagenum
, contig_pages
);
385 ret
= add_backing_swap_extent(rzs
,
386 prev_pagenum
- (contig_pages
- 1), contig_pages
);
391 if (!rzs
->num_extents
) {
392 pr_err("No swap extents found!\n");
397 *num_pages
= total_pages
;
398 pr_info("Found %lu extents containing %luk\n",
399 rzs
->num_extents
, *num_pages
<< (PAGE_SHIFT
- 10));
404 pr_err("Backing swapfile has holes\n");
407 while (ret
&& !list_empty(&rzs
->backing_swap_extent_list
)) {
409 struct list_head
*entry
= rzs
->backing_swap_extent_list
.next
;
410 page
= list_entry(entry
, struct page
, lru
);
417 static void map_backing_swap_extents(struct ramzswap
*rzs
)
419 struct ramzswap_backing_extent
*se
;
420 struct page
*table_page
, *se_page
;
421 unsigned long num_pages
, num_table_pages
, entry
;
422 unsigned long se_idx
, span
;
423 unsigned entries_per_page
= PAGE_SIZE
/ sizeof(*rzs
->table
);
424 unsigned extents_per_page
= PAGE_SIZE
/ sizeof(*se
);
426 /* True for block device */
427 if (!rzs
->num_extents
)
430 se_page
= list_entry(rzs
->backing_swap_extent_list
.next
,
432 se
= page_address(se_page
);
433 span
= se
->num_pages
;
434 num_pages
= rzs
->disksize
>> PAGE_SHIFT
;
435 num_table_pages
= DIV_ROUND_UP(num_pages
* sizeof(*rzs
->table
),
440 while (num_table_pages
--) {
441 table_page
= vmalloc_to_page(&rzs
->table
[entry
]);
442 while (span
<= entry
) {
444 if (se_idx
== rzs
->num_extents
)
447 if (!(se_idx
% extents_per_page
)) {
448 se_page
= list_entry(se_page
->lru
.next
,
450 se
= page_address(se_page
);
454 span
+= se
->num_pages
;
456 table_page
->mapping
= (struct address_space
*)se
;
457 table_page
->private = se
->num_pages
- (span
- entry
);
458 pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
459 entry
, span
, table_page
->mapping
, table_page
->private);
460 entry
+= entries_per_page
;
465 * Check if value of backing_swap module param is sane.
466 * Claim this device and set ramzswap size equal to
467 * size of this block device.
469 static int setup_backing_swap(struct ramzswap
*rzs
)
473 unsigned long num_pages
= 0;
475 struct file
*swap_file
;
476 struct address_space
*mapping
;
477 struct block_device
*bdev
= NULL
;
479 if (!rzs
->backing_swap_name
[0]) {
480 pr_debug("backing_swap param not given\n");
484 pr_info("Using backing swap device: %s\n", rzs
->backing_swap_name
);
486 swap_file
= filp_open(rzs
->backing_swap_name
,
487 O_RDWR
| O_LARGEFILE
, 0);
488 if (IS_ERR(swap_file
)) {
489 pr_err("Error opening backing device: %s\n",
490 rzs
->backing_swap_name
);
495 mapping
= swap_file
->f_mapping
;
496 inode
= mapping
->host
;
498 if (S_ISBLK(inode
->i_mode
)) {
499 bdev
= I_BDEV(inode
);
500 ret
= bd_claim(bdev
, setup_backing_swap
);
505 disksize
= i_size_read(inode
);
507 * Can happen if user gives an extended partition as
508 * backing swap or simply a bad disk.
511 pr_err("Error reading backing swap size.\n");
514 } else if (S_ISREG(inode
->i_mode
)) {
515 bdev
= inode
->i_sb
->s_bdev
;
516 if (IS_SWAPFILE(inode
)) {
520 ret
= setup_backing_swap_extents(rzs
, inode
, &num_pages
);
523 disksize
= num_pages
<< PAGE_SHIFT
;
528 rzs
->swap_file
= swap_file
;
529 rzs
->backing_swap
= bdev
;
530 rzs
->disksize
= disksize
;
537 filp_close(swap_file
, NULL
);
540 rzs
->backing_swap
= NULL
;
545 * Map logical page number 'pagenum' to physical page number
546 * on backing swap device. For block device, this is a nop.
548 static u32
map_backing_swap_page(struct ramzswap
*rzs
, u32 pagenum
)
550 u32 skip_pages
, entries_per_page
;
551 size_t delta
, se_offset
, skipped
;
552 struct page
*table_page
, *se_page
;
553 struct ramzswap_backing_extent
*se
;
555 if (!rzs
->num_extents
)
558 entries_per_page
= PAGE_SIZE
/ sizeof(*rzs
->table
);
560 table_page
= vmalloc_to_page(&rzs
->table
[pagenum
]);
561 se
= (struct ramzswap_backing_extent
*)table_page
->mapping
;
562 se_page
= virt_to_page(se
);
564 skip_pages
= pagenum
- (pagenum
/ entries_per_page
* entries_per_page
);
565 se_offset
= table_page
->private + skip_pages
;
567 if (se_offset
< se
->num_pages
)
568 return se
->phy_pagenum
+ se_offset
;
570 skipped
= se
->num_pages
- table_page
->private;
572 struct ramzswap_backing_extent
*se_base
;
573 u32 se_entries_per_page
= PAGE_SIZE
/ sizeof(*se
);
575 /* Get next swap extent */
576 se_base
= (struct ramzswap_backing_extent
*)
577 page_address(se_page
);
578 if (se
- se_base
== se_entries_per_page
- 1) {
579 se_page
= list_entry(se_page
->lru
.next
,
581 se
= page_address(se_page
);
586 skipped
+= se
->num_pages
;
587 } while (skipped
< skip_pages
);
589 delta
= skipped
- skip_pages
;
590 se_offset
= se
->num_pages
- delta
;
592 return se
->phy_pagenum
+ se_offset
;
595 static void ramzswap_free_page(struct ramzswap
*rzs
, size_t index
)
600 struct page
*page
= rzs
->table
[index
].page
;
601 u32 offset
= rzs
->table
[index
].offset
;
603 if (unlikely(!page
)) {
605 * No memory is allocated for zero filled pages.
606 * Simply clear zero page flag.
608 if (rzs_test_flag(rzs
, index
, RZS_ZERO
)) {
609 rzs_clear_flag(rzs
, index
, RZS_ZERO
);
610 rzs_stat_dec(&rzs
->stats
.pages_zero
);
615 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
))) {
618 rzs_clear_flag(rzs
, index
, RZS_UNCOMPRESSED
);
619 rzs_stat_dec(&rzs
->stats
.pages_expand
);
623 obj
= kmap_atomic(page
, KM_USER0
) + offset
;
624 clen
= xv_get_object_size(obj
) - sizeof(struct zobj_header
);
625 kunmap_atomic(obj
, KM_USER0
);
627 xv_free(rzs
->mem_pool
, page
, offset
);
628 if (clen
<= PAGE_SIZE
/ 2)
629 rzs_stat_dec(&rzs
->stats
.good_compress
);
632 rzs
->stats
.compr_size
-= clen
;
633 rzs_stat_dec(&rzs
->stats
.pages_stored
);
635 rzs
->table
[index
].page
= NULL
;
636 rzs
->table
[index
].offset
= 0;
/*
 * handle_zero_page() - satisfy a read of a zero-filled page.
 *
 * Takes the first (and only) page of @bio, zero-fills it through an
 * atomic kmap, flushes the dcache for that page, and marks the bio
 * up-to-date so the caller can complete it as a successful read.
 *
 * NOTE(review): the declaration of user_mem and the bio completion /
 * return lines are not visible in this extraction — confirm the tail
 * (likely bio_endio + return) against the full source.
 */
639 static int handle_zero_page(struct bio
*bio
)
642 struct page
*page
= bio
->bi_io_vec
[0].bv_page
;
/* Zero the destination page while it is atomically mapped. */
644 user_mem
= kmap_atomic(page
, KM_USER0
);
645 memset(user_mem
, 0, PAGE_SIZE
);
646 kunmap_atomic(user_mem
, KM_USER0
);
648 flush_dcache_page(page
);
650 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
655 static int handle_uncompressed_page(struct ramzswap
*rzs
, struct bio
*bio
)
659 unsigned char *user_mem
, *cmem
;
661 page
= bio
->bi_io_vec
[0].bv_page
;
662 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
664 user_mem
= kmap_atomic(page
, KM_USER0
);
665 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
666 rzs
->table
[index
].offset
;
668 memcpy(user_mem
, cmem
, PAGE_SIZE
);
669 kunmap_atomic(user_mem
, KM_USER0
);
670 kunmap_atomic(cmem
, KM_USER1
);
672 flush_dcache_page(page
);
674 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
680 * Called when request page is not present in ramzswap.
681 * Its either in backing swap device (if present) or
682 * this is an attempt to read before any previous write
683 * to this location - this happens due to readahead when
684 * swap device is read from user-space (e.g. during swapon)
686 static int handle_ramzswap_fault(struct ramzswap
*rzs
, struct bio
*bio
)
689 * Always forward such requests to backing swap
690 * device (if present)
692 if (rzs
->backing_swap
) {
694 rzs_stat64_dec(rzs
, &rzs
->stats
.num_reads
);
695 rzs_stat64_inc(rzs
, &rzs
->stats
.bdev_num_reads
);
696 bio
->bi_bdev
= rzs
->backing_swap
;
699 * In case backing swap is a file, find the right offset within
700 * the file corresponding to logical position 'index'. For block
701 * device, this is a nop.
703 pagenum
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
704 bio
->bi_sector
= map_backing_swap_page(rzs
, pagenum
)
705 << SECTORS_PER_PAGE_SHIFT
;
710 * Its unlikely event in case backing dev is
713 pr_debug("Read before write on swap device: "
714 "sector=%lu, size=%u, offset=%u\n",
715 (ulong
)(bio
->bi_sector
), bio
->bi_size
,
716 bio
->bi_io_vec
[0].bv_offset
);
718 /* Do nothing. Just return success */
719 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
724 static int ramzswap_read(struct ramzswap
*rzs
, struct bio
*bio
)
730 struct zobj_header
*zheader
;
731 unsigned char *user_mem
, *cmem
;
733 rzs_stat64_inc(rzs
, &rzs
->stats
.num_reads
);
735 page
= bio
->bi_io_vec
[0].bv_page
;
736 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
738 if (rzs_test_flag(rzs
, index
, RZS_ZERO
))
739 return handle_zero_page(bio
);
741 /* Requested page is not present in compressed area */
742 if (!rzs
->table
[index
].page
)
743 return handle_ramzswap_fault(rzs
, bio
);
745 /* Page is stored uncompressed since it's incompressible */
746 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
747 return handle_uncompressed_page(rzs
, bio
);
749 user_mem
= kmap_atomic(page
, KM_USER0
);
752 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
753 rzs
->table
[index
].offset
;
755 ret
= lzo1x_decompress_safe(
756 cmem
+ sizeof(*zheader
),
757 xv_get_object_size(cmem
) - sizeof(*zheader
),
760 kunmap_atomic(user_mem
, KM_USER0
);
761 kunmap_atomic(cmem
, KM_USER1
);
763 /* should NEVER happen */
764 if (unlikely(ret
!= LZO_E_OK
)) {
765 pr_err("Decompression failed! err=%d, page=%u\n",
767 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_reads
);
771 flush_dcache_page(page
);
773 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
782 static int ramzswap_write(struct ramzswap
*rzs
, struct bio
*bio
)
784 int ret
, fwd_write_request
= 0;
787 struct zobj_header
*zheader
;
788 struct page
*page
, *page_store
;
789 unsigned char *user_mem
, *cmem
, *src
;
791 rzs_stat64_inc(rzs
, &rzs
->stats
.num_writes
);
793 page
= bio
->bi_io_vec
[0].bv_page
;
794 index
= bio
->bi_sector
>> SECTORS_PER_PAGE_SHIFT
;
796 src
= rzs
->compress_buffer
;
799 * System swaps to same sector again when the stored page
800 * is no longer referenced by any process. So, its now safe
801 * to free the memory that was allocated for this page.
803 if (rzs
->table
[index
].page
|| rzs_test_flag(rzs
, index
, RZS_ZERO
))
804 ramzswap_free_page(rzs
, index
);
806 mutex_lock(&rzs
->lock
);
808 user_mem
= kmap_atomic(page
, KM_USER0
);
809 if (page_zero_filled(user_mem
)) {
810 kunmap_atomic(user_mem
, KM_USER0
);
811 mutex_unlock(&rzs
->lock
);
812 rzs_stat_inc(&rzs
->stats
.pages_zero
);
813 rzs_set_flag(rzs
, index
, RZS_ZERO
);
815 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
820 if (rzs
->backing_swap
&&
821 (rzs
->stats
.compr_size
> rzs
->memlimit
- PAGE_SIZE
)) {
822 kunmap_atomic(user_mem
, KM_USER0
);
823 mutex_unlock(&rzs
->lock
);
824 fwd_write_request
= 1;
828 ret
= lzo1x_1_compress(user_mem
, PAGE_SIZE
, src
, &clen
,
829 rzs
->compress_workmem
);
831 kunmap_atomic(user_mem
, KM_USER0
);
833 if (unlikely(ret
!= LZO_E_OK
)) {
834 mutex_unlock(&rzs
->lock
);
835 pr_err("Compression failed! err=%d\n", ret
);
836 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_writes
);
841 * Page is incompressible. Forward it to backing swap
842 * if present. Otherwise, store it as-is (uncompressed)
843 * since we do not want to return too many swap write
844 * errors which has side effect of hanging the system.
846 if (unlikely(clen
> max_zpage_size
)) {
847 if (rzs
->backing_swap
) {
848 mutex_unlock(&rzs
->lock
);
849 fwd_write_request
= 1;
854 page_store
= alloc_page(GFP_NOIO
| __GFP_HIGHMEM
);
855 if (unlikely(!page_store
)) {
856 mutex_unlock(&rzs
->lock
);
857 pr_info("Error allocating memory for incompressible "
858 "page: %u\n", index
);
859 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_writes
);
864 rzs_set_flag(rzs
, index
, RZS_UNCOMPRESSED
);
865 rzs_stat_inc(&rzs
->stats
.pages_expand
);
866 rzs
->table
[index
].page
= page_store
;
867 src
= kmap_atomic(page
, KM_USER0
);
871 if (xv_malloc(rzs
->mem_pool
, clen
+ sizeof(*zheader
),
872 &rzs
->table
[index
].page
, &offset
,
873 GFP_NOIO
| __GFP_HIGHMEM
)) {
874 mutex_unlock(&rzs
->lock
);
875 pr_info("Error allocating memory for compressed "
876 "page: %u, size=%zu\n", index
, clen
);
877 rzs_stat64_inc(rzs
, &rzs
->stats
.failed_writes
);
878 if (rzs
->backing_swap
)
879 fwd_write_request
= 1;
884 rzs
->table
[index
].offset
= offset
;
886 cmem
= kmap_atomic(rzs
->table
[index
].page
, KM_USER1
) +
887 rzs
->table
[index
].offset
;
890 /* Back-reference needed for memory defragmentation */
891 if (!rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)) {
892 zheader
= (struct zobj_header
*)cmem
;
893 zheader
->table_idx
= index
;
894 cmem
+= sizeof(*zheader
);
898 memcpy(cmem
, src
, clen
);
900 kunmap_atomic(cmem
, KM_USER1
);
901 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
902 kunmap_atomic(src
, KM_USER0
);
905 rzs
->stats
.compr_size
+= clen
;
906 rzs_stat_inc(&rzs
->stats
.pages_stored
);
907 if (clen
<= PAGE_SIZE
/ 2)
908 rzs_stat_inc(&rzs
->stats
.good_compress
);
910 mutex_unlock(&rzs
->lock
);
912 set_bit(BIO_UPTODATE
, &bio
->bi_flags
);
917 if (fwd_write_request
) {
918 rzs_stat64_inc(rzs
, &rzs
->stats
.bdev_num_writes
);
919 bio
->bi_bdev
= rzs
->backing_swap
;
922 * TODO: We currently have linear mapping of ramzswap and
923 * backing swap sectors. This is not desired since we want
924 * to optimize writes to backing swap to minimize disk seeks
925 * or have effective wear leveling (for SSDs). Also, a
926 * non-linear mapping is required to implement compressed
929 bio
->bi_sector
= get_backing_swap_page()
930 << SECTORS_PER_PAGE_SHIFT
;
933 * In case backing swap is a file, find the right offset within
934 * the file corresponding to logical position 'index'. For block
935 * device, this is a nop.
937 bio
->bi_sector
= map_backing_swap_page(rzs
, index
)
938 << SECTORS_PER_PAGE_SHIFT
;
947 * Check if request is within bounds and page aligned.
/*
 * valid_swap_request() - validate a bio before ramzswap processes it.
 *
 * Rejects a request when any of these hold: the start sector is at or
 * past the device size, the sector is not page-aligned, the bio has
 * more than one segment, its size is not exactly PAGE_SIZE, or the
 * first segment does not start at offset 0.
 *
 * NOTE(review): the opening "if (unlikely((" line and the return
 * statements were lost in extraction; the visible tail comment shows
 * the fall-through path treats the request as valid.
 */
949 static inline int valid_swap_request(struct ramzswap
*rzs
, struct bio
*bio
)
952 (bio
->bi_sector
>= (rzs
->disksize
>> SECTOR_SHIFT
)) ||
953 (bio
->bi_sector
& (SECTORS_PER_PAGE
- 1)) ||
954 (bio
->bi_vcnt
!= 1) ||
955 (bio
->bi_size
!= PAGE_SIZE
) ||
956 (bio
->bi_io_vec
[0].bv_offset
!= 0))) {
961 /* swap request is valid */
966 * Handler function for all ramzswap I/O requests.
968 static int ramzswap_make_request(struct request_queue
*queue
, struct bio
*bio
)
971 struct ramzswap
*rzs
= queue
->queuedata
;
973 if (unlikely(!rzs
->init_done
)) {
978 if (!valid_swap_request(rzs
, bio
)) {
979 rzs_stat64_inc(rzs
, &rzs
->stats
.invalid_io
);
984 switch (bio_data_dir(bio
)) {
986 ret
= ramzswap_read(rzs
, bio
);
990 ret
= ramzswap_write(rzs
, bio
);
997 static void reset_device(struct ramzswap
*rzs
)
999 int is_backing_blkdev
= 0;
1000 size_t index
, num_pages
;
1001 unsigned entries_per_page
;
1002 unsigned long num_table_pages
, entry
= 0;
1004 /* Do not accept any new I/O request */
1007 if (rzs
->backing_swap
&& !rzs
->num_extents
)
1008 is_backing_blkdev
= 1;
1010 num_pages
= rzs
->disksize
>> PAGE_SHIFT
;
1012 /* Free various per-device buffers */
1013 kfree(rzs
->compress_workmem
);
1014 free_pages((unsigned long)rzs
->compress_buffer
, 1);
1016 rzs
->compress_workmem
= NULL
;
1017 rzs
->compress_buffer
= NULL
;
1019 /* Free all pages that are still in this ramzswap device */
1020 for (index
= 0; index
< num_pages
; index
++) {
1024 page
= rzs
->table
[index
].page
;
1025 offset
= rzs
->table
[index
].offset
;
1030 if (unlikely(rzs_test_flag(rzs
, index
, RZS_UNCOMPRESSED
)))
1033 xv_free(rzs
->mem_pool
, page
, offset
);
1036 entries_per_page
= PAGE_SIZE
/ sizeof(*rzs
->table
);
1037 num_table_pages
= DIV_ROUND_UP(num_pages
* sizeof(*rzs
->table
),
1040 * Set page->mapping to NULL for every table page.
1041 * Otherwise, we will hit bad_page() during free.
1043 while (rzs
->num_extents
&& num_table_pages
--) {
1045 page
= vmalloc_to_page(&rzs
->table
[entry
]);
1046 page
->mapping
= NULL
;
1047 entry
+= entries_per_page
;
1052 xv_destroy_pool(rzs
->mem_pool
);
1053 rzs
->mem_pool
= NULL
;
1055 /* Free all swap extent pages */
1056 while (!list_empty(&rzs
->backing_swap_extent_list
)) {
1058 struct list_head
*entry
;
1059 entry
= rzs
->backing_swap_extent_list
.next
;
1060 page
= list_entry(entry
, struct page
, lru
);
1064 INIT_LIST_HEAD(&rzs
->backing_swap_extent_list
);
1065 rzs
->num_extents
= 0;
1067 /* Close backing swap device, if present */
1068 if (rzs
->backing_swap
) {
1069 if (is_backing_blkdev
)
1070 bd_release(rzs
->backing_swap
);
1071 filp_close(rzs
->swap_file
, NULL
);
1072 rzs
->backing_swap
= NULL
;
1073 memset(rzs
->backing_swap_name
, 0, MAX_SWAP_NAME_LEN
);
1077 memset(&rzs
->stats
, 0, sizeof(rzs
->stats
));
1083 static int ramzswap_ioctl_init_device(struct ramzswap
*rzs
)
1088 union swap_header
*swap_header
;
1090 if (rzs
->init_done
) {
1091 pr_info("Device already initialized!\n");
1095 ret
= setup_backing_swap(rzs
);
1099 if (rzs
->backing_swap
)
1100 ramzswap_set_memlimit(rzs
, totalram_pages
<< PAGE_SHIFT
);
1102 ramzswap_set_disksize(rzs
, totalram_pages
<< PAGE_SHIFT
);
1104 rzs
->compress_workmem
= kzalloc(LZO1X_MEM_COMPRESS
, GFP_KERNEL
);
1105 if (!rzs
->compress_workmem
) {
1106 pr_err("Error allocating compressor working memory!\n");
1111 rzs
->compress_buffer
= (void *)__get_free_pages(__GFP_ZERO
, 1);
1112 if (!rzs
->compress_buffer
) {
1113 pr_err("Error allocating compressor buffer space\n");
1118 num_pages
= rzs
->disksize
>> PAGE_SHIFT
;
1119 rzs
->table
= vmalloc(num_pages
* sizeof(*rzs
->table
));
1121 pr_err("Error allocating ramzswap address table\n");
1122 /* To prevent accessing table entries during cleanup */
1127 memset(rzs
->table
, 0, num_pages
* sizeof(*rzs
->table
));
1129 map_backing_swap_extents(rzs
);
1131 page
= alloc_page(__GFP_ZERO
);
1133 pr_err("Error allocating swap header page\n");
1137 rzs
->table
[0].page
= page
;
1138 rzs_set_flag(rzs
, 0, RZS_UNCOMPRESSED
);
1140 swap_header
= kmap(page
);
1141 ret
= setup_swap_header(rzs
, swap_header
);
1144 pr_err("Error setting swap header\n");
1148 set_capacity(rzs
->disk
, rzs
->disksize
>> SECTOR_SHIFT
);
1151 * We have ident mapping of sectors for ramzswap and
1152 * and the backing swap device. So, this queue flag
1153 * should be according to backing dev.
1155 if (!rzs
->backing_swap
||
1156 blk_queue_nonrot(rzs
->backing_swap
->bd_disk
->queue
))
1157 queue_flag_set_unlocked(QUEUE_FLAG_NONROT
, rzs
->disk
->queue
);
1159 rzs
->mem_pool
= xv_create_pool();
1160 if (!rzs
->mem_pool
) {
1161 pr_err("Error creating memory pool\n");
1167 * Pages that compress to size greater than this are forwarded
1168 * to physical swap disk (if backing dev is provided)
1169 * TODO: make this configurable
1171 if (rzs
->backing_swap
)
1172 max_zpage_size
= max_zpage_size_bdev
;
1174 max_zpage_size
= max_zpage_size_nobdev
;
1175 pr_debug("Max compressed page size: %u bytes\n", max_zpage_size
);
1179 pr_debug("Initialization done!\n");
1185 pr_err("Initialization failed: err=%d\n", ret
);
1189 static int ramzswap_ioctl_reset_device(struct ramzswap
*rzs
)
1197 static int ramzswap_ioctl(struct block_device
*bdev
, fmode_t mode
,
1198 unsigned int cmd
, unsigned long arg
)
1201 size_t disksize_kb
, memlimit_kb
;
1203 struct ramzswap
*rzs
= bdev
->bd_disk
->private_data
;
1206 case RZSIO_SET_DISKSIZE_KB
:
1207 if (rzs
->init_done
) {
1211 if (copy_from_user(&disksize_kb
, (void *)arg
,
1216 rzs
->disksize
= disksize_kb
<< 10;
1217 pr_info("Disk size set to %zu kB\n", disksize_kb
);
1220 case RZSIO_SET_MEMLIMIT_KB
:
1221 if (rzs
->init_done
) {
1222 /* TODO: allow changing memlimit */
1226 if (copy_from_user(&memlimit_kb
, (void *)arg
,
1231 rzs
->memlimit
= memlimit_kb
<< 10;
1232 pr_info("Memory limit set to %zu kB\n", memlimit_kb
);
1235 case RZSIO_SET_BACKING_SWAP
:
1236 if (rzs
->init_done
) {
1241 if (copy_from_user(&rzs
->backing_swap_name
, (void *)arg
,
1246 rzs
->backing_swap_name
[MAX_SWAP_NAME_LEN
- 1] = '\0';
1247 pr_info("Backing swap set to %s\n", rzs
->backing_swap_name
);
1250 case RZSIO_GET_STATS
:
1252 struct ramzswap_ioctl_stats
*stats
;
1253 if (!rzs
->init_done
) {
1257 stats
= kzalloc(sizeof(*stats
), GFP_KERNEL
);
1262 ramzswap_ioctl_get_stats(rzs
, stats
);
1263 if (copy_to_user((void *)arg
, stats
, sizeof(*stats
))) {
1272 ret
= ramzswap_ioctl_init_device(rzs
);
1276 /* Do not reset an active device! */
1277 if (bdev
->bd_holders
) {
1282 /* Make sure all pending I/O is finished */
1286 ret
= ramzswap_ioctl_reset_device(rzs
);
1290 pr_info("Invalid ioctl %u\n", cmd
);
1298 static struct block_device_operations ramzswap_devops
= {
1299 .ioctl
= ramzswap_ioctl
,
1300 .owner
= THIS_MODULE
,
1303 static int create_device(struct ramzswap
*rzs
, int device_id
)
1307 mutex_init(&rzs
->lock
);
1308 spin_lock_init(&rzs
->stat64_lock
);
1309 INIT_LIST_HEAD(&rzs
->backing_swap_extent_list
);
1311 rzs
->queue
= blk_alloc_queue(GFP_KERNEL
);
1313 pr_err("Error allocating disk queue for device %d\n",
1319 blk_queue_make_request(rzs
->queue
, ramzswap_make_request
);
1320 rzs
->queue
->queuedata
= rzs
;
1322 /* gendisk structure */
1323 rzs
->disk
= alloc_disk(1);
1325 blk_cleanup_queue(rzs
->queue
);
1326 pr_warning("Error allocating disk structure for device %d\n",
1332 rzs
->disk
->major
= ramzswap_major
;
1333 rzs
->disk
->first_minor
= device_id
;
1334 rzs
->disk
->fops
= &ramzswap_devops
;
1335 rzs
->disk
->queue
= rzs
->queue
;
1336 rzs
->disk
->private_data
= rzs
;
1337 snprintf(rzs
->disk
->disk_name
, 16, "ramzswap%d", device_id
);
1340 * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
1341 * or set equal to backing swap device (if provided)
1343 set_capacity(rzs
->disk
, 0);
1345 blk_queue_physical_block_size(rzs
->disk
->queue
, PAGE_SIZE
);
1346 blk_queue_logical_block_size(rzs
->disk
->queue
, PAGE_SIZE
);
1348 add_disk(rzs
->disk
);
/*
 * destroy_device() - tear down one ramzswap device created by
 * create_device(): remove and release the gendisk, then clean up the
 * request queue.
 *
 * NOTE(review): the original guards (e.g. null checks on rzs->disk /
 * rzs->queue) are not visible in this extraction — confirm before
 * assuming the calls are unconditional.
 */
1356 static void destroy_device(struct ramzswap
*rzs
)
1359 del_gendisk(rzs
->disk
);
1360 put_disk(rzs
->disk
);
1364 blk_cleanup_queue(rzs
->queue
);
1367 static int __init
ramzswap_init(void)
1371 if (num_devices
> max_num_devices
) {
1372 pr_warning("Invalid value for num_devices: %u\n",
1378 ramzswap_major
= register_blkdev(0, "ramzswap");
1379 if (ramzswap_major
<= 0) {
1380 pr_warning("Unable to get major number\n");
1386 pr_info("num_devices not specified. Using default: 1\n");
1390 /* Allocate the device array and initialize each one */
1391 pr_info("Creating %u devices ...\n", num_devices
);
1392 devices
= kzalloc(num_devices
* sizeof(struct ramzswap
), GFP_KERNEL
);
1398 for (dev_id
= 0; dev_id
< num_devices
; dev_id
++) {
1399 ret
= create_device(&devices
[dev_id
], dev_id
);
1408 destroy_device(&devices
[--dev_id
]);
1410 unregister_blkdev(ramzswap_major
, "ramzswap");
1415 static void __exit
ramzswap_exit(void)
1418 struct ramzswap
*rzs
;
1420 for (i
= 0; i
< num_devices
; i
++) {
1423 destroy_device(rzs
);
1428 unregister_blkdev(ramzswap_major
, "ramzswap");
1431 pr_debug("Cleanup done!\n");
1434 module_param(num_devices
, uint
, 0);
1435 MODULE_PARM_DESC(num_devices
, "Number of ramzswap devices");
1437 module_init(ramzswap_init
);
1438 module_exit(ramzswap_exit
);
1440 MODULE_LICENSE("Dual BSD/GPL");
1441 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
1442 MODULE_DESCRIPTION("Compressed RAM Based Swap Device");