]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | From: Peter Zijlstra <a.p.zijlstra@chello.nl> |
2 | Subject: mm: add support for non block device backed swap files | |
3 | Patch-mainline: No | |
4 | References: FATE#303834 | |
5 | ||
6 | New address_space_operations methods are added: | |
7 | int swapon(struct file *); | |
8 | int swapoff(struct file *); | |
9 | int swap_out(struct file *, struct page *, struct writeback_control *); | |
10 | int swap_in(struct file *, struct page *); | |
11 | ||
12 | When during sys_swapon() the ->swapon() method is found and returns no error | |
13 | the swapper_space.a_ops will proxy to sis->swap_file->f_mapping->a_ops, and | |
14 | make use of ->swap_{out,in}() to write/read swapcache pages. | |
15 | ||
16 | The ->swapon() method will be used to communicate to the file that the VM | |
17 | relies on it, and the address_space should take adequate measures (like | |
18 | reserving memory for mempools or the like). The ->swapoff() method will be | |
19 | called on sys_swapoff() when ->swapon() was found and returned no error. | |
20 | ||
21 | This new interface can be used to obviate the need for ->bmap in the swapfile | |
22 | code. A filesystem would need to load (and maybe even allocate) the full block | |
23 | map for a file into memory and pin it there on ->swapon() so that | |
24 | ->swap_{out,in}() have instant access to it. It can be released on ->swapoff(). | |
25 | ||
26 | The reason to provide ->swap_{out,in}() over using {write,read}page() is to | |
27 | 1) make a distinction between swapcache and pagecache pages, and | |
28 | 2) to provide a struct file * for credential context (normally not needed | |
29 | in the context of writepage, as the page content is normally dirtied | |
30 | using either of the following interfaces: | |
31 | write_{begin,end}() | |
32 | {prepare,commit}_write() | |
33 | page_mkwrite() | |
34 | which do have the file context). | |
35 | ||
36 | [miklos@szeredi.hu: cleanups] | |
37 | Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> | |
38 | Acked-by: Neil Brown <neilb@suse.de> | |
39 | Acked-by: Suresh Jayaraman <sjayaraman@suse.de> | |
40 | ||
41 | --- | |
42 | Documentation/filesystems/Locking | 22 ++++++++++++++++ | |
43 | Documentation/filesystems/vfs.txt | 18 +++++++++++++ | |
44 | include/linux/buffer_head.h | 2 - | |
45 | include/linux/fs.h | 9 ++++++ | |
46 | include/linux/swap.h | 4 ++ | |
47 | mm/page_io.c | 52 ++++++++++++++++++++++++++++++++++++++ | |
48 | mm/swap_state.c | 4 +- | |
49 | mm/swapfile.c | 32 +++++++++++++++++++++-- | |
50 | 8 files changed, 137 insertions(+), 6 deletions(-) | |
51 | ||
52 | --- a/Documentation/filesystems/Locking | |
53 | +++ b/Documentation/filesystems/Locking | |
54 | @@ -169,6 +169,10 @@ prototypes: | |
55 | int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | |
56 | loff_t offset, unsigned long nr_segs); | |
57 | int (*launder_page) (struct page *); | |
58 | + int (*swapon) (struct file *); | |
59 | + int (*swapoff) (struct file *); | |
60 | + int (*swap_out) (struct file *, struct page *, struct writeback_control *); | |
61 | + int (*swap_in) (struct file *, struct page *); | |
62 | ||
63 | locking rules: | |
64 | All except set_page_dirty may block | |
65 | @@ -190,6 +194,10 @@ invalidatepage: no yes | |
66 | releasepage: no yes | |
67 | direct_IO: no | |
68 | launder_page: no yes | |
69 | +swapon no | |
70 | +swapoff no | |
71 | +swap_out no yes, unlocks | |
72 | +swap_in no yes, unlocks | |
73 | ||
74 | ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() | |
75 | may be called from the request handler (/dev/loop). | |
76 | @@ -289,6 +297,20 @@ cleaned, or an error value if not. Note | |
77 | getting mapped back in and redirtied, it needs to be kept locked | |
78 | across the entire operation. | |
79 | ||
80 | + ->swapon() will be called with the file as its argument on files backing | |
81 | +(non block device backed) swapfiles. A return value of zero indicates success, | |
82 | +in which case this file can be used for backing swapspace. The swapspace | |
83 | +operations will be proxied to the address space operations. | |
84 | + | |
85 | + ->swapoff() will be called in the sys_swapoff() path when ->swapon() | |
86 | +returned success. | |
87 | + | |
88 | + ->swap_out() when swapon() returned success, this method is used to | |
89 | +write the swap page. | |
90 | + | |
91 | + ->swap_in() when swapon() returned success, this method is used to | |
92 | +read the swap page. | |
93 | + | |
94 | Note: currently almost all instances of address_space methods are | |
95 | using BKL for internal serialization and that's one of the worst sources | |
96 | of contention. Normally they are calling library functions (in fs/buffer.c) | |
97 | --- a/Documentation/filesystems/vfs.txt | |
98 | +++ b/Documentation/filesystems/vfs.txt | |
99 | @@ -539,6 +539,11 @@ struct address_space_operations { | |
100 | /* migrate the contents of a page to the specified target */ | |
101 | int (*migratepage) (struct page *, struct page *); | |
102 | int (*launder_page) (struct page *); | |
103 | + int (*swapon)(struct file *); | |
104 | + int (*swapoff)(struct file *); | |
105 | + int (*swap_out)(struct file *file, struct page *page, | |
106 | + struct writeback_control *wbc); | |
107 | + int (*swap_in)(struct file *file, struct page *page); | |
108 | }; | |
109 | ||
110 | writepage: called by the VM to write a dirty page to backing store. | |
111 | @@ -724,6 +729,19 @@ struct address_space_operations { | |
112 | prevent redirtying the page, it is kept locked during the whole | |
113 | operation. | |
114 | ||
115 | + swapon: Called when swapon is used on a file. A | |
116 | + return value of zero indicates success, in which case this | |
117 | + file can be used to back swapspace. The swapspace operations | |
118 | + will be proxied to this address space's ->swap_{out,in} methods. | |
119 | + | |
120 | + swapoff: Called during swapoff on files where swapon was successful. | |
121 | + | |
122 | + swap_out: Called to write a swapcache page to a backing store, similar to | |
123 | + writepage. | |
124 | + | |
125 | + swap_in: Called to read a swapcache page from a backing store, similar to | |
126 | + readpage. | |
127 | + | |
128 | The File Object | |
129 | =============== | |
130 | ||
131 | --- a/include/linux/buffer_head.h | |
132 | +++ b/include/linux/buffer_head.h | |
133 | @@ -345,7 +345,7 @@ static inline void invalidate_inode_buff | |
134 | static inline int remove_inode_buffers(struct inode *inode) { return 1; } | |
135 | static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } | |
136 | static inline void invalidate_bdev(struct block_device *bdev) {} | |
137 | - | |
138 | +static inline void block_sync_page(struct page *) { } | |
139 | ||
140 | #endif /* CONFIG_BLOCK */ | |
141 | #endif /* _LINUX_BUFFER_HEAD_H */ | |
142 | --- a/include/linux/fs.h | |
143 | +++ b/include/linux/fs.h | |
144 | @@ -525,6 +525,15 @@ struct address_space_operations { | |
145 | int (*launder_page) (struct page *); | |
146 | int (*is_partially_uptodate) (struct page *, read_descriptor_t *, | |
147 | unsigned long); | |
148 | + | |
149 | + /* | |
150 | + * swapfile support | |
151 | + */ | |
152 | + int (*swapon)(struct file *file); | |
153 | + int (*swapoff)(struct file *file); | |
154 | + int (*swap_out)(struct file *file, struct page *page, | |
155 | + struct writeback_control *wbc); | |
156 | + int (*swap_in)(struct file *file, struct page *page); | |
157 | }; | |
158 | ||
159 | /* | |
160 | --- a/include/linux/swap.h | |
161 | +++ b/include/linux/swap.h | |
162 | @@ -120,6 +120,7 @@ enum { | |
163 | SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ | |
164 | SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ | |
165 | SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), | |
166 | + SWP_FILE = (1 << 2), /* file swap area */ | |
167 | /* add others here before... */ | |
168 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ | |
169 | }; | |
170 | @@ -217,6 +218,8 @@ extern void swap_unplug_io_fn(struct bac | |
171 | /* linux/mm/page_io.c */ | |
172 | extern int swap_readpage(struct file *, struct page *); | |
173 | extern int swap_writepage(struct page *page, struct writeback_control *wbc); | |
174 | +extern void swap_sync_page(struct page *page); | |
175 | +extern int swap_set_page_dirty(struct page *page); | |
176 | extern void end_swap_bio_read(struct bio *bio, int err); | |
177 | ||
178 | /* linux/mm/swap_state.c */ | |
179 | @@ -249,6 +252,7 @@ extern unsigned int count_swap_pages(int | |
180 | extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); | |
181 | extern sector_t swapdev_block(int, pgoff_t); | |
182 | extern struct swap_info_struct *get_swap_info_struct(unsigned); | |
183 | +extern struct swap_info_struct *page_swap_info(struct page *); | |
184 | extern int can_share_swap_page(struct page *); | |
185 | extern int remove_exclusive_swap_page(struct page *); | |
186 | struct backing_dev_info; | |
187 | --- a/mm/page_io.c | |
188 | +++ b/mm/page_io.c | |
189 | @@ -17,6 +17,7 @@ | |
190 | #include <linux/bio.h> | |
191 | #include <linux/swapops.h> | |
192 | #include <linux/writeback.h> | |
193 | +#include <linux/buffer_head.h> | |
194 | #include <trace/swap.h> | |
195 | #include <asm/pgtable.h> | |
196 | ||
197 | @@ -98,11 +99,23 @@ int swap_writepage(struct page *page, st | |
198 | { | |
199 | struct bio *bio; | |
200 | int ret = 0, rw = WRITE; | |
201 | + struct swap_info_struct *sis = page_swap_info(page); | |
202 | ||
203 | if (remove_exclusive_swap_page(page)) { | |
204 | unlock_page(page); | |
205 | goto out; | |
206 | } | |
207 | + | |
208 | + if (sis->flags & SWP_FILE) { | |
209 | + struct file *swap_file = sis->swap_file; | |
210 | + struct address_space *mapping = swap_file->f_mapping; | |
211 | + | |
212 | + ret = mapping->a_ops->swap_out(swap_file, page, wbc); | |
213 | + if (!ret) | |
214 | + count_vm_event(PSWPOUT); | |
215 | + return ret; | |
216 | + } | |
217 | + | |
218 | bio = get_swap_bio(GFP_NOIO, page_private(page), page, | |
219 | end_swap_bio_write); | |
220 | if (bio == NULL) { | |
221 | @@ -122,13 +135,52 @@ out: | |
222 | return ret; | |
223 | } | |
224 | ||
225 | +void swap_sync_page(struct page *page) | |
226 | +{ | |
227 | + struct swap_info_struct *sis = page_swap_info(page); | |
228 | + | |
229 | + if (sis->flags & SWP_FILE) { | |
230 | + struct address_space *mapping = sis->swap_file->f_mapping; | |
231 | + | |
232 | + if (mapping->a_ops->sync_page) | |
233 | + mapping->a_ops->sync_page(page); | |
234 | + } else { | |
235 | + block_sync_page(page); | |
236 | + } | |
237 | +} | |
238 | + | |
239 | +int swap_set_page_dirty(struct page *page) | |
240 | +{ | |
241 | + struct swap_info_struct *sis = page_swap_info(page); | |
242 | + | |
243 | + if (sis->flags & SWP_FILE) { | |
244 | + struct address_space *mapping = sis->swap_file->f_mapping; | |
245 | + | |
246 | + return mapping->a_ops->set_page_dirty(page); | |
247 | + } else { | |
248 | + return __set_page_dirty_nobuffers(page); | |
249 | + } | |
250 | +} | |
251 | + | |
252 | int swap_readpage(struct file *file, struct page *page) | |
253 | { | |
254 | struct bio *bio; | |
255 | int ret = 0; | |
256 | + struct swap_info_struct *sis = page_swap_info(page); | |
257 | ||
258 | BUG_ON(!PageLocked(page)); | |
259 | BUG_ON(PageUptodate(page)); | |
260 | + | |
261 | + if (sis->flags & SWP_FILE) { | |
262 | + struct file *swap_file = sis->swap_file; | |
263 | + struct address_space *mapping = swap_file->f_mapping; | |
264 | + | |
265 | + ret = mapping->a_ops->swap_in(swap_file, page); | |
266 | + if (!ret) | |
267 | + count_vm_event(PSWPIN); | |
268 | + return ret; | |
269 | + } | |
270 | + | |
271 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | |
272 | end_swap_bio_read); | |
273 | if (bio == NULL) { | |
274 | --- a/mm/swapfile.c | |
275 | +++ b/mm/swapfile.c | |
276 | @@ -1018,6 +1018,14 @@ static void destroy_swap_extents(struct | |
277 | list_del(&se->list); | |
278 | kfree(se); | |
279 | } | |
280 | + | |
281 | + if (sis->flags & SWP_FILE) { | |
282 | + struct file *swap_file = sis->swap_file; | |
283 | + struct address_space *mapping = swap_file->f_mapping; | |
284 | + | |
285 | + sis->flags &= ~SWP_FILE; | |
286 | + mapping->a_ops->swapoff(swap_file); | |
287 | + } | |
288 | } | |
289 | ||
290 | /* | |
291 | @@ -1092,7 +1100,9 @@ add_swap_extent(struct swap_info_struct | |
292 | */ | |
293 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |
294 | { | |
295 | - struct inode *inode; | |
296 | + struct file *swap_file = sis->swap_file; | |
297 | + struct address_space *mapping = swap_file->f_mapping; | |
298 | + struct inode *inode = mapping->host; | |
299 | unsigned blocks_per_page; | |
300 | unsigned long page_no; | |
301 | unsigned blkbits; | |
302 | @@ -1103,13 +1113,22 @@ static int setup_swap_extents(struct swa | |
303 | int nr_extents = 0; | |
304 | int ret; | |
305 | ||
306 | - inode = sis->swap_file->f_mapping->host; | |
307 | if (S_ISBLK(inode->i_mode)) { | |
308 | ret = add_swap_extent(sis, 0, sis->max, 0); | |
309 | *span = sis->pages; | |
310 | goto done; | |
311 | } | |
312 | ||
313 | + if (mapping->a_ops->swapon) { | |
314 | + ret = mapping->a_ops->swapon(swap_file); | |
315 | + if (!ret) { | |
316 | + sis->flags |= SWP_FILE; | |
317 | + ret = add_swap_extent(sis, 0, sis->max, 0); | |
318 | + *span = sis->pages; | |
319 | + } | |
320 | + goto done; | |
321 | + } | |
322 | + | |
323 | blkbits = inode->i_blkbits; | |
324 | blocks_per_page = PAGE_SIZE >> blkbits; | |
325 | ||
326 | @@ -1683,7 +1702,7 @@ SYSCALL_DEFINE2(swapon, const char __use | |
327 | else | |
328 | p->prio = --least_priority; | |
329 | p->swap_map = swap_map; | |
330 | - p->flags = SWP_ACTIVE; | |
331 | + p->flags |= SWP_WRITEOK; | |
332 | nr_swap_pages += nr_good_pages; | |
333 | total_swap_pages += nr_good_pages; | |
334 | ||
335 | @@ -1823,6 +1842,13 @@ get_swap_info_struct(unsigned type) | |
336 | return &swap_info[type]; | |
337 | } | |
338 | ||
339 | +struct swap_info_struct *page_swap_info(struct page *page) | |
340 | +{ | |
341 | + swp_entry_t swap = { .val = page_private(page) }; | |
342 | + BUG_ON(!PageSwapCache(page)); | |
343 | + return &swap_info[swp_type(swap)]; | |
344 | +} | |
345 | + | |
346 | /* | |
347 | * swap_lock prevents swap_map being freed. Don't grab an extra | |
348 | * reference on the swaphandle, it doesn't matter if it becomes unused. | |
349 | --- a/mm/swap_state.c | |
350 | +++ b/mm/swap_state.c | |
351 | @@ -27,8 +27,8 @@ | |
352 | */ | |
353 | static const struct address_space_operations swap_aops = { | |
354 | .writepage = swap_writepage, | |
355 | - .sync_page = block_sync_page, | |
356 | - .set_page_dirty = __set_page_dirty_nobuffers, | |
357 | + .sync_page = swap_sync_page, | |
358 | + .set_page_dirty = swap_set_page_dirty, | |
359 | .migratepage = migrate_page, | |
360 | }; | |
361 |