]> git.ipfire.org Git - people/pmueller/ipfire-2.x.git/blob - src/patches/suse-2.6.27.25/patches.suse/xfs-nfsd-dmapi-aware
Updated xen patches taken from suse.
[people/pmueller/ipfire-2.x.git] / src / patches / suse-2.6.27.25 / patches.suse / xfs-nfsd-dmapi-aware
1 From: Greg Banks
2 Subject: Make NFSD DMAPI aware
3 References: 74107, 173874, bnc#450658
4 Patch-mainline: obstruction...
5
6 G'day,
7
8 The NFSv3 protocol specifies an error, NFS3ERR_JUKEBOX, which a server
9 should return when an I/O operation will take a very long time.
10 This causes a different pattern of retries in clients, and avoids
11 a number of serious problems associated with I/Os which take longer
12 than an RPC timeout. The Linux knfsd server has code to generate the
13 jukebox error and many NFS clients are known to have working code to
14 handle it.
15
16 One scenario in which a server should emit the JUKEBOX error is when
17 file data which the client is attempting to access is managed by
18 an HSM (Hierarchical Storage Manager) and is not present on the disk
19 and needs to be brought in from tape. Due to the nature of tapes this
20 operation can take minutes rather than the milliseconds normally seen
21 for local file data.
22
23 Currently the Linux knfsd handles this situation poorly. A READ NFS
24 call will cause the nfsd thread handling it to block until the file
25 is available, without sending a reply to the NFS client. After a
26 few seconds the client retries, and this second READ call causes
27 another nfsd to block behind the first one. A few seconds later and
28 the client's retries have blocked *all* the nfsd threads, and all NFS
29 service from the server stops until the original file arrives on disk.
30
31 WRITEs and SETATTRs which truncate the file are marginally better, in
32 that the knfsd dupcache will catch the retries and drop them without
33 blocking an nfsd (the dupcache *will* catch the retries because the
34 cache entry remains in RC_INPROG state and is not reused until the
35 first call finishes). However the first call still blocks, so given
36 WRITEs to enough offline files the server can still be locked up.
37
38 There are also client-side implications, depending on the client
39 implementation. For example, on a Linux client an RPC retry loop uses
40 an RPC request slot, so reads from enough separate offline files can
41 lock up a mountpoint.
42
43 This patch seeks to remedy the interaction between knfsd and HSMs by
44 providing mechanisms to allow knfsd to tell an underlying filesystem
45 (which supports HSMs) not to block for reads, writes and truncates
46 of offline files. It's a port of a Linux 2.4 patch used in SGI's
47 ProPack distro for the last 12 months. The patch:
48
49 * provides a new ATTR_NO_BLOCK flag which the kernel can
50 use to tell a filesystem's inode_ops->setattr() operation not
51 to block when truncating an offline file. XFS already obeys
52 this flag (inside a #ifdef).
53
54 * changes knfsd to provide ATTR_NO_BLOCK when it does the VFS
55 calls to implement the SETATTR NFS call.
56
57 * changes knfsd to supply the O_NONBLOCK flag in the temporary
58 struct file it uses for VFS reads and writes, in order to ask
59 the filesystem not to block when reading or writing an offline
60 file. XFS already obeys this new semantic for O_NONBLOCK
61 (and in SLES9 so does JFS).
62
63 * adds code to translate the -EAGAIN the filesystem returns when
64 it would have blocked, to the -ETIMEDOUT that knfsd expects.
65
66
67 Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
68 (SLES9 patch Acked-by: okir@suse.de)
69 Signed-off-by: NeilBrown <neilb@suse.de>
70 Acked-by: Jan Kara <jack@suse.cz>
71
72 fs/nfsd/vfs.c | 32 ++++++++++++++++++++++++++++++--
73 fs/xfs/linux-2.6/xfs_iops.c | 7 ++++++-
74 include/linux/fs.h | 1 +
75 3 files changed, 37 insertions(+), 3 deletions(-)
76
77
78 --- a/fs/nfsd/vfs.c
79 +++ b/fs/nfsd/vfs.c
80 @@ -358,6 +358,15 @@ nfsd_setattr(struct svc_rqst *rqstp, str
81 goto out_nfserr;
82 }
83 DQUOT_INIT(inode);
84 +
85 + /*
86 + * Tell a Hierarchical Storage Manager (e.g. via DMAPI) to
87 + * return EAGAIN when an action would take minutes instead of
88 + * milliseconds so that NFS can reply to the client with
89 + * NFSERR_JUKEBOX instead of blocking an nfsd thread.
90 + */
91 + if (rqstp->rq_vers >= 3)
92 + iap->ia_valid |= ATTR_NO_BLOCK;
93 }
94
95 /* sanitize the mode change */
96 @@ -389,6 +398,9 @@ nfsd_setattr(struct svc_rqst *rqstp, str
97 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
98 fh_lock(fhp);
99 host_err = notify_change(dentry, iap);
100 + /* to get NFSERR_JUKEBOX on the wire, need -ETIMEDOUT */
101 + if (host_err == -EAGAIN)
102 + host_err = -ETIMEDOUT;
103 err = nfserrno(host_err);
104 fh_unlock(fhp);
105 }
106 @@ -906,6 +918,10 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
107 if (ra && ra->p_set)
108 file->f_ra = ra->p_ra;
109
110 + /* Support HSMs -- see comment in nfsd_setattr() */
111 + if (rqstp->rq_vers >= 3)
112 + file->f_flags |= O_NONBLOCK;
113 +
114 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
115 struct splice_desc sd = {
116 .len = 0,
117 @@ -938,8 +954,12 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st
118 *count = host_err;
119 err = 0;
120 fsnotify_access(file->f_path.dentry);
121 - } else
122 + } else {
123 + /* to get NFSERR_JUKEBOX on the wire, need -ETIMEDOUT */
124 + if (host_err == -EAGAIN)
125 + host_err = -ETIMEDOUT;
126 err = nfserrno(host_err);
127 + }
128 out:
129 return err;
130 }
131 @@ -998,6 +1018,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, s
132 if (stable && !EX_WGATHER(exp))
133 file->f_flags |= O_SYNC;
134
135 + /* Support HSMs -- see comment in nfsd_setattr() */
136 + if (rqstp->rq_vers >= 3)
137 + file->f_flags |= O_NONBLOCK;
138 +
139 /* Write the data. */
140 oldfs = get_fs(); set_fs(KERNEL_DS);
141 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
142 @@ -1050,8 +1074,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, s
143 dprintk("nfsd: write complete host_err=%d\n", host_err);
144 if (host_err >= 0)
145 err = 0;
146 - else
147 + else {
148 + /* to get NFSERR_JUKEBOX on the wire, need -ETIMEDOUT */
149 + if (host_err == -EAGAIN)
150 + host_err = -ETIMEDOUT;
151 err = nfserrno(host_err);
152 + }
153 out:
154 return err;
155 }
156 --- a/fs/xfs/linux-2.6/xfs_iops.c
157 +++ b/fs/xfs/linux-2.6/xfs_iops.c
158 @@ -601,7 +601,12 @@ xfs_vn_setattr(
159 struct dentry *dentry,
160 struct iattr *iattr)
161 {
162 - return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL);
163 + int flags = 0;
164 +#ifdef ATTR_NO_BLOCK
165 + if (iattr->ia_valid & ATTR_NO_BLOCK)
166 + flags |= O_NONBLOCK;
167 +#endif
168 + return -xfs_setattr(XFS_I(dentry->d_inode), iattr, flags, NULL);
169 }
170
171 /*
172 --- a/include/linux/fs.h
173 +++ b/include/linux/fs.h
174 @@ -342,6 +342,7 @@ typedef void (dio_iodone_t)(struct kiocb
175 #define ATTR_KILL_PRIV (1 << 14)
176 #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */
177 #define ATTR_TIMES_SET (1 << 16)
178 +#define ATTR_NO_BLOCK (1 << 17) /* Return EAGAIN and don't block on long truncates */
179
180 /*
181 * This is the Inode Attributes structure, used for notify_change(). It