]>
Commit | Line | Data |
---|---|---|
2cb7cef9 BS |
1 | From: Greg Banks |
2 | Subject: Make NFSD DMAPI aware | |
3 | References: 74107, 173874, bnc#450658 | |
4 | Patch-mainline: obstruction... | |
5 | ||
6 | G'day, | |
7 | ||
8 | The NFSv3 protocol specifies an error, NFS3ERR_JUKEBOX, which a server | |
9 | should return when an I/O operation will take a very long time. | |
10 | This causes a different pattern of retries in clients, and avoids | |
11 | a number of serious problems associated with I/Os which take longer | |
12 | than an RPC timeout. The Linux knfsd server has code to generate the | |
13 | jukebox error and many NFS clients are known to have working code to | |
14 | handle it. | |
15 | ||
16 | One scenario in which a server should emit the JUKEBOX error is when | |
17 | file data which the client is attempting to access is managed by | |
18 | an HSM (Hierarchical Storage Manager) and is not present on the disk | |
19 | and needs to be brought in from tape. Due to the nature of tapes this | |
20 | operation can take minutes rather than the milliseconds normally seen | |
21 | for local file data. | |
22 | ||
23 | Currently the Linux knfsd handles this situation poorly. A READ NFS | |
24 | call will cause the nfsd thread handling it to block until the file | |
25 | is available, without sending a reply to the NFS client. After a | |
26 | few seconds the client retries, and this second READ call causes | |
27 | another nfsd to block behind the first one. A few seconds later and | |
28 | the client's retries have blocked *all* the nfsd threads, and all NFS | |
29 | service from the server stops until the original file arrives on disk. | |
30 | ||
31 | WRITEs and SETATTRs which truncate the file are marginally better, in | |
32 | that the knfsd dupcache will catch the retries and drop them without | |
33 | blocking an nfsd (the dupcache *will* catch the retries because the | |
34 | cache entry remains in RC_INPROG state and is not reused until the | |
35 | first call finishes). However the first call still blocks, so given | |
36 | WRITEs to enough offline files the server can still be locked up. | |
37 | ||
38 | There are also client-side implications, depending on the client | |
39 | implementation. For example, on a Linux client an RPC retry loop uses | |
40 | an RPC request slot, so reads from enough separate offline files can | |
41 | lock up a mountpoint. | |
42 | ||
43 | This patch seeks to remedy the interaction between knfsd and HSMs by | |
44 | providing mechanisms to allow knfsd to tell an underlying filesystem | |
45 | (which supports HSMs) not to block for reads, writes and truncates | |
46 | of offline files. It's a port of a Linux 2.4 patch used in SGI's | |
47 | ProPack distro for the last 12 months. The patch: | |
48 | ||
49 | * provides a new ATTR_NO_BLOCK flag which the kernel can | |
50 | use to tell a filesystem's inode_ops->setattr() operation not | |
51 | to block when truncating an offline file. XFS already obeys | |
52 | this flag (inside a #ifdef). | |
53 | ||
54 | * changes knfsd to provide ATTR_NO_BLOCK when it does the VFS | |
55 | calls to implement the SETATTR NFS call. | |
56 | ||
57 | * changes knfsd to supply the O_NONBLOCK flag in the temporary | |
58 | struct file it uses for VFS reads and writes, in order to ask | |
59 | the filesystem not to block when reading or writing an offline | |
60 | file. XFS already obeys this new semantic for O_NONBLOCK | |
61 | (and in SLES9 so does JFS). | |
62 | ||
63 | * adds code to translate the -EAGAIN the filesystem returns when | |
64 | it would have blocked, to the -ETIMEDOUT that knfsd expects. | |
65 | ||
66 | ||
67 | Signed-off-by: Greg Banks <gnb@melbourne.sgi.com> | |
68 | (SLES9 patch Acked-by: okir@suse.de) | |
69 | Signed-off-by: NeilBrown <neilb@suse.de> | |
70 | Acked-by: Jan Kara <jack@suse.cz> | |
71 | ||
72 | fs/nfsd/vfs.c | 32 ++++++++++++++++++++++++++++++-- | |
73 | fs/xfs/linux-2.6/xfs_iops.c | 7 ++++++- | |
74 | include/linux/fs.h | 1 + | |
75 | 3 files changed, 37 insertions(+), 3 deletions(-) | |
76 | ||
77 | ||
78 | --- a/fs/nfsd/vfs.c | |
79 | +++ b/fs/nfsd/vfs.c | |
80 | @@ -358,6 +358,15 @@ nfsd_setattr(struct svc_rqst *rqstp, str | |
81 | goto out_nfserr; | |
82 | } | |
83 | DQUOT_INIT(inode); | |
84 | + | |
85 | + /* | |
86 | + * Tell a Hierarchical Storage Manager (e.g. via DMAPI) to | |
87 | + * return EAGAIN when an action would take minutes instead of | |
88 | + * milliseconds so that NFS can reply to the client with | |
89 | + * NFSERR_JUKEBOX instead of blocking an nfsd thread. | |
90 | + */ | |
91 | + if (rqstp->rq_vers >= 3) | |
92 | + iap->ia_valid |= ATTR_NO_BLOCK; | |
93 | } | |
94 | ||
95 | /* sanitize the mode change */ | |
96 | @@ -389,6 +398,9 @@ nfsd_setattr(struct svc_rqst *rqstp, str | |
97 | if (!check_guard || guardtime == inode->i_ctime.tv_sec) { | |
98 | fh_lock(fhp); | |
99 | host_err = notify_change(dentry, iap); | |
100 | + /* to get NFSERR_JUKEBOX on the wire, need -ETIMEDOUT */ | |
101 | + if (host_err == -EAGAIN) | |
102 | + host_err = -ETIMEDOUT; | |
103 | err = nfserrno(host_err); | |
104 | fh_unlock(fhp); | |
105 | } | |
106 | @@ -906,6 +918,10 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st | |
107 | if (ra && ra->p_set) | |
108 | file->f_ra = ra->p_ra; | |
109 | ||
110 | + /* Support HSMs -- see comment in nfsd_setattr() */ | |
111 | + if (rqstp->rq_vers >= 3) | |
112 | + file->f_flags |= O_NONBLOCK; | |
113 | + | |
114 | if (file->f_op->splice_read && rqstp->rq_splice_ok) { | |
115 | struct splice_desc sd = { | |
116 | .len = 0, | |
117 | @@ -938,8 +954,12 @@ nfsd_vfs_read(struct svc_rqst *rqstp, st | |
118 | *count = host_err; | |
119 | err = 0; | |
120 | fsnotify_access(file->f_path.dentry); | |
121 | - } else | |
122 | + } else { | |
123 | + /* to get NFSERR_JUKEBOX on the wire, need -ETIMEDOUT */ | |
124 | + if (host_err == -EAGAIN) | |
125 | + host_err = -ETIMEDOUT; | |
126 | err = nfserrno(host_err); | |
127 | + } | |
128 | out: | |
129 | return err; | |
130 | } | |
131 | @@ -998,6 +1018,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, s | |
132 | if (stable && !EX_WGATHER(exp)) | |
133 | file->f_flags |= O_SYNC; | |
134 | ||
135 | + /* Support HSMs -- see comment in nfsd_setattr() */ | |
136 | + if (rqstp->rq_vers >= 3) | |
137 | + file->f_flags |= O_NONBLOCK; | |
138 | + | |
139 | /* Write the data. */ | |
140 | oldfs = get_fs(); set_fs(KERNEL_DS); | |
141 | host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); | |
142 | @@ -1050,8 +1074,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, s | |
143 | dprintk("nfsd: write complete host_err=%d\n", host_err); | |
144 | if (host_err >= 0) | |
145 | err = 0; | |
146 | - else | |
147 | + else { | |
148 | + /* to get NFSERR_JUKEBOX on the wire, need -ETIMEDOUT */ | |
149 | + if (host_err == -EAGAIN) | |
150 | + host_err = -ETIMEDOUT; | |
151 | err = nfserrno(host_err); | |
152 | + } | |
153 | out: | |
154 | return err; | |
155 | } | |
156 | --- a/fs/xfs/linux-2.6/xfs_iops.c | |
157 | +++ b/fs/xfs/linux-2.6/xfs_iops.c | |
158 | @@ -601,7 +601,12 @@ xfs_vn_setattr( | |
159 | struct dentry *dentry, | |
160 | struct iattr *iattr) | |
161 | { | |
162 | - return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); | |
163 | + int flags = 0; | |
164 | +#ifdef ATTR_NO_BLOCK | |
165 | + if (iattr->ia_valid & ATTR_NO_BLOCK) | |
166 | + flags |= O_NONBLOCK; | |
167 | +#endif | |
168 | + return -xfs_setattr(XFS_I(dentry->d_inode), iattr, flags, NULL); | |
169 | } | |
170 | ||
171 | /* | |
172 | --- a/include/linux/fs.h | |
173 | +++ b/include/linux/fs.h | |
174 | @@ -342,6 +342,7 @@ typedef void (dio_iodone_t)(struct kiocb | |
175 | #define ATTR_KILL_PRIV (1 << 14) | |
176 | #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ | |
177 | #define ATTR_TIMES_SET (1 << 16) | |
178 | +#define ATTR_NO_BLOCK (1 << 17) /* Return EAGAIN and don't block on long truncates */ | |
179 | ||
180 | /* | |
181 | * This is the Inode Attributes structure, used for notify_change(). It |