]>
Commit | Line | Data |
---|---|---|
959ef981 | 1 | // SPDX-License-Identifier: GPL-2.0+ |
a555a1f4 DW |
2 | /* |
3 | * Copyright (C) 2018 Oracle. All Rights Reserved. | |
a555a1f4 | 4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> |
a555a1f4 | 5 | */ |
a440f877 | 6 | #include "xfs.h" |
a555a1f4 DW |
7 | #include <stdint.h> |
8 | #include <stdlib.h> | |
a555a1f4 DW |
9 | #include <unistd.h> |
10 | #include <fcntl.h> | |
11 | #include <sys/types.h> | |
a555a1f4 | 12 | #include <sys/statvfs.h> |
03c0cd8f DW |
13 | #ifdef HAVE_SG_IO |
14 | # include <scsi/sg.h> | |
15 | #endif | |
16 | #ifdef HAVE_HDIO_GETGEO | |
17 | # include <linux/hdreg.h> | |
18 | #endif | |
a555a1f4 | 19 | #include "platform_defs.h" |
660b5d96 | 20 | #include "libfrog/util.h" |
42b4c8e8 | 21 | #include "libfrog/paths.h" |
a555a1f4 | 22 | #include "xfs_scrub.h" |
03c0cd8f | 23 | #include "common.h" |
a555a1f4 | 24 | #include "disk.h" |
ae14fe63 | 25 | #include "platform_defs.h" |
a555a1f4 DW |
26 | |
27 | #ifndef BLKROTATIONAL | |
28 | # define BLKROTATIONAL _IO(0x12, 126) | |
29 | #endif | |
30 | ||
31 | /* | |
32 | * Disk Abstraction | |
33 | * | |
34 | * These routines help us to discover the geometry of a block device, | |
35 | * estimate the amount of concurrent IOs that we can send to it, and | |
36 | * abstract the process of performing read verification of disk blocks. | |
37 | */ | |
38 | ||
39 | /* Figure out how many disk heads are available. */ | |
40 | static unsigned int | |
41 | __disk_heads( | |
42 | struct disk *disk) | |
43 | { | |
44 | int iomin; | |
45 | int ioopt; | |
ae14fe63 | 46 | int nproc = platform_nproc(); |
a555a1f4 DW |
47 | unsigned short rot; |
48 | int error; | |
49 | ||
50 | /* If it's not a block device, throw all the CPUs at it. */ | |
51 | if (!S_ISBLK(disk->d_sb.st_mode)) | |
52 | return nproc; | |
53 | ||
54 | /* Non-rotational device? Throw all the CPUs at the problem. */ | |
55 | rot = 1; | |
56 | error = ioctl(disk->d_fd, BLKROTATIONAL, &rot); | |
57 | if (error == 0 && rot == 0) | |
58 | return nproc; | |
59 | ||
60 | /* | |
61 | * Sometimes we can infer the number of devices from the | |
62 | * min/optimal IO sizes. | |
63 | */ | |
64 | iomin = ioopt = 0; | |
65 | if (ioctl(disk->d_fd, BLKIOMIN, &iomin) == 0 && | |
66 | ioctl(disk->d_fd, BLKIOOPT, &ioopt) == 0 && | |
67 | iomin > 0 && ioopt > 0) { | |
68 | return min(nproc, max(1, ioopt / iomin)); | |
69 | } | |
70 | ||
71 | /* Rotating device? I guess? */ | |
72 | return 2; | |
73 | } | |
74 | ||
75 | /* Figure out how many disk heads are available. */ | |
76 | unsigned int | |
77 | disk_heads( | |
78 | struct disk *disk) | |
79 | { | |
32c6cc09 DW |
80 | if (force_nr_threads) |
81 | return force_nr_threads; | |
a555a1f4 DW |
82 | return __disk_heads(disk); |
83 | } | |
84 | ||
03c0cd8f DW |
/*
 * Execute a SCSI VERIFY(16) to verify disk contents.
 * For devices that support this command, this can sharply reduce the
 * runtime of the data block verification phase if the storage device's
 * internal bandwidth exceeds its link bandwidth.  However, it only
 * works if we're talking to a raw SCSI device, and only if we trust the
 * firmware.
 *
 * @startblock and @blockcount are in units of the device's logical block
 * size.  Returns the number of bytes verified, or -1 with errno set.
 */
#ifdef HAVE_SG_IO
# define SENSE_BUF_LEN		64
# define VERIFY16_CMDLEN	16
# define VERIFY16_CMD		0x8F

# ifndef SG_FLAG_Q_AT_TAIL
#  define SG_FLAG_Q_AT_TAIL	0x10
# endif
static ssize_t
disk_scsi_verify(
	struct disk		*disk,
	uint64_t		startblock, /* lba */
	uint64_t		blockcount) /* lba */
{
	struct sg_io_hdr	iohdr;
	unsigned char		cdb[VERIFY16_CMDLEN];
	unsigned char		sense[SENSE_BUF_LEN];
	uint64_t		llba;
	uint64_t		max_len;
	uint64_t		veri_len = blockcount;
	int			error;

	assert(!debug_tweak_on("XFS_SCRUB_NO_SCSI_VERIFY"));

	/*
	 * d_start is a byte offset, so it has to be converted with the
	 * device's logical block size (not the 512-byte basic block size)
	 * before it can be added to an LBA.
	 */
	llba = startblock + (disk->d_start >> disk->d_lbalog);

	/*
	 * The VERIFY(16) verification length field is only 32 bits wide,
	 * and the byte count we hand back has to fit in ssize_t.  Clamp
	 * oversized requests instead of silently truncating them; the
	 * caller then sees a short verify, much like a short pread().
	 */
	max_len = (uint64_t)(SIZE_MAX / 2) >> disk->d_lbalog;
	if (max_len > UINT32_MAX)
		max_len = UINT32_MAX;
	if (veri_len > max_len)
		veri_len = max_len;

	/* Borrowed from sg_verify */
	cdb[0] = VERIFY16_CMD;
	cdb[1] = 0; /* skip PI, DPO, and byte check. */
	cdb[2] = (llba >> 56) & 0xff;
	cdb[3] = (llba >> 48) & 0xff;
	cdb[4] = (llba >> 40) & 0xff;
	cdb[5] = (llba >> 32) & 0xff;
	cdb[6] = (llba >> 24) & 0xff;
	cdb[7] = (llba >> 16) & 0xff;
	cdb[8] = (llba >> 8) & 0xff;
	cdb[9] = llba & 0xff;
	cdb[10] = (veri_len >> 24) & 0xff;
	cdb[11] = (veri_len >> 16) & 0xff;
	cdb[12] = (veri_len >> 8) & 0xff;
	cdb[13] = veri_len & 0xff;
	cdb[14] = 0;
	cdb[15] = 0;
	memset(sense, 0, SENSE_BUF_LEN);

	/* v3 SG_IO */
	memset(&iohdr, 0, sizeof(iohdr));
	iohdr.interface_id = 'S';
	iohdr.dxfer_direction = SG_DXFER_NONE;
	iohdr.cmdp = cdb;
	iohdr.cmd_len = VERIFY16_CMDLEN;
	iohdr.sbp = sense;
	iohdr.mx_sb_len = SENSE_BUF_LEN;
	iohdr.flags |= SG_FLAG_Q_AT_TAIL;
	iohdr.timeout = 30000; /* 30s */

	error = ioctl(disk->d_fd, SG_IO, &iohdr);
	if (error < 0)
		return error;

	dbg_printf("VERIFY(16) fd %d lba %"PRIu64" len %"PRIu64" info %x "
			"status %d masked %d msg %d host %d driver %d "
			"duration %d resid %d\n",
			disk->d_fd, startblock, veri_len, iohdr.info,
			iohdr.status, iohdr.masked_status, iohdr.msg_status,
			iohdr.host_status, iohdr.driver_status, iohdr.duration,
			iohdr.resid);

	/* The ioctl itself worked, but the command did not complete cleanly. */
	if (iohdr.info & SG_INFO_CHECK) {
		dbg_printf("status: msg %x host %x driver %x\n",
				iohdr.msg_status, iohdr.host_status,
				iohdr.driver_status);
		errno = EIO;
		return -1;
	}

	/* Report bytes verified, using the device's logical block size. */
	return (ssize_t)(veri_len << disk->d_lbalog);
}
#else
/*
 * No SG_IO support; fail the same way the real function does (-1 with
 * errno set) so that a failure can never be mistaken for a byte count.
 */
# define disk_scsi_verify(...)		(errno = ENOTTY, -1)
#endif /* HAVE_SG_IO */

/*
 * Test whether the device accepts SCSI VERIFY by verifying its first
 * logical block.
 */
static bool
disk_can_scsi_verify(
	struct disk		*disk)
{
	if (debug_tweak_on("XFS_SCRUB_NO_SCSI_VERIFY"))
		return false;

	/*
	 * disk_scsi_verify returns the number of bytes verified, so a
	 * successful one-block probe yields a positive value, never zero.
	 */
	return disk_scsi_verify(disk, 0, 1) > 0;
}
187 | ||
a555a1f4 DW |
188 | /* Open a disk device and discover its geometry. */ |
189 | struct disk * | |
190 | disk_open( | |
191 | const char *pathname) | |
192 | { | |
03c0cd8f DW |
193 | #ifdef HAVE_HDIO_GETGEO |
194 | struct hd_geometry bdgeo; | |
195 | #endif | |
a555a1f4 | 196 | struct disk *disk; |
03c0cd8f | 197 | bool suspicious_disk = false; |
a555a1f4 DW |
198 | int error; |
199 | ||
200 | disk = calloc(1, sizeof(struct disk)); | |
201 | if (!disk) | |
202 | return NULL; | |
203 | ||
204 | disk->d_fd = open(pathname, O_RDONLY | O_DIRECT | O_NOATIME); | |
205 | if (disk->d_fd < 0) | |
206 | goto out_free; | |
207 | ||
208 | /* Try to get LBA size. */ | |
20e10ad4 | 209 | error = ioctl(disk->d_fd, BLKSSZGET, &disk->d_lbasize); |
a555a1f4 | 210 | if (error) |
20e10ad4 DW |
211 | disk->d_lbasize = 512; |
212 | disk->d_lbalog = log2_roundup(disk->d_lbasize); | |
a555a1f4 DW |
213 | |
214 | /* Obtain disk's stat info. */ | |
215 | error = fstat(disk->d_fd, &disk->d_sb); | |
216 | if (error) | |
217 | goto out_close; | |
218 | ||
219 | /* Determine bdev size, block size, and offset. */ | |
220 | if (S_ISBLK(disk->d_sb.st_mode)) { | |
221 | error = ioctl(disk->d_fd, BLKGETSIZE64, &disk->d_size); | |
222 | if (error) | |
223 | disk->d_size = 0; | |
224 | error = ioctl(disk->d_fd, BLKBSZGET, &disk->d_blksize); | |
225 | if (error) | |
226 | disk->d_blksize = 0; | |
03c0cd8f DW |
227 | #ifdef HAVE_HDIO_GETGEO |
228 | error = ioctl(disk->d_fd, HDIO_GETGEO, &bdgeo); | |
229 | if (!error) { | |
230 | /* | |
231 | * dm devices will pass through ioctls, which means | |
232 | * we can't use SCSI VERIFY unless the start is 0. | |
233 | * Most dm devices don't set geometry (unlike scsi | |
234 | * and nvme) so use a zeroed out CHS to screen them | |
235 | * out. | |
236 | */ | |
237 | if (bdgeo.start != 0 && | |
238 | (unsigned long long)bdgeo.heads * bdgeo.sectors * | |
239 | bdgeo.sectors == 0) | |
240 | suspicious_disk = true; | |
241 | disk->d_start = bdgeo.start << BBSHIFT; | |
242 | } else | |
243 | #endif | |
244 | disk->d_start = 0; | |
a555a1f4 DW |
245 | } else { |
246 | disk->d_size = disk->d_sb.st_size; | |
247 | disk->d_blksize = disk->d_sb.st_blksize; | |
248 | disk->d_start = 0; | |
249 | } | |
250 | ||
03c0cd8f DW |
251 | /* Can we issue SCSI VERIFY? */ |
252 | if (!suspicious_disk && disk_can_scsi_verify(disk)) | |
253 | disk->d_flags |= DISK_FLAG_SCSI_VERIFY; | |
254 | ||
a555a1f4 DW |
255 | return disk; |
256 | out_close: | |
257 | close(disk->d_fd); | |
258 | out_free: | |
259 | free(disk); | |
260 | return NULL; | |
261 | } | |
262 | ||
263 | /* Close a disk device. */ | |
264 | int | |
265 | disk_close( | |
266 | struct disk *disk) | |
267 | { | |
268 | int error = 0; | |
269 | ||
270 | if (disk->d_fd >= 0) | |
271 | error = close(disk->d_fd); | |
272 | disk->d_fd = -1; | |
273 | free(disk); | |
274 | return error; | |
275 | } | |
276 | ||
03c0cd8f DW |
/* Size of one logical block of disk @d, in bytes. */
#define LBASIZE(d)		(1ULL << (d)->d_lbalog)

/* Convert a byte count to logical blocks of disk @d, rounding down. */
#define BTOLBAT(d, bytes)	((uint64_t)(bytes) >> (d)->d_lbalog)

/* Convert a byte count to logical blocks of disk @d, rounding up. */
#define BTOLBA(d, bytes)	\
	(((uint64_t)(bytes) + (LBASIZE(d) - 1)) >> (d)->d_lbalog)
280 | ||
cac2b8b0 DW |
281 | /* Simulate disk errors. */ |
282 | static int | |
283 | disk_simulate_read_error( | |
284 | struct disk *disk, | |
285 | uint64_t start, | |
286 | uint64_t *length) | |
287 | { | |
288 | static int64_t interval; | |
289 | uint64_t start_interval; | |
290 | ||
291 | /* Simulated disk errors are disabled. */ | |
292 | if (interval < 0) | |
293 | return 0; | |
294 | ||
295 | /* Figure out the disk read error interval. */ | |
296 | if (interval == 0) { | |
297 | char *p; | |
298 | ||
299 | /* Pretend there's bad media every so often, in bytes. */ | |
300 | p = getenv("XFS_SCRUB_DISK_ERROR_INTERVAL"); | |
301 | if (p == NULL) { | |
302 | interval = -1; | |
303 | return 0; | |
304 | } | |
305 | interval = strtoull(p, NULL, 10); | |
306 | interval &= ~((1U << disk->d_lbalog) - 1); | |
307 | } | |
b8302b7f DW |
308 | if (interval <= 0) { |
309 | interval = -1; | |
310 | return 0; | |
311 | } | |
cac2b8b0 DW |
312 | |
313 | /* | |
314 | * We simulate disk errors by pretending that there are media errors at | |
315 | * predetermined intervals across the disk. If a read verify request | |
316 | * crosses one of those intervals we shorten it so that the next read | |
317 | * will start on an interval threshold. If the read verify request | |
318 | * starts on an interval threshold, we send back EIO as if it had | |
319 | * failed. | |
320 | */ | |
321 | if ((start % interval) == 0) { | |
322 | dbg_printf("fd %d: simulating disk error at %"PRIu64".\n", | |
323 | disk->d_fd, start); | |
324 | return EIO; | |
325 | } | |
326 | ||
327 | start_interval = start / interval; | |
328 | if (start_interval != (start + *length) / interval) { | |
329 | *length = ((start_interval + 1) * interval) - start; | |
330 | dbg_printf( | |
331 | "fd %d: simulating short read at %"PRIu64" to length %"PRIu64".\n", | |
332 | disk->d_fd, start, *length); | |
333 | } | |
334 | ||
335 | return 0; | |
336 | } | |
337 | ||
a555a1f4 DW |
338 | /* Read-verify an extent of a disk device. */ |
339 | ssize_t | |
340 | disk_read_verify( | |
341 | struct disk *disk, | |
342 | void *buf, | |
343 | uint64_t start, | |
344 | uint64_t length) | |
345 | { | |
cac2b8b0 DW |
346 | if (debug) { |
347 | int ret; | |
348 | ||
349 | ret = disk_simulate_read_error(disk, start, &length); | |
350 | if (ret) { | |
351 | errno = ret; | |
352 | return -1; | |
353 | } | |
354 | ||
355 | /* Don't actually issue the IO */ | |
356 | if (getenv("XFS_SCRUB_DISK_VERIFY_SKIP")) | |
357 | return length; | |
358 | } | |
359 | ||
03c0cd8f DW |
360 | /* Convert to logical block size. */ |
361 | if (disk->d_flags & DISK_FLAG_SCSI_VERIFY) | |
362 | return disk_scsi_verify(disk, BTOLBAT(disk, start), | |
363 | BTOLBA(disk, length)); | |
364 | ||
a555a1f4 DW |
365 | return pread(disk->d_fd, buf, length, start); |
366 | } |