2 * unix_io.c --- This is the Unix (well, really POSIX) implementation
5 * Implements a one-block write-through cache.
7 * Includes support for Windows NT support under Cygwin.
9 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
10 * 2002 by Theodore Ts'o.
13 * This file may be redistributed under the terms of the GNU Library
14 * General Public License, version 2.
18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
19 #define _XOPEN_SOURCE 600
20 #define _DARWIN_C_SOURCE
21 #define _FILE_OFFSET_BITS 64
22 #ifndef _LARGEFILE_SOURCE
23 #define _LARGEFILE_SOURCE
25 #ifndef _LARGEFILE64_SOURCE
26 #define _LARGEFILE64_SOURCE
45 #include <sys/utsname.h>
48 #include <sys/types.h>
50 #ifdef HAVE_SYS_IOCTL_H
51 #include <sys/ioctl.h>
53 #ifdef HAVE_SYS_MOUNT_H
54 #include <sys/mount.h>
56 #ifdef HAVE_SYS_PRCTL_H
57 #include <sys/prctl.h>
59 #define PR_GET_DUMPABLE 3
64 #if HAVE_SYS_RESOURCE_H
65 #include <sys/resource.h>
67 #if HAVE_LINUX_FALLOC_H
68 #include <linux/falloc.h>
74 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
75 #define BLKROGET _IO(0x12, 94) /* Get read-only status (0 = read_write). */
85 * For checking structure magic numbers...
88 #define EXT2_CHECK_MAGIC(struct, code) \
89 if ((struct)->magic != (code)) return (code)
93 unsigned long long block
;
101 #define WRITE_DIRECT_SIZE 4 /* Must be smaller than CACHE_SIZE */
102 #define READ_DIRECT_SIZE 4 /* Should be smaller than CACHE_SIZE */
104 struct unix_private_data
{
111 struct unix_cache cache
[CACHE_SIZE
];
113 struct struct_io_stats io_stats
;
115 pthread_mutex_t cache_mutex
;
116 pthread_mutex_t bounce_mutex
;
117 pthread_mutex_t stats_mutex
;
121 #define IS_ALIGNED(n, align) ((((uintptr_t) n) & \
122 ((uintptr_t) ((align)-1))) == 0)
124 typedef enum lock_kind
{
125 CACHE_MTX
, BOUNCE_MTX
, STATS_MTX
129 static inline pthread_mutex_t
*get_mutex(struct unix_private_data
*data
,
132 if (data
->flags
& IO_FLAG_THREADS
) {
135 return &data
->cache_mutex
;
137 return &data
->bounce_mutex
;
139 return &data
->stats_mutex
;
146 static inline void mutex_lock(struct unix_private_data
*data
, kind_t kind
)
149 pthread_mutex_t
*mtx
= get_mutex(data
,kind
);
152 pthread_mutex_lock(mtx
);
156 static inline void mutex_unlock(struct unix_private_data
*data
, kind_t kind
)
159 pthread_mutex_t
*mtx
= get_mutex(data
,kind
);
162 pthread_mutex_unlock(mtx
);
166 static errcode_t
unix_get_stats(io_channel channel
, io_stats
*stats
)
168 errcode_t retval
= 0;
170 struct unix_private_data
*data
;
172 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
173 data
= (struct unix_private_data
*) channel
->private_data
;
174 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
177 mutex_lock(data
, STATS_MTX
);
178 *stats
= &data
->io_stats
;
179 mutex_unlock(data
, STATS_MTX
);
185 static char *safe_getenv(const char *arg
)
187 if ((getuid() != geteuid()) || (getgid() != getegid()))
190 if (prctl(PR_GET_DUMPABLE
, 0, 0, 0, 0) == 0)
193 #if (defined(linux) && defined(SYS_prctl))
194 if (syscall(SYS_prctl
, PR_GET_DUMPABLE
, 0, 0, 0, 0) == 0)
199 #if defined(HAVE_SECURE_GETENV)
200 return secure_getenv(arg
);
201 #elif defined(HAVE___SECURE_GETENV)
202 return __secure_getenv(arg
);
209 * Here are the raw I/O functions
211 static errcode_t
raw_read_blk(io_channel channel
,
212 struct unix_private_data
*data
,
213 unsigned long long block
,
214 int count
, void *bufv
)
218 ext2_loff_t location
;
220 unsigned char *buf
= bufv
;
221 ssize_t really_read
= 0;
222 unsigned long long aligned_blk
;
223 int align_size
, offset
;
225 size
= (count
< 0) ? -count
: (ext2_loff_t
) count
* channel
->block_size
;
226 mutex_lock(data
, STATS_MTX
);
227 data
->io_stats
.bytes_read
+= size
;
228 mutex_unlock(data
, STATS_MTX
);
229 location
= ((ext2_loff_t
) block
* channel
->block_size
) + data
->offset
;
231 if (data
->flags
& IO_FLAG_FORCE_BOUNCE
)
235 /* Try an aligned pread */
236 if ((channel
->align
== 0) ||
237 (IS_ALIGNED(buf
, channel
->align
) &&
238 IS_ALIGNED(location
, channel
->align
) &&
239 IS_ALIGNED(size
, channel
->align
))) {
240 actual
= pread64(data
->dev
, buf
, size
, location
);
246 /* Try an aligned pread */
247 if ((sizeof(off_t
) >= sizeof(ext2_loff_t
)) &&
248 ((channel
->align
== 0) ||
249 (IS_ALIGNED(buf
, channel
->align
) &&
250 IS_ALIGNED(location
, channel
->align
) &&
251 IS_ALIGNED(size
, channel
->align
)))) {
252 actual
= pread(data
->dev
, buf
, size
, location
);
257 #endif /* HAVE_PREAD */
259 if ((channel
->align
== 0) ||
260 (IS_ALIGNED(buf
, channel
->align
) &&
261 IS_ALIGNED(location
, channel
->align
) &&
262 IS_ALIGNED(size
, channel
->align
))) {
263 mutex_lock(data
, BOUNCE_MTX
);
264 if (ext2fs_llseek(data
->dev
, location
, SEEK_SET
) < 0) {
265 retval
= errno
? errno
: EXT2_ET_LLSEEK_FAILED
;
268 actual
= read(data
->dev
, buf
, size
);
269 if (actual
!= size
) {
275 retval
= EXT2_ET_SHORT_READ
;
282 printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf
,
283 (unsigned long) size
);
287 * The buffer or size which we're trying to read isn't aligned
288 * to the O_DIRECT rules, so we need to do this the hard way...
291 if (channel
->align
== 0)
293 if ((channel
->block_size
> channel
->align
) &&
294 (channel
->block_size
% channel
->align
) == 0)
295 align_size
= channel
->block_size
;
297 align_size
= channel
->align
;
298 aligned_blk
= location
/ align_size
;
299 offset
= location
% align_size
;
301 mutex_lock(data
, BOUNCE_MTX
);
302 if (ext2fs_llseek(data
->dev
, aligned_blk
* align_size
, SEEK_SET
) < 0) {
303 retval
= errno
? errno
: EXT2_ET_LLSEEK_FAILED
;
307 actual
= read(data
->dev
, data
->bounce
, align_size
);
308 if (actual
!= align_size
) {
309 actual
= really_read
;
314 if ((actual
+ offset
) > align_size
)
315 actual
= align_size
- offset
;
318 memcpy(buf
, (char *)data
->bounce
+ offset
, actual
);
320 really_read
+= actual
;
327 mutex_unlock(data
, BOUNCE_MTX
);
331 mutex_unlock(data
, BOUNCE_MTX
);
332 if (actual
>= 0 && actual
< size
)
333 memset((char *) buf
+actual
, 0, size
-actual
);
334 if (channel
->read_error
)
335 retval
= (channel
->read_error
)(channel
, block
, count
, buf
,
336 size
, actual
, retval
);
340 #define RAW_WRITE_NO_HANDLER 1
342 static errcode_t
raw_write_blk(io_channel channel
,
343 struct unix_private_data
*data
,
344 unsigned long long block
,
345 int count
, const void *bufv
,
349 ext2_loff_t location
;
352 const unsigned char *buf
= bufv
;
353 unsigned long long aligned_blk
;
354 int align_size
, offset
;
357 size
= channel
->block_size
;
362 size
= (ext2_loff_t
) count
* channel
->block_size
;
364 mutex_lock(data
, STATS_MTX
);
365 data
->io_stats
.bytes_written
+= size
;
366 mutex_unlock(data
, STATS_MTX
);
368 location
= ((ext2_loff_t
) block
* channel
->block_size
) + data
->offset
;
370 if (data
->flags
& IO_FLAG_FORCE_BOUNCE
)
374 /* Try an aligned pwrite */
375 if ((channel
->align
== 0) ||
376 (IS_ALIGNED(buf
, channel
->align
) &&
377 IS_ALIGNED(location
, channel
->align
) &&
378 IS_ALIGNED(size
, channel
->align
))) {
379 actual
= pwrite64(data
->dev
, buf
, size
, location
);
384 /* Try an aligned pwrite */
385 if ((sizeof(off_t
) >= sizeof(ext2_loff_t
)) &&
386 ((channel
->align
== 0) ||
387 (IS_ALIGNED(buf
, channel
->align
) &&
388 IS_ALIGNED(location
, channel
->align
) &&
389 IS_ALIGNED(size
, channel
->align
)))) {
390 actual
= pwrite(data
->dev
, buf
, size
, location
);
394 #endif /* HAVE_PWRITE */
396 if ((channel
->align
== 0) ||
397 (IS_ALIGNED(buf
, channel
->align
) &&
398 IS_ALIGNED(location
, channel
->align
) &&
399 IS_ALIGNED(size
, channel
->align
))) {
400 mutex_lock(data
, BOUNCE_MTX
);
401 if (ext2fs_llseek(data
->dev
, location
, SEEK_SET
) < 0) {
402 retval
= errno
? errno
: EXT2_ET_LLSEEK_FAILED
;
405 actual
= write(data
->dev
, buf
, size
);
406 mutex_unlock(data
, BOUNCE_MTX
);
411 if (actual
!= size
) {
413 retval
= EXT2_ET_SHORT_WRITE
;
420 printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf
,
421 (unsigned long) size
);
424 * The buffer or size which we're trying to write isn't aligned
425 * to the O_DIRECT rules, so we need to do this the hard way...
428 if (channel
->align
== 0)
430 if ((channel
->block_size
> channel
->align
) &&
431 (channel
->block_size
% channel
->align
) == 0)
432 align_size
= channel
->block_size
;
434 align_size
= channel
->align
;
435 aligned_blk
= location
/ align_size
;
436 offset
= location
% align_size
;
441 mutex_lock(data
, BOUNCE_MTX
);
442 if (size
< align_size
|| offset
) {
443 if (ext2fs_llseek(data
->dev
, aligned_blk
* align_size
,
445 retval
= errno
? errno
: EXT2_ET_LLSEEK_FAILED
;
448 actual
= read(data
->dev
, data
->bounce
,
450 if (actual
!= align_size
) {
455 memset((char *) data
->bounce
+ actual
, 0,
456 align_size
- actual
);
460 if ((actual
+ offset
) > align_size
)
461 actual
= align_size
- offset
;
464 memcpy(((char *)data
->bounce
) + offset
, buf
, actual
);
465 if (ext2fs_llseek(data
->dev
, aligned_blk
* align_size
, SEEK_SET
) < 0) {
466 retval
= errno
? errno
: EXT2_ET_LLSEEK_FAILED
;
469 actual_w
= write(data
->dev
, data
->bounce
, align_size
);
470 mutex_unlock(data
, BOUNCE_MTX
);
475 if (actual_w
!= align_size
)
486 mutex_unlock(data
, BOUNCE_MTX
);
488 if (((flags
& RAW_WRITE_NO_HANDLER
) == 0) && channel
->write_error
)
489 retval
= (channel
->write_error
)(channel
, block
, count
, buf
,
490 size
, actual
, retval
);
496 * Here we implement the cache functions
499 /* Allocate the cache buffers */
500 static errcode_t
alloc_cache(io_channel channel
,
501 struct unix_private_data
*data
)
504 struct unix_cache
*cache
;
507 data
->access_time
= 0;
508 for (i
=0, cache
= data
->cache
; i
< CACHE_SIZE
; i
++, cache
++) {
510 cache
->access_time
= 0;
514 ext2fs_free_mem(&cache
->buf
);
515 retval
= io_channel_alloc_buf(channel
, 0, &cache
->buf
);
519 if (channel
->align
|| data
->flags
& IO_FLAG_FORCE_BOUNCE
) {
521 ext2fs_free_mem(&data
->bounce
);
522 retval
= io_channel_alloc_buf(channel
, 0, &data
->bounce
);
527 /* Free the cache buffers */
528 static void free_cache(struct unix_private_data
*data
)
530 struct unix_cache
*cache
;
533 data
->access_time
= 0;
534 for (i
=0, cache
= data
->cache
; i
< CACHE_SIZE
; i
++, cache
++) {
536 cache
->access_time
= 0;
540 ext2fs_free_mem(&cache
->buf
);
543 ext2fs_free_mem(&data
->bounce
);
548 * Try to find a block in the cache. If the block is not found, and
549 * eldest is a non-zero pointer, then fill in eldest with the cache
550 * entry to that should be reused.
552 static struct unix_cache
*find_cached_block(struct unix_private_data
*data
,
553 unsigned long long block
,
554 struct unix_cache
**eldest
)
556 struct unix_cache
*cache
, *unused_cache
, *oldest_cache
;
559 unused_cache
= oldest_cache
= 0;
560 for (i
=0, cache
= data
->cache
; i
< CACHE_SIZE
; i
++, cache
++) {
561 if (!cache
->in_use
) {
563 unused_cache
= cache
;
566 if (cache
->block
== block
) {
567 cache
->access_time
= ++data
->access_time
;
571 (cache
->access_time
< oldest_cache
->access_time
))
572 oldest_cache
= cache
;
575 *eldest
= (unused_cache
) ? unused_cache
: oldest_cache
;
580 * Reuse a particular cache entry for another block.
582 static errcode_t
reuse_cache(io_channel channel
,
583 struct unix_private_data
*data
, struct unix_cache
*cache
,
584 unsigned long long block
)
586 if (cache
->dirty
&& cache
->in_use
) {
589 retval
= raw_write_blk(channel
, data
, cache
->block
, 1,
590 cache
->buf
, RAW_WRITE_NO_HANDLER
);
592 cache
->write_err
= 1;
599 cache
->write_err
= 0;
600 cache
->block
= block
;
601 cache
->access_time
= ++data
->access_time
;
605 #define FLUSH_INVALIDATE 0x01
606 #define FLUSH_NOLOCK 0x02
609 * Flush all of the blocks in the cache
611 static errcode_t
flush_cached_blocks(io_channel channel
,
612 struct unix_private_data
*data
,
615 struct unix_cache
*cache
;
616 errcode_t retval
, retval2
= 0;
618 int errors_found
= 0;
620 if ((flags
& FLUSH_NOLOCK
) == 0)
621 mutex_lock(data
, CACHE_MTX
);
622 for (i
=0, cache
= data
->cache
; i
< CACHE_SIZE
; i
++, cache
++) {
623 if (!cache
->in_use
|| !cache
->dirty
)
625 retval
= raw_write_blk(channel
, data
,
626 cache
->block
, 1, cache
->buf
,
627 RAW_WRITE_NO_HANDLER
);
629 cache
->write_err
= 1;
634 cache
->write_err
= 0;
635 if (flags
& FLUSH_INVALIDATE
)
639 if ((flags
& FLUSH_NOLOCK
) == 0)
640 mutex_unlock(data
, CACHE_MTX
);
642 while (errors_found
) {
643 if ((flags
& FLUSH_NOLOCK
) == 0)
644 mutex_lock(data
, CACHE_MTX
);
646 for (i
=0, cache
= data
->cache
; i
< CACHE_SIZE
; i
++, cache
++) {
647 if (!cache
->in_use
|| !cache
->write_err
)
650 if (cache
->write_err
&& channel
->write_error
) {
651 char *err_buf
= NULL
;
652 unsigned long long err_block
= cache
->block
;
656 cache
->write_err
= 0;
657 if (io_channel_alloc_buf(channel
, 0,
661 memcpy(err_buf
, cache
->buf
,
662 channel
->block_size
);
663 mutex_unlock(data
, CACHE_MTX
);
664 (channel
->write_error
)(channel
, err_block
,
665 1, err_buf
, channel
->block_size
, -1,
668 ext2fs_free_mem(&err_buf
);
671 cache
->write_err
= 0;
673 if ((flags
& FLUSH_NOLOCK
) == 0)
674 mutex_unlock(data
, CACHE_MTX
);
678 #endif /* NO_IO_CACHE */
681 #ifndef BLKDISCARDZEROES
682 #define BLKDISCARDZEROES _IO(0x12,124)
686 int ext2fs_open_file(const char *pathname
, int flags
, mode_t mode
)
689 #if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
690 return open64(pathname
, flags
, mode
);
692 return open64(pathname
, flags
);
694 return open(pathname
, flags
, mode
);
696 return open(pathname
, flags
);
700 int ext2fs_stat(const char *path
, ext2fs_struct_stat
*buf
)
702 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
703 return stat64(path
, buf
);
705 return stat(path
, buf
);
709 int ext2fs_fstat(int fd
, ext2fs_struct_stat
*buf
)
711 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
712 return fstat64(fd
, buf
);
714 return fstat(fd
, buf
);
719 static errcode_t
unix_open_channel(const char *name
, int fd
,
720 int flags
, io_channel
*channel
,
723 io_channel io
= NULL
;
724 struct unix_private_data
*data
= NULL
;
726 ext2fs_struct_stat st
;
731 if (safe_getenv("UNIX_IO_FORCE_BOUNCE"))
732 flags
|= IO_FLAG_FORCE_BOUNCE
;
736 * We need to make sure any previous errors in the block
737 * device are thrown away, sigh.
742 retval
= ext2fs_get_mem(sizeof(struct struct_io_channel
), &io
);
745 memset(io
, 0, sizeof(struct struct_io_channel
));
746 io
->magic
= EXT2_ET_MAGIC_IO_CHANNEL
;
747 retval
= ext2fs_get_mem(sizeof(struct unix_private_data
), &data
);
751 io
->manager
= io_mgr
;
752 retval
= ext2fs_get_mem(strlen(name
)+1, &io
->name
);
756 strcpy(io
->name
, name
);
757 io
->private_data
= data
;
758 io
->block_size
= 1024;
764 memset(data
, 0, sizeof(struct unix_private_data
));
765 data
->magic
= EXT2_ET_MAGIC_UNIX_IO_CHANNEL
;
766 data
->io_stats
.num_fields
= 2;
770 #if defined(O_DIRECT)
771 if (flags
& IO_FLAG_DIRECT_IO
)
772 io
->align
= ext2fs_get_dio_alignment(data
->dev
);
773 #elif defined(F_NOCACHE)
774 if (flags
& IO_FLAG_DIRECT_IO
)
779 * If the device is really a block device, then set the
780 * appropriate flag, otherwise we can set DISCARD_ZEROES flag
781 * because we are going to use punch hole instead of discard
782 * and if it succeed, subsequent read from sparse area returns
785 if (ext2fs_fstat(data
->dev
, &st
) == 0) {
786 if (ext2fsP_is_disk_device(st
.st_mode
))
787 io
->flags
|= CHANNEL_FLAGS_BLOCK_DEVICE
;
789 io
->flags
|= CHANNEL_FLAGS_DISCARD_ZEROES
;
792 #ifdef BLKDISCARDZEROES
795 if (ioctl(data
->dev
, BLKDISCARDZEROES
, &zeroes
) == 0 &&
797 io
->flags
|= CHANNEL_FLAGS_DISCARD_ZEROES
;
801 #if defined(__CYGWIN__)
803 * Some operating systems require that the buffers be aligned,
804 * regardless of O_DIRECT
810 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
811 if (io
->flags
& CHANNEL_FLAGS_BLOCK_DEVICE
) {
812 int dio_align
= ext2fs_get_dio_alignment(fd
);
814 if (io
->align
< dio_align
)
815 io
->align
= dio_align
;
819 if ((retval
= alloc_cache(io
, data
)))
823 if (flags
& IO_FLAG_RW
) {
827 /* Is the block device actually writable? */
828 error
= ioctl(data
->dev
, BLKROGET
, &readonly
);
829 if (!error
&& readonly
) {
838 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
839 #define RLIM_INFINITY ((unsigned long)(~0UL>>1))
841 #define RLIM_INFINITY (~0UL)
844 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
845 * block devices are wrongly getting hit by the filesize
846 * limit. This workaround isn't perfect, since it won't work
847 * if glibc wasn't built against 2.2 header files. (Sigh.)
850 if ((flags
& IO_FLAG_RW
) &&
852 ((ut
.release
[0] == '2') && (ut
.release
[1] == '.') &&
853 (ut
.release
[2] == '4') && (ut
.release
[3] == '.') &&
854 (ut
.release
[4] == '1') && (ut
.release
[5] >= '0') &&
855 (ut
.release
[5] < '8')) &&
856 (ext2fs_fstat(data
->dev
, &st
) == 0) &&
857 (ext2fsP_is_disk_device(st
.st_mode
))) {
860 rlim
.rlim_cur
= rlim
.rlim_max
= (unsigned long) RLIM_INFINITY
;
861 setrlimit(RLIMIT_FSIZE
, &rlim
);
862 getrlimit(RLIMIT_FSIZE
, &rlim
);
863 if (((unsigned long) rlim
.rlim_cur
) <
864 ((unsigned long) rlim
.rlim_max
)) {
865 rlim
.rlim_cur
= rlim
.rlim_max
;
866 setrlimit(RLIMIT_FSIZE
, &rlim
);
871 if (flags
& IO_FLAG_THREADS
) {
872 io
->flags
|= CHANNEL_FLAGS_THREADS
;
873 retval
= pthread_mutex_init(&data
->cache_mutex
, NULL
);
876 retval
= pthread_mutex_init(&data
->bounce_mutex
, NULL
);
878 pthread_mutex_destroy(&data
->cache_mutex
);
881 retval
= pthread_mutex_init(&data
->stats_mutex
, NULL
);
883 pthread_mutex_destroy(&data
->cache_mutex
);
884 pthread_mutex_destroy(&data
->bounce_mutex
);
897 ext2fs_free_mem(&data
);
901 ext2fs_free_mem(&io
->name
);
903 ext2fs_free_mem(&io
);
908 static errcode_t
unixfd_open(const char *str_fd
, int flags
,
915 #if defined(HAVE_FCNTL)
916 fd_flags
= fcntl(fd
, F_GETFD
);
921 if (fd_flags
& O_RDWR
)
923 if (fd_flags
& O_EXCL
)
924 flags
|= IO_FLAG_EXCLUSIVE
;
925 #if defined(O_DIRECT)
926 if (fd_flags
& O_DIRECT
)
927 flags
|= IO_FLAG_DIRECT_IO
;
929 #endif /* HAVE_FCNTL */
931 return unix_open_channel(str_fd
, fd
, flags
, channel
, unixfd_io_manager
);
934 static errcode_t
unix_open(const char *name
, int flags
,
941 return EXT2_ET_BAD_DEVICE_NAME
;
943 open_flags
= (flags
& IO_FLAG_RW
) ? O_RDWR
: O_RDONLY
;
944 if (flags
& IO_FLAG_EXCLUSIVE
)
945 open_flags
|= O_EXCL
;
946 #if defined(O_DIRECT)
947 if (flags
& IO_FLAG_DIRECT_IO
)
948 open_flags
|= O_DIRECT
;
950 fd
= ext2fs_open_file(name
, open_flags
, 0);
953 #if defined(F_NOCACHE) && !defined(IO_DIRECT)
954 if (flags
& IO_FLAG_DIRECT_IO
) {
955 if (fcntl(fd
, F_NOCACHE
, 1) < 0)
959 return unix_open_channel(name
, fd
, flags
, channel
, unix_io_manager
);
962 static errcode_t
unix_close(io_channel channel
)
964 struct unix_private_data
*data
;
965 errcode_t retval
= 0;
967 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
968 data
= (struct unix_private_data
*) channel
->private_data
;
969 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
971 if (--channel
->refcount
> 0)
975 retval
= flush_cached_blocks(channel
, data
, 0);
978 if (close(data
->dev
) < 0)
982 if (data
->flags
& IO_FLAG_THREADS
) {
983 pthread_mutex_destroy(&data
->cache_mutex
);
984 pthread_mutex_destroy(&data
->bounce_mutex
);
985 pthread_mutex_destroy(&data
->stats_mutex
);
989 ext2fs_free_mem(&channel
->private_data
);
991 ext2fs_free_mem(&channel
->name
);
992 ext2fs_free_mem(&channel
);
996 static errcode_t
unix_set_blksize(io_channel channel
, int blksize
)
998 struct unix_private_data
*data
;
999 errcode_t retval
= 0;
1001 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1002 data
= (struct unix_private_data
*) channel
->private_data
;
1003 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1005 if (channel
->block_size
!= blksize
) {
1006 mutex_lock(data
, CACHE_MTX
);
1007 mutex_lock(data
, BOUNCE_MTX
);
1009 if ((retval
= flush_cached_blocks(channel
, data
, FLUSH_NOLOCK
))){
1010 mutex_unlock(data
, BOUNCE_MTX
);
1011 mutex_unlock(data
, CACHE_MTX
);
1016 channel
->block_size
= blksize
;
1018 retval
= alloc_cache(channel
, data
);
1019 mutex_unlock(data
, BOUNCE_MTX
);
1020 mutex_unlock(data
, CACHE_MTX
);
1025 static errcode_t
unix_read_blk64(io_channel channel
, unsigned long long block
,
1026 int count
, void *buf
)
1028 struct unix_private_data
*data
;
1029 struct unix_cache
*cache
;
1034 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1035 data
= (struct unix_private_data
*) channel
->private_data
;
1036 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1039 return raw_read_blk(channel
, data
, block
, count
, buf
);
1041 if (data
->flags
& IO_FLAG_NOCACHE
)
1042 return raw_read_blk(channel
, data
, block
, count
, buf
);
1044 * If we're doing an odd-sized read or a very large read,
1045 * flush out the cache and then do a direct read.
1047 if (count
< 0 || count
> WRITE_DIRECT_SIZE
) {
1048 if ((retval
= flush_cached_blocks(channel
, data
, 0)))
1050 return raw_read_blk(channel
, data
, block
, count
, buf
);
1054 mutex_lock(data
, CACHE_MTX
);
1056 /* If it's in the cache, use it! */
1057 if ((cache
= find_cached_block(data
, block
, NULL
))) {
1059 printf("Using cached block %lu\n", block
);
1061 memcpy(cp
, cache
->buf
, channel
->block_size
);
1064 cp
+= channel
->block_size
;
1069 * Find the number of uncached blocks so we can do a
1070 * single read request
1072 for (i
=1; i
< count
; i
++)
1073 if (find_cached_block(data
, block
+i
, NULL
))
1076 printf("Reading %d blocks starting at %lu\n", i
, block
);
1078 mutex_unlock(data
, CACHE_MTX
);
1079 if ((retval
= raw_read_blk(channel
, data
, block
, i
, cp
)))
1081 mutex_lock(data
, CACHE_MTX
);
1083 /* Save the results in the cache */
1084 for (j
=0; j
< i
; j
++) {
1085 if (!find_cached_block(data
, block
, &cache
)) {
1086 retval
= reuse_cache(channel
, data
,
1089 goto call_write_handler
;
1090 memcpy(cache
->buf
, cp
, channel
->block_size
);
1094 cp
+= channel
->block_size
;
1097 mutex_unlock(data
, CACHE_MTX
);
1101 if (cache
->write_err
&& channel
->write_error
) {
1102 char *err_buf
= NULL
;
1103 unsigned long long err_block
= cache
->block
;
1107 cache
->write_err
= 0;
1108 if (io_channel_alloc_buf(channel
, 0, &err_buf
))
1111 memcpy(err_buf
, cache
->buf
, channel
->block_size
);
1112 mutex_unlock(data
, CACHE_MTX
);
1113 (channel
->write_error
)(channel
, err_block
, 1, err_buf
,
1114 channel
->block_size
, -1,
1117 ext2fs_free_mem(&err_buf
);
1119 mutex_unlock(data
, CACHE_MTX
);
1121 #endif /* NO_IO_CACHE */
1124 static errcode_t
unix_read_blk(io_channel channel
, unsigned long block
,
1125 int count
, void *buf
)
1127 return unix_read_blk64(channel
, block
, count
, buf
);
1130 static errcode_t
unix_write_blk64(io_channel channel
, unsigned long long block
,
1131 int count
, const void *buf
)
1133 struct unix_private_data
*data
;
1134 struct unix_cache
*cache
, *reuse
;
1135 errcode_t retval
= 0;
1139 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1140 data
= (struct unix_private_data
*) channel
->private_data
;
1141 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1144 return raw_write_blk(channel
, data
, block
, count
, buf
, 0);
1146 if (data
->flags
& IO_FLAG_NOCACHE
)
1147 return raw_write_blk(channel
, data
, block
, count
, buf
, 0);
1149 * If we're doing an odd-sized write or a very large write,
1150 * flush out the cache completely and then do a direct write.
1152 if (count
< 0 || count
> WRITE_DIRECT_SIZE
) {
1153 if ((retval
= flush_cached_blocks(channel
, data
,
1156 return raw_write_blk(channel
, data
, block
, count
, buf
, 0);
1160 * For a moderate-sized multi-block write, first force a write
1161 * if we're in write-through cache mode, and then fill the
1162 * cache with the blocks.
1164 writethrough
= channel
->flags
& CHANNEL_FLAGS_WRITETHROUGH
;
1166 retval
= raw_write_blk(channel
, data
, block
, count
, buf
, 0);
1169 mutex_lock(data
, CACHE_MTX
);
1171 cache
= find_cached_block(data
, block
, &reuse
);
1176 err
= reuse_cache(channel
, data
, cache
, block
);
1178 goto call_write_handler
;
1180 if (cache
->buf
!= cp
)
1181 memcpy(cache
->buf
, cp
, channel
->block_size
);
1182 cache
->dirty
= !writethrough
;
1185 cp
+= channel
->block_size
;
1187 mutex_unlock(data
, CACHE_MTX
);
1191 if (cache
->write_err
&& channel
->write_error
) {
1192 char *err_buf
= NULL
;
1193 unsigned long long err_block
= cache
->block
;
1197 cache
->write_err
= 0;
1198 if (io_channel_alloc_buf(channel
, 0, &err_buf
))
1201 memcpy(err_buf
, cache
->buf
, channel
->block_size
);
1202 mutex_unlock(data
, CACHE_MTX
);
1203 (channel
->write_error
)(channel
, err_block
, 1, err_buf
,
1204 channel
->block_size
, -1,
1207 ext2fs_free_mem(&err_buf
);
1209 mutex_unlock(data
, CACHE_MTX
);
1211 #endif /* NO_IO_CACHE */
1214 static errcode_t
unix_cache_readahead(io_channel channel
,
1215 unsigned long long block
,
1216 unsigned long long count
)
1218 #ifdef POSIX_FADV_WILLNEED
1219 struct unix_private_data
*data
;
1221 data
= (struct unix_private_data
*)channel
->private_data
;
1222 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1223 return posix_fadvise(data
->dev
,
1224 (ext2_loff_t
)block
* channel
->block_size
+ data
->offset
,
1225 (ext2_loff_t
)count
* channel
->block_size
,
1226 POSIX_FADV_WILLNEED
);
1228 return EXT2_ET_OP_NOT_SUPPORTED
;
1232 static errcode_t
unix_write_blk(io_channel channel
, unsigned long block
,
1233 int count
, const void *buf
)
1235 return unix_write_blk64(channel
, block
, count
, buf
);
1238 static errcode_t
unix_write_byte(io_channel channel
, unsigned long offset
,
1239 int size
, const void *buf
)
1241 struct unix_private_data
*data
;
1242 errcode_t retval
= 0;
1245 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1246 data
= (struct unix_private_data
*) channel
->private_data
;
1247 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1249 if (channel
->align
!= 0) {
1251 printf("unix_write_byte: O_DIRECT fallback\n");
1253 return EXT2_ET_UNIMPLEMENTED
;
1258 * Flush out the cache completely
1260 if ((retval
= flush_cached_blocks(channel
, data
, FLUSH_INVALIDATE
)))
1264 if (lseek(data
->dev
, offset
+ data
->offset
, SEEK_SET
) < 0)
1267 actual
= write(data
->dev
, buf
, size
);
1271 return EXT2_ET_SHORT_WRITE
;
1277 * Flush data buffers to disk.
1279 static errcode_t
unix_flush(io_channel channel
)
1281 struct unix_private_data
*data
;
1282 errcode_t retval
= 0;
1284 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1285 data
= (struct unix_private_data
*) channel
->private_data
;
1286 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1289 retval
= flush_cached_blocks(channel
, data
, 0);
1292 if (!retval
&& fsync(data
->dev
) != 0)
1298 static errcode_t
unix_set_option(io_channel channel
, const char *option
,
1301 struct unix_private_data
*data
;
1302 unsigned long long tmp
;
1306 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1307 data
= (struct unix_private_data
*) channel
->private_data
;
1308 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1310 if (!strcmp(option
, "offset")) {
1312 return EXT2_ET_INVALID_ARGUMENT
;
1314 tmp
= strtoull(arg
, &end
, 0);
1316 return EXT2_ET_INVALID_ARGUMENT
;
1318 if (data
->offset
< 0)
1319 return EXT2_ET_INVALID_ARGUMENT
;
1322 if (!strcmp(option
, "cache")) {
1324 return EXT2_ET_INVALID_ARGUMENT
;
1325 if (!strcmp(arg
, "on")) {
1326 data
->flags
&= ~IO_FLAG_NOCACHE
;
1329 if (!strcmp(arg
, "off")) {
1330 retval
= flush_cached_blocks(channel
, data
, 0);
1331 data
->flags
|= IO_FLAG_NOCACHE
;
1334 return EXT2_ET_INVALID_ARGUMENT
;
1336 return EXT2_ET_INVALID_ARGUMENT
;
1339 #if defined(__linux__) && !defined(BLKDISCARD)
1340 #define BLKDISCARD _IO(0x12,119)
1343 static errcode_t
unix_discard(io_channel channel
, unsigned long long block
,
1344 unsigned long long count
)
1346 struct unix_private_data
*data
;
1349 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1350 data
= (struct unix_private_data
*) channel
->private_data
;
1351 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1353 if (channel
->flags
& CHANNEL_FLAGS_BLOCK_DEVICE
) {
1357 range
[0] = (__u64
)(block
) * channel
->block_size
+ data
->offset
;
1358 range
[1] = (__u64
)(count
) * channel
->block_size
;
1360 ret
= ioctl(data
->dev
, BLKDISCARD
, &range
);
1365 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
1367 * If we are not on block device, try to use punch hole
1368 * to reclaim free space.
1370 ret
= fallocate(data
->dev
,
1371 FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
,
1372 (off_t
)(block
) * channel
->block_size
+ data
->offset
,
1373 (off_t
)(count
) * channel
->block_size
);
1379 if (errno
== EOPNOTSUPP
)
1385 return EXT2_ET_UNIMPLEMENTED
;
1389 * If we know about ZERO_RANGE, try that before we try PUNCH_HOLE because
1390 * ZERO_RANGE doesn't unmap preallocated blocks. We prefer fallocate because
1391 * it always invalidates page cache, and libext2fs requires that reads after
1392 * ZERO_RANGE return zeroes.
1394 static int __unix_zeroout(int fd
, off_t offset
, off_t len
)
1398 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_ZERO_RANGE)
1399 ret
= fallocate(fd
, FALLOC_FL_ZERO_RANGE
, offset
, len
);
1403 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
1404 ret
= fallocate(fd
, FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
,
1413 /* parameters might not be used if OS doesn't support zeroout */
1414 #if __GNUC_PREREQ (4, 6)
1415 #pragma GCC diagnostic push
1416 #pragma GCC diagnostic ignored "-Wunused-parameter"
1418 static errcode_t
unix_zeroout(io_channel channel
, unsigned long long block
,
1419 unsigned long long count
)
1421 struct unix_private_data
*data
;
1424 EXT2_CHECK_MAGIC(channel
, EXT2_ET_MAGIC_IO_CHANNEL
);
1425 data
= (struct unix_private_data
*) channel
->private_data
;
1426 EXT2_CHECK_MAGIC(data
, EXT2_ET_MAGIC_UNIX_IO_CHANNEL
);
1428 if (safe_getenv("UNIX_IO_NOZEROOUT"))
1431 if (!(channel
->flags
& CHANNEL_FLAGS_BLOCK_DEVICE
)) {
1432 /* Regular file, try to use truncate/punch/zero. */
1433 struct stat statbuf
;
1438 * If we're trying to zero a range past the end of the file,
1439 * extend the file size, then truncate everything.
1441 ret
= fstat(data
->dev
, &statbuf
);
1444 if ((unsigned long long) statbuf
.st_size
<
1445 (block
+ count
) * channel
->block_size
+ data
->offset
) {
1446 ret
= ftruncate(data
->dev
,
1447 (block
+ count
) * channel
->block_size
+ data
->offset
);
1453 ret
= __unix_zeroout(data
->dev
,
1454 (off_t
)(block
) * channel
->block_size
+ data
->offset
,
1455 (off_t
)(count
) * channel
->block_size
);
1458 if (errno
== EOPNOTSUPP
)
1464 return EXT2_ET_UNIMPLEMENTED
;
1466 #if __GNUC_PREREQ (4, 6)
1467 #pragma GCC diagnostic pop
1470 static struct struct_io_manager struct_unix_manager
= {
1471 .magic
= EXT2_ET_MAGIC_IO_MANAGER
,
1472 .name
= "Unix I/O Manager",
1474 .close
= unix_close
,
1475 .set_blksize
= unix_set_blksize
,
1476 .read_blk
= unix_read_blk
,
1477 .write_blk
= unix_write_blk
,
1478 .flush
= unix_flush
,
1479 .write_byte
= unix_write_byte
,
1480 .set_option
= unix_set_option
,
1481 .get_stats
= unix_get_stats
,
1482 .read_blk64
= unix_read_blk64
,
1483 .write_blk64
= unix_write_blk64
,
1484 .discard
= unix_discard
,
1485 .cache_readahead
= unix_cache_readahead
,
1486 .zeroout
= unix_zeroout
,
1489 io_manager unix_io_manager
= &struct_unix_manager
;
1491 static struct struct_io_manager struct_unixfd_manager
= {
1492 .magic
= EXT2_ET_MAGIC_IO_MANAGER
,
1493 .name
= "Unix fd I/O Manager",
1494 .open
= unixfd_open
,
1495 .close
= unix_close
,
1496 .set_blksize
= unix_set_blksize
,
1497 .read_blk
= unix_read_blk
,
1498 .write_blk
= unix_write_blk
,
1499 .flush
= unix_flush
,
1500 .write_byte
= unix_write_byte
,
1501 .set_option
= unix_set_option
,
1502 .get_stats
= unix_get_stats
,
1503 .read_blk64
= unix_read_blk64
,
1504 .write_blk64
= unix_write_blk64
,
1505 .discard
= unix_discard
,
1506 .cache_readahead
= unix_cache_readahead
,
1507 .zeroout
= unix_zeroout
,
1510 io_manager unixfd_io_manager
= &struct_unixfd_manager
;