1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
7 #include "libxfs_priv.h"
12 #include "xfs_shared.h"
13 #include "xfs_format.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans_resv.h"
16 #include "xfs_mount.h"
17 #include "xfs_defer.h"
18 #include "xfs_inode_buf.h"
19 #include "xfs_inode_fork.h"
20 #include "xfs_inode.h"
21 #include "xfs_trans.h"
23 #include "xfs_bmap_btree.h"
24 #include "xfs_trans_space.h"
25 #include "xfs_ialloc.h"
26 #include "xfs_alloc.h"
28 #include "xfs_da_format.h"
29 #include "xfs_da_btree.h"
30 #include "xfs_dir2_priv.h"
33 * Calculate the worst case log unit reservation for a given superblock
34 * configuration. Copied and munged from the kernel code, and assumes a
35 * worse case header usage (maximum log buffer sizes)
38 xfs_log_calc_unit_res(
43 int iclog_header_size
;
47 if (xfs_sb_version_haslogv2(&mp
->m_sb
)) {
48 iclog_size
= XLOG_MAX_RECORD_BSIZE
;
49 iclog_header_size
= BBTOB(iclog_size
/ XLOG_HEADER_CYCLE_SIZE
);
51 iclog_size
= XLOG_BIG_RECORD_BSIZE
;
52 iclog_header_size
= BBSIZE
;
56 * Permanent reservations have up to 'cnt'-1 active log operations
57 * in the log. A unit in this case is the amount of space for one
58 * of these log operations. Normal reservations have a cnt of 1
59 * and their unit amount is the total amount of space required.
61 * The following lines of code account for non-transaction data
62 * which occupy space in the on-disk log.
64 * Normal form of a transaction is:
65 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
66 * and then there are LR hdrs, split-recs and roundoff at end of syncs.
68 * We need to account for all the leadup data and trailer data
69 * around the transaction data.
70 * And then we need to account for the worst case in terms of using
72 * The worst case will happen if:
73 * - the placement of the transaction happens to be such that the
74 * roundoff is at its maximum
75 * - the transaction data is synced before the commit record is synced
76 * i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
77 * Therefore the commit record is in its own Log Record.
78 * This can happen as the commit record is called with its
79 * own region to xlog_write().
80 * This then means that in the worst case, roundoff can happen for
81 * the commit-rec as well.
82 * The commit-rec is smaller than padding in this scenario and so it is
83 * not added separately.
86 /* for trans header */
87 unit_bytes
+= sizeof(xlog_op_header_t
);
88 unit_bytes
+= sizeof(xfs_trans_header_t
);
91 unit_bytes
+= sizeof(xlog_op_header_t
);
94 * for LR headers - the space for data in an iclog is the size minus
95 * the space used for the headers. If we use the iclog size, then we
96 * undercalculate the number of headers required.
98 * Furthermore - the addition of op headers for split-recs might
99 * increase the space required enough to require more log and op
100 * headers, so take that into account too.
102 * IMPORTANT: This reservation makes the assumption that if this
103 * transaction is the first in an iclog and hence has the LR headers
104 * accounted to it, then the remaining space in the iclog is
105 * exclusively for this transaction. i.e. if the transaction is larger
106 * than the iclog, it will be the only thing in that iclog.
107 * Fundamentally, this means we must pass the entire log vector to
108 * xlog_write to guarantee this.
110 iclog_space
= iclog_size
- iclog_header_size
;
111 num_headers
= howmany(unit_bytes
, iclog_space
);
113 /* for split-recs - ophdrs added when data split over LRs */
114 unit_bytes
+= sizeof(xlog_op_header_t
) * num_headers
;
116 /* add extra header reservations if we overrun */
117 while (!num_headers
||
118 howmany(unit_bytes
, iclog_space
) > num_headers
) {
119 unit_bytes
+= sizeof(xlog_op_header_t
);
122 unit_bytes
+= iclog_header_size
* num_headers
;
124 /* for commit-rec LR header - note: padding will subsume the ophdr */
125 unit_bytes
+= iclog_header_size
;
127 /* for roundoff padding for transaction data and one for commit record */
128 if (xfs_sb_version_haslogv2(&mp
->m_sb
) && mp
->m_sb
.sb_logsunit
> 1) {
129 /* log su roundoff */
130 unit_bytes
+= 2 * mp
->m_sb
.sb_logsunit
;
133 unit_bytes
+= 2 * BBSIZE
;
140 * Change the requested timestamp in the given inode.
142 * This was once shared with the kernel, but has diverged to the point
143 * where it's no longer worth the hassle of maintaining common code.
146 libxfs_trans_ichgtime(
147 struct xfs_trans
*tp
,
148 struct xfs_inode
*ip
,
154 gettimeofday(&stv
, (struct timezone
*)0);
155 tv
.tv_sec
= stv
.tv_sec
;
156 tv
.tv_nsec
= stv
.tv_usec
* 1000;
157 if (flags
& XFS_ICHGTIME_MOD
)
158 VFS_I(ip
)->i_mtime
= tv
;
159 if (flags
& XFS_ICHGTIME_CHG
)
160 VFS_I(ip
)->i_ctime
= tv
;
161 if (flags
& XFS_ICHGTIME_CREATE
) {
162 ip
->i_d
.di_crtime
.t_sec
= (int32_t)tv
.tv_sec
;
163 ip
->i_d
.di_crtime
.t_nsec
= (int32_t)tv
.tv_nsec
;
169 struct xfs_inode
*ip
,
172 /* can't set PREALLOC this way, just preserve it */
174 (ip
->i_d
.di_flags
& XFS_DIFLAG_PREALLOC
);
176 if (xflags
& FS_XFLAG_IMMUTABLE
)
177 di_flags
|= XFS_DIFLAG_IMMUTABLE
;
178 if (xflags
& FS_XFLAG_APPEND
)
179 di_flags
|= XFS_DIFLAG_APPEND
;
180 if (xflags
& FS_XFLAG_SYNC
)
181 di_flags
|= XFS_DIFLAG_SYNC
;
182 if (xflags
& FS_XFLAG_NOATIME
)
183 di_flags
|= XFS_DIFLAG_NOATIME
;
184 if (xflags
& FS_XFLAG_NODUMP
)
185 di_flags
|= XFS_DIFLAG_NODUMP
;
186 if (xflags
& FS_XFLAG_NODEFRAG
)
187 di_flags
|= XFS_DIFLAG_NODEFRAG
;
188 if (xflags
& FS_XFLAG_FILESTREAM
)
189 di_flags
|= XFS_DIFLAG_FILESTREAM
;
190 if (S_ISDIR(VFS_I(ip
)->i_mode
)) {
191 if (xflags
& FS_XFLAG_RTINHERIT
)
192 di_flags
|= XFS_DIFLAG_RTINHERIT
;
193 if (xflags
& FS_XFLAG_NOSYMLINKS
)
194 di_flags
|= XFS_DIFLAG_NOSYMLINKS
;
195 if (xflags
& FS_XFLAG_EXTSZINHERIT
)
196 di_flags
|= XFS_DIFLAG_EXTSZINHERIT
;
197 if (xflags
& FS_XFLAG_PROJINHERIT
)
198 di_flags
|= XFS_DIFLAG_PROJINHERIT
;
199 } else if (S_ISREG(VFS_I(ip
)->i_mode
)) {
200 if (xflags
& FS_XFLAG_REALTIME
)
201 di_flags
|= XFS_DIFLAG_REALTIME
;
202 if (xflags
& FS_XFLAG_EXTSIZE
)
203 di_flags
|= XFS_DIFLAG_EXTSIZE
;
211 struct xfs_inode
*ip
,
215 (ip
->i_d
.di_flags2
& XFS_DIFLAG2_REFLINK
);
217 if (xflags
& FS_XFLAG_DAX
)
218 di_flags2
|= XFS_DIFLAG2_DAX
;
219 if (xflags
& FS_XFLAG_COWEXTSIZE
)
220 di_flags2
|= XFS_DIFLAG2_COWEXTSIZE
;
/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * Set mode, nlink, and rdev appropriately within the inode.
 * The uid and gid for the inode are set according to the contents of
 * the given cred structure.
 *
 * This was once shared with the kernel, but has diverged to the point
 * where it's no longer worth the hassle of maintaining common code.
 */
243 xfs_buf_t
**ialloc_context
,
252 * Call the space management code to pick
253 * the on-disk inode to be allocated.
255 error
= xfs_dialloc(tp
, pip
? pip
->i_ino
: 0, mode
,
256 ialloc_context
, &ino
);
259 if (*ialloc_context
|| ino
== NULLFSINO
) {
263 ASSERT(*ialloc_context
== NULL
);
265 error
= libxfs_iget(tp
->t_mountp
, tp
, ino
, 0, &ip
,
266 &xfs_default_ifork_ops
);
271 VFS_I(ip
)->i_mode
= mode
;
272 set_nlink(VFS_I(ip
), nlink
);
273 ip
->i_d
.di_uid
= cr
->cr_uid
;
274 ip
->i_d
.di_gid
= cr
->cr_gid
;
275 xfs_set_projid(&ip
->i_d
, pip
? 0 : fsx
->fsx_projid
);
276 xfs_trans_ichgtime(tp
, ip
, XFS_ICHGTIME_CHG
| XFS_ICHGTIME_MOD
);
279 * We only support filesystems that understand v2 format inodes. So if
280 * this is currently an old format inode, then change the inode version
281 * number now. This way we only do the conversion here rather than here
282 * and in the flush/logging code.
284 if (ip
->i_d
.di_version
== 1) {
285 ip
->i_d
.di_version
= 2;
287 * old link count, projid_lo/hi field, pad field
292 if (pip
&& (VFS_I(pip
)->i_mode
& S_ISGID
)) {
293 ip
->i_d
.di_gid
= pip
->i_d
.di_gid
;
294 if ((VFS_I(pip
)->i_mode
& S_ISGID
) && (mode
& S_IFMT
) == S_IFDIR
)
295 VFS_I(ip
)->i_mode
|= S_ISGID
;
299 ip
->i_d
.di_nextents
= 0;
300 ASSERT(ip
->i_d
.di_nblocks
== 0);
301 ip
->i_d
.di_extsize
= pip
? 0 : fsx
->fsx_extsize
;
302 ip
->i_d
.di_dmevmask
= 0;
303 ip
->i_d
.di_dmstate
= 0;
304 ip
->i_d
.di_flags
= pip
? 0 : xfs_flags2diflags(ip
, fsx
->fsx_xflags
);
306 if (ip
->i_d
.di_version
== 3) {
307 ASSERT(ip
->i_d
.di_ino
== ino
);
308 ASSERT(uuid_equal(&ip
->i_d
.di_uuid
, &mp
->m_sb
.sb_meta_uuid
));
309 VFS_I(ip
)->i_version
= 1;
310 ip
->i_d
.di_flags2
= pip
? 0 : xfs_flags2diflags2(ip
,
312 ip
->i_d
.di_crtime
.t_sec
= (int32_t)VFS_I(ip
)->i_mtime
.tv_sec
;
313 ip
->i_d
.di_crtime
.t_nsec
= (int32_t)VFS_I(ip
)->i_mtime
.tv_nsec
;
314 ip
->i_d
.di_cowextsize
= pip
? 0 : fsx
->fsx_cowextsize
;
317 flags
= XFS_ILOG_CORE
;
318 switch (mode
& S_IFMT
) {
321 /* doesn't make sense to set an rdev for these */
326 ip
->i_d
.di_format
= XFS_DINODE_FMT_DEV
;
327 flags
|= XFS_ILOG_DEV
;
328 VFS_I(ip
)->i_rdev
= rdev
;
332 if (pip
&& (pip
->i_d
.di_flags
& XFS_DIFLAG_ANY
)) {
335 if ((mode
& S_IFMT
) == S_IFDIR
) {
336 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
)
337 di_flags
|= XFS_DIFLAG_RTINHERIT
;
338 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
339 di_flags
|= XFS_DIFLAG_EXTSZINHERIT
;
340 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
343 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
) {
344 di_flags
|= XFS_DIFLAG_REALTIME
;
346 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
347 di_flags
|= XFS_DIFLAG_EXTSIZE
;
348 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
351 if (pip
->i_d
.di_flags
& XFS_DIFLAG_PROJINHERIT
)
352 di_flags
|= XFS_DIFLAG_PROJINHERIT
;
353 ip
->i_d
.di_flags
|= di_flags
;
357 ip
->i_d
.di_format
= XFS_DINODE_FMT_EXTENTS
;
358 ip
->i_df
.if_flags
= XFS_IFEXTENTS
;
359 ip
->i_df
.if_bytes
= 0;
360 ip
->i_df
.if_u1
.if_root
= NULL
;
365 /* Attribute fork settings for new inode. */
366 ip
->i_d
.di_aformat
= XFS_DINODE_FMT_EXTENTS
;
367 ip
->i_d
.di_anextents
= 0;
370 * set up the inode ops structure that the libxfs code relies on
373 ip
->d_ops
= ip
->i_mount
->m_dir_inode_ops
;
375 ip
->d_ops
= ip
->i_mount
->m_nondir_inode_ops
;
378 * Log the new values stuffed into the inode.
380 xfs_trans_ijoin(tp
, ip
, 0);
381 xfs_trans_log_inode(tp
, ip
, flags
);
/*
 * Writes a modified inode's changes out to the inode's on-disk home.
 * Originally based on xfs_iflush_int() from xfs_inode.c in the kernel.
 */
391 libxfs_iflush_int(xfs_inode_t
*ip
, xfs_buf_t
*bp
)
393 xfs_inode_log_item_t
*iip
;
397 ASSERT(ip
->i_d
.di_format
!= XFS_DINODE_FMT_BTREE
||
398 ip
->i_d
.di_nextents
> ip
->i_df
.if_ext_max
);
399 ASSERT(ip
->i_d
.di_version
> 1);
404 /* set *dip = inode's place in the buffer */
405 dip
= xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
);
407 ASSERT(ip
->i_d
.di_magic
== XFS_DINODE_MAGIC
);
409 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
410 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) );
411 } else if (XFS_ISDIR(ip
)) {
412 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
413 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) ||
414 (ip
->i_d
.di_format
== XFS_DINODE_FMT_LOCAL
) );
416 ASSERT(ip
->i_d
.di_nextents
+ip
->i_d
.di_anextents
<= ip
->i_d
.di_nblocks
);
417 ASSERT(ip
->i_d
.di_forkoff
<= mp
->m_sb
.sb_inodesize
);
419 /* bump the change count on v3 inodes */
420 if (ip
->i_d
.di_version
== 3)
421 VFS_I(ip
)->i_version
++;
423 /* Check the inline fork data before we write out. */
424 if (!libxfs_inode_verify_forks(ip
))
425 return -EFSCORRUPTED
;
	/*
	 * Copy the dirty parts of the inode into the on-disk
	 * inode.  We always copy out the core of the inode,
	 * because if the inode is dirty at all the core must
	 * be.
	 */
433 xfs_inode_to_disk(ip
, dip
, iip
->ili_item
.li_lsn
);
435 xfs_iflush_fork(ip
, dip
, iip
, XFS_DATA_FORK
);
437 xfs_iflush_fork(ip
, dip
, iip
, XFS_ATTR_FORK
);
439 /* generate the checksum. */
440 xfs_dinode_calc_crc(mp
, dip
);
446 libxfs_mod_incore_sb(
447 struct xfs_mount
*mp
,
452 long long lcounter
; /* long counter for 64 bit fields */
455 case XFS_TRANS_SB_FDBLOCKS
:
456 lcounter
= (long long)mp
->m_sb
.sb_fdblocks
;
460 mp
->m_sb
.sb_fdblocks
= lcounter
;
/*
 * This routine allocates disk space for the given file.
 * Originally derived from xfs_alloc_file_space().
 */
473 libxfs_alloc_file_space(
482 xfs_filblks_t datablocks
;
483 xfs_filblks_t allocated_fsb
;
484 xfs_filblks_t allocatesize_fsb
;
485 xfs_bmbt_irec_t
*imapp
;
486 xfs_bmbt_irec_t imaps
[1];
489 xfs_fileoff_t startoffset_fsb
;
501 xfs_bmapi_flags
= alloc_type
? XFS_BMAPI_PREALLOC
: 0;
503 startoffset_fsb
= XFS_B_TO_FSBT(mp
, offset
);
504 allocatesize_fsb
= XFS_B_TO_FSB(mp
, count
);
506 /* allocate file space until done or until there is an error */
507 while (allocatesize_fsb
&& !error
) {
508 datablocks
= allocatesize_fsb
;
510 resblks
= (uint
)XFS_DIOSTRAT_SPACE_RES(mp
, datablocks
);
511 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_write
, resblks
,
514 * Check for running out of space
517 ASSERT(error
== -ENOSPC
);
520 xfs_trans_ijoin(tp
, ip
, 0);
522 error
= xfs_bmapi_write(tp
, ip
, startoffset_fsb
, allocatesize_fsb
,
523 xfs_bmapi_flags
, 0, imapp
, &reccount
);
529 * Complete the transaction
531 error
= xfs_trans_commit(tp
);
535 allocated_fsb
= imapp
->br_blockcount
;
539 startoffset_fsb
+= allocated_fsb
;
540 allocatesize_fsb
-= allocated_fsb
;
544 error0
: /* Cancel bmap, cancel trans */
545 xfs_trans_cancel(tp
);
/*
 * Wrapper around the call to libxfs_ialloc. Takes care of committing and
 * allocating a new transaction as needed.
 *
 * Originally there were two copies of this code - one in mkfs, the
 * other in repair - now there is just the one.
 */
567 xfs_buf_t
*ialloc_context
;
571 ialloc_context
= (xfs_buf_t
*)0;
572 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
, fsx
,
573 &ialloc_context
, &ip
);
578 if (!ialloc_context
&& !ip
) {
583 if (ialloc_context
) {
585 xfs_trans_bhold(*tp
, ialloc_context
);
587 error
= xfs_trans_roll(tp
);
589 fprintf(stderr
, _("%s: cannot duplicate transaction: %s\n"),
590 progname
, strerror(error
));
593 xfs_trans_bjoin(*tp
, ialloc_context
);
594 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
,
595 fsx
, &ialloc_context
, &ip
);
/*
 * Userspace versions of common diagnostic routines (varargs fun).
 */
610 libxfs_fs_repair_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
615 vfprintf(stderr
, fmt
, ap
);
616 fprintf(stderr
, " This is a bug.\n");
617 fprintf(stderr
, "%s version %s\n", progname
, VERSION
);
619 "Please capture the filesystem metadata with xfs_metadump and\n"
620 "report it to linux-xfs@vger.kernel.org\n");
625 libxfs_fs_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
630 vfprintf(stderr
, fmt
, ap
);
636 cmn_err(int level
, char *fmt
, ...)
641 vfprintf(stderr
, fmt
, ap
);
/*
 * Warnings specifically for verifier errors.  Differentiate CRC vs. invalid
 * values, and omit the stack trace unless the error level is turned up high.
 */
654 xfs_failaddr_t failaddr
)
656 xfs_buf_ioerror(bp
, error
);
658 xfs_alert(NULL
, "Metadata %s detected at %p, %s block 0x%llx/0x%x",
659 bp
->b_error
== -EFSBADCRC
? "CRC error" : "corruption",
660 failaddr
? failaddr
: __return_address
,
661 bp
->b_ops
->name
, bp
->b_bn
, BBTOB(bp
->b_length
));
/*
 * Warnings for inode corruption problems.  Don't bother with the stack
 * trace unless the error level is turned up high.
 */
669 xfs_inode_verifier_error(
670 struct xfs_inode
*ip
,
675 xfs_failaddr_t failaddr
)
677 xfs_alert(NULL
, "Metadata %s detected at %p, inode 0x%llx %s",
678 error
== -EFSBADCRC
? "CRC error" : "corruption",
679 failaddr
? failaddr
: __return_address
,
/*
 * This is called from I/O verifiers on v5 superblock filesystems. In the
 * kernel, it validates the metadata LSN parameter against the current LSN of
 * the active log. We don't have an active log in userspace so this kind of
 * validation is not required. Therefore, this function always returns true in
 * userspace.
 *
 * xfs_repair piggybacks off this mechanism to help track the largest metadata
 * LSN in use on a filesystem. Keep a record of the largest LSN seen such that
 * repair can validate it against the state of the log.
 */
694 xfs_lsn_t libxfs_max_lsn
= 0;
695 static pthread_mutex_t libxfs_max_lsn_lock
= PTHREAD_MUTEX_INITIALIZER
;
699 struct xfs_mount
*mp
,
702 int cycle
= CYCLE_LSN(lsn
);
703 int block
= BLOCK_LSN(lsn
);
707 if (lsn
== NULLCOMMITLSN
)
710 pthread_mutex_lock(&libxfs_max_lsn_lock
);
712 max_cycle
= CYCLE_LSN(libxfs_max_lsn
);
713 max_block
= BLOCK_LSN(libxfs_max_lsn
);
715 if ((cycle
> max_cycle
) ||
716 (cycle
== max_cycle
&& block
> max_block
))
717 libxfs_max_lsn
= lsn
;
719 pthread_mutex_unlock(&libxfs_max_lsn_lock
);
724 static struct xfs_buftarg
*
725 xfs_find_bdev_for_inode(
726 struct xfs_inode
*ip
)
728 struct xfs_mount
*mp
= ip
->i_mount
;
730 if (XFS_IS_REALTIME_INODE(ip
))
731 return mp
->m_rtdev_targp
;
732 return mp
->m_ddev_targp
;
736 xfs_fsb_to_db(struct xfs_inode
*ip
, xfs_fsblock_t fsb
)
738 if (XFS_IS_REALTIME_INODE(ip
))
739 return XFS_FSB_TO_BB(ip
->i_mount
, fsb
);
740 return XFS_FSB_TO_DADDR(ip
->i_mount
, (fsb
));
745 struct xfs_inode
*ip
,
746 xfs_fsblock_t start_fsb
,
749 xfs_daddr_t sector
= xfs_fsb_to_db(ip
, start_fsb
);
750 ssize_t size
= XFS_FSB_TO_BB(ip
->i_mount
, count_fsb
);
752 return libxfs_device_zero(xfs_find_bdev_for_inode(ip
), sector
, size
);
/*
 * Return the number of set bits in the low 8 bits of @w.
 *
 * Bit counts are summed in parallel: first per 2-bit pair, then per nibble,
 * then the two nibbles are added together.  Bits of @w above bit 7 never
 * reach the result because every later step masks them away.
 */
unsigned int
hweight8(unsigned int w)
{
	unsigned int	pairs;
	unsigned int	nibbles;

	/* each 2-bit field now holds the count of bits set in that field */
	pairs = w - ((w >> 1) & 0x55);

	/* each 4-bit field now holds the count for its nibble */
	nibbles = (pairs & 0x33) + ((pairs >> 2) & 0x33);

	/* add the high nibble's count onto the low nibble's */
	return (nibbles + (nibbles >> 4)) & 0x0F;
}
/*
 * Return the number of set bits in the 32-bit value @w.
 *
 * Bit counts are summed in parallel over progressively wider fields
 * (2, 4, 8, 16 and finally 32 bits); the total ends up in the low byte.
 */
unsigned int
hweight32(unsigned int w)
{
	unsigned int	pairs;
	unsigned int	nibbles;
	unsigned int	bytes;
	unsigned int	halves;

	/* per-pair counts */
	pairs = w - ((w >> 1) & 0x55555555);

	/* per-nibble counts */
	nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);

	/* per-byte counts; each is at most 8, so it fits in its low nibble */
	bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F;

	/* fold bytes into 16-bit halves, then the halves into the low byte */
	halves = bytes + (bytes >> 8);
	return (halves + (halves >> 16)) & 0x000000FF;
}
776 return hweight32((unsigned int)w
) +
777 hweight32((unsigned int)(w
>> 32));