2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 #include "libxfs_priv.h"
20 #include "libxfs_io.h"
23 #include "xfs_shared.h"
24 #include "xfs_format.h"
25 #include "xfs_log_format.h"
26 #include "xfs_trans_resv.h"
27 #include "xfs_mount.h"
28 #include "xfs_defer.h"
29 #include "xfs_inode_buf.h"
30 #include "xfs_inode_fork.h"
31 #include "xfs_inode.h"
32 #include "xfs_trans.h"
34 #include "xfs_bmap_btree.h"
35 #include "xfs_trans_space.h"
36 #include "xfs_ialloc.h"
37 #include "xfs_alloc.h"
41 * Calculate the worst case log unit reservation for a given superblock
42 * configuration. Copied and munged from the kernel code, and assumes a
43 * worst case header usage (maximum log buffer sizes)
46 xfs_log_calc_unit_res(
51 int iclog_header_size
;
55 if (xfs_sb_version_haslogv2(&mp
->m_sb
)) {
56 iclog_size
= XLOG_MAX_RECORD_BSIZE
;
57 iclog_header_size
= BBTOB(iclog_size
/ XLOG_HEADER_CYCLE_SIZE
);
59 iclog_size
= XLOG_BIG_RECORD_BSIZE
;
60 iclog_header_size
= BBSIZE
;
64 * Permanent reservations have up to 'cnt'-1 active log operations
65 * in the log. A unit in this case is the amount of space for one
66 * of these log operations. Normal reservations have a cnt of 1
67 * and their unit amount is the total amount of space required.
69 * The following lines of code account for non-transaction data
70 * which occupy space in the on-disk log.
72 * Normal form of a transaction is:
73 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
74 * and then there are LR hdrs, split-recs and roundoff at end of syncs.
76 * We need to account for all the leadup data and trailer data
77 * around the transaction data.
78 * And then we need to account for the worst case in terms of using
80 * The worst case will happen if:
81 * - the placement of the transaction happens to be such that the
82 * roundoff is at its maximum
83 * - the transaction data is synced before the commit record is synced
84 * i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
85 * Therefore the commit record is in its own Log Record.
86 * This can happen as the commit record is called with its
87 * own region to xlog_write().
88 * This then means that in the worst case, roundoff can happen for
89 * the commit-rec as well.
90 * The commit-rec is smaller than padding in this scenario and so it is
91 * not added separately.
94 /* for trans header */
95 unit_bytes
+= sizeof(xlog_op_header_t
);
96 unit_bytes
+= sizeof(xfs_trans_header_t
);
99 unit_bytes
+= sizeof(xlog_op_header_t
);
102 * for LR headers - the space for data in an iclog is the size minus
103 * the space used for the headers. If we use the iclog size, then we
104 * undercalculate the number of headers required.
106 * Furthermore - the addition of op headers for split-recs might
107 * increase the space required enough to require more log and op
108 * headers, so take that into account too.
110 * IMPORTANT: This reservation makes the assumption that if this
111 * transaction is the first in an iclog and hence has the LR headers
112 * accounted to it, then the remaining space in the iclog is
113 * exclusively for this transaction. i.e. if the transaction is larger
114 * than the iclog, it will be the only thing in that iclog.
115 * Fundamentally, this means we must pass the entire log vector to
116 * xlog_write to guarantee this.
118 iclog_space
= iclog_size
- iclog_header_size
;
119 num_headers
= howmany(unit_bytes
, iclog_space
);
121 /* for split-recs - ophdrs added when data split over LRs */
122 unit_bytes
+= sizeof(xlog_op_header_t
) * num_headers
;
124 /* add extra header reservations if we overrun */
125 while (!num_headers
||
126 howmany(unit_bytes
, iclog_space
) > num_headers
) {
127 unit_bytes
+= sizeof(xlog_op_header_t
);
130 unit_bytes
+= iclog_header_size
* num_headers
;
132 /* for commit-rec LR header - note: padding will subsume the ophdr */
133 unit_bytes
+= iclog_header_size
;
135 /* for roundoff padding for transaction data and one for commit record */
136 if (xfs_sb_version_haslogv2(&mp
->m_sb
) && mp
->m_sb
.sb_logsunit
> 1) {
137 /* log su roundoff */
138 unit_bytes
+= 2 * mp
->m_sb
.sb_logsunit
;
141 unit_bytes
+= 2 * BBSIZE
;
148 * Change the requested timestamp in the given inode.
150 * This was once shared with the kernel, but has diverged to the point
151 * where it's no longer worth the hassle of maintaining common code.
154 libxfs_trans_ichgtime(
155 struct xfs_trans
*tp
,
156 struct xfs_inode
*ip
,
162 gettimeofday(&stv
, (struct timezone
*)0);
163 tv
.tv_sec
= stv
.tv_sec
;
164 tv
.tv_nsec
= stv
.tv_usec
* 1000;
165 if (flags
& XFS_ICHGTIME_MOD
)
166 VFS_I(ip
)->i_mtime
= tv
;
167 if (flags
& XFS_ICHGTIME_CHG
)
168 VFS_I(ip
)->i_ctime
= tv
;
169 if (flags
& XFS_ICHGTIME_CREATE
) {
170 ip
->i_d
.di_crtime
.t_sec
= (__int32_t
)tv
.tv_sec
;
171 ip
->i_d
.di_crtime
.t_nsec
= (__int32_t
)tv
.tv_nsec
;
176 * Allocate an inode on disk and return a copy of its in-core version.
177 * Set mode, nlink, and rdev appropriately within the inode.
178 * The uid and gid for the inode are set according to the contents of
179 * the given cred structure.
181 * This was once shared with the kernel, but has diverged to the point
182 * where it's no longer worth the hassle of maintaining common code.
194 xfs_buf_t
**ialloc_context
,
203 * Call the space management code to pick
204 * the on-disk inode to be allocated.
206 error
= xfs_dialloc(tp
, pip
? pip
->i_ino
: 0, mode
, okalloc
,
207 ialloc_context
, &ino
);
210 if (*ialloc_context
|| ino
== NULLFSINO
) {
214 ASSERT(*ialloc_context
== NULL
);
216 error
= xfs_trans_iget(tp
->t_mountp
, tp
, ino
, 0, 0, &ip
);
221 VFS_I(ip
)->i_mode
= mode
;
222 set_nlink(VFS_I(ip
), nlink
);
223 ip
->i_d
.di_uid
= cr
->cr_uid
;
224 ip
->i_d
.di_gid
= cr
->cr_gid
;
225 xfs_set_projid(&ip
->i_d
, pip
? 0 : fsx
->fsx_projid
);
226 xfs_trans_ichgtime(tp
, ip
, XFS_ICHGTIME_CHG
| XFS_ICHGTIME_MOD
);
229 * We only support filesystems that understand v2 format inodes. So if
230 * this is currently an old format inode, then change the inode version
231 * number now. This way we only do the conversion here rather than here
232 * and in the flush/logging code.
234 if (ip
->i_d
.di_version
== 1) {
235 ip
->i_d
.di_version
= 2;
237 * old link count, projid_lo/hi field, pad field
242 if (pip
&& (VFS_I(pip
)->i_mode
& S_ISGID
)) {
243 ip
->i_d
.di_gid
= pip
->i_d
.di_gid
;
244 if ((VFS_I(pip
)->i_mode
& S_ISGID
) && (mode
& S_IFMT
) == S_IFDIR
)
245 VFS_I(ip
)->i_mode
|= S_ISGID
;
249 ip
->i_d
.di_nextents
= 0;
250 ASSERT(ip
->i_d
.di_nblocks
== 0);
251 ip
->i_d
.di_extsize
= pip
? 0 : fsx
->fsx_extsize
;
252 ip
->i_d
.di_dmevmask
= 0;
253 ip
->i_d
.di_dmstate
= 0;
254 ip
->i_d
.di_flags
= pip
? 0 : fsx
->fsx_xflags
;
256 if (ip
->i_d
.di_version
== 3) {
257 ASSERT(ip
->i_d
.di_ino
== ino
);
258 ASSERT(uuid_equal(&ip
->i_d
.di_uuid
, &mp
->m_sb
.sb_meta_uuid
));
259 VFS_I(ip
)->i_version
= 1;
260 ip
->i_d
.di_flags2
= 0;
261 ip
->i_d
.di_crtime
.t_sec
= (__int32_t
)VFS_I(ip
)->i_mtime
.tv_sec
;
262 ip
->i_d
.di_crtime
.t_nsec
= (__int32_t
)VFS_I(ip
)->i_mtime
.tv_nsec
;
265 flags
= XFS_ILOG_CORE
;
266 switch (mode
& S_IFMT
) {
269 /* doesn't make sense to set an rdev for these */
274 ip
->i_d
.di_format
= XFS_DINODE_FMT_DEV
;
275 ip
->i_df
.if_u2
.if_rdev
= rdev
;
276 flags
|= XFS_ILOG_DEV
;
280 if (pip
&& (pip
->i_d
.di_flags
& XFS_DIFLAG_ANY
)) {
283 if ((mode
& S_IFMT
) == S_IFDIR
) {
284 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
)
285 di_flags
|= XFS_DIFLAG_RTINHERIT
;
286 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
287 di_flags
|= XFS_DIFLAG_EXTSZINHERIT
;
288 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
291 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
) {
292 di_flags
|= XFS_DIFLAG_REALTIME
;
294 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
295 di_flags
|= XFS_DIFLAG_EXTSIZE
;
296 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
299 if (pip
->i_d
.di_flags
& XFS_DIFLAG_PROJINHERIT
)
300 di_flags
|= XFS_DIFLAG_PROJINHERIT
;
301 ip
->i_d
.di_flags
|= di_flags
;
305 ip
->i_d
.di_format
= XFS_DINODE_FMT_EXTENTS
;
306 ip
->i_df
.if_flags
= XFS_IFEXTENTS
;
307 ip
->i_df
.if_bytes
= ip
->i_df
.if_real_bytes
= 0;
308 ip
->i_df
.if_u1
.if_extents
= NULL
;
313 /* Attribute fork settings for new inode. */
314 ip
->i_d
.di_aformat
= XFS_DINODE_FMT_EXTENTS
;
315 ip
->i_d
.di_anextents
= 0;
318 * set up the inode ops structure that the libxfs code relies on
321 ip
->d_ops
= ip
->i_mount
->m_dir_inode_ops
;
323 ip
->d_ops
= ip
->i_mount
->m_nondir_inode_ops
;
326 * Log the new values stuffed into the inode.
328 xfs_trans_log_inode(tp
, ip
, flags
);
337 struct xfs_icdinode
*dip
;
338 xfs_bmbt_rec_host_t
*ep
;
340 xfs_extnum_t nextents
;
342 printf("Inode %lx\n", (unsigned long)ip
);
343 printf(" i_ino %llx\n", (unsigned long long)ip
->i_ino
);
345 if (ip
->i_df
.if_flags
& XFS_IFEXTENTS
)
348 printf(" i_df.if_bytes %d\n", ip
->i_df
.if_bytes
);
349 printf(" i_df.if_u1.if_extents/if_data %lx\n",
350 (unsigned long)ip
->i_df
.if_u1
.if_extents
);
351 if (ip
->i_df
.if_flags
& XFS_IFEXTENTS
) {
352 nextents
= ip
->i_df
.if_bytes
/ (uint
)sizeof(*ep
);
353 for (ep
= ip
->i_df
.if_u1
.if_extents
, i
= 0; i
< nextents
;
357 xfs_bmbt_get_all(ep
, &rec
);
358 printf("\t%d: startoff %llu, startblock 0x%llx,"
359 " blockcount %llu, state %d\n",
360 i
, (unsigned long long)rec
.br_startoff
,
361 (unsigned long long)rec
.br_startblock
,
362 (unsigned long long)rec
.br_blockcount
,
366 printf(" i_df.if_broot %lx\n", (unsigned long)ip
->i_df
.if_broot
);
367 printf(" i_df.if_broot_bytes %x\n", ip
->i_df
.if_broot_bytes
);
370 printf("\nOn disk portion\n");
371 printf(" di_mode %o\n", VFS_I(ip
)->i_mode
);
372 printf(" di_version %x\n", (uint
)dip
->di_version
);
373 switch (ip
->i_d
.di_format
) {
374 case XFS_DINODE_FMT_LOCAL
:
375 printf(" Inline inode\n");
377 case XFS_DINODE_FMT_EXTENTS
:
378 printf(" Extents inode\n");
380 case XFS_DINODE_FMT_BTREE
:
381 printf(" B-tree inode\n");
384 printf(" Other inode\n");
387 printf(" di_nlink %x\n", VFS_I(ip
)->i_nlink
);
388 printf(" di_uid %d\n", dip
->di_uid
);
389 printf(" di_gid %d\n", dip
->di_gid
);
390 printf(" di_nextents %d\n", dip
->di_nextents
);
391 printf(" di_size %llu\n", (unsigned long long)dip
->di_size
);
392 printf(" di_gen %x\n", VFS_I(ip
)->i_generation
);
393 printf(" di_extsize %d\n", dip
->di_extsize
);
394 printf(" di_flags %x\n", dip
->di_flags
);
395 printf(" di_nblocks %llu\n", (unsigned long long)dip
->di_nblocks
);
399 * Writes a modified inode's changes out to the inode's on disk home.
400 * Originally based on xfs_iflush_int() from xfs_inode.c in the kernel.
403 libxfs_iflush_int(xfs_inode_t
*ip
, xfs_buf_t
*bp
)
405 xfs_inode_log_item_t
*iip
;
409 ASSERT(XFS_BUF_FSPRIVATE(bp
, void *) != NULL
);
410 ASSERT(ip
->i_d
.di_format
!= XFS_DINODE_FMT_BTREE
||
411 ip
->i_d
.di_nextents
> ip
->i_df
.if_ext_max
);
412 ASSERT(ip
->i_d
.di_version
> 1);
417 /* set *dip = inode's place in the buffer */
418 dip
= xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
);
420 ASSERT(ip
->i_d
.di_magic
== XFS_DINODE_MAGIC
);
422 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
423 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) );
424 } else if (XFS_ISDIR(ip
)) {
425 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
426 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) ||
427 (ip
->i_d
.di_format
== XFS_DINODE_FMT_LOCAL
) );
429 ASSERT(ip
->i_d
.di_nextents
+ip
->i_d
.di_anextents
<= ip
->i_d
.di_nblocks
);
430 ASSERT(ip
->i_d
.di_forkoff
<= mp
->m_sb
.sb_inodesize
);
432 /* bump the change count on v3 inodes */
433 if (ip
->i_d
.di_version
== 3)
434 VFS_I(ip
)->i_version
++;
437 * Copy the dirty parts of the inode into the on-disk
438 * inode. We always copy out the core of the inode,
439 * because if the inode is dirty at all the core must
442 xfs_inode_to_disk(ip
, dip
, iip
->ili_item
.li_lsn
);
444 xfs_iflush_fork(ip
, dip
, iip
, XFS_DATA_FORK
);
446 xfs_iflush_fork(ip
, dip
, iip
, XFS_ATTR_FORK
);
448 /* generate the checksum. */
449 xfs_dinode_calc_crc(mp
, dip
);
455 libxfs_mod_incore_sb(
456 struct xfs_mount
*mp
,
461 long long lcounter
; /* long counter for 64 bit fields */
464 case XFS_TRANS_SB_FDBLOCKS
:
465 lcounter
= (long long)mp
->m_sb
.sb_fdblocks
;
469 mp
->m_sb
.sb_fdblocks
= lcounter
;
478 * This routine allocates disk space for the given file.
479 * Originally derived from xfs_alloc_file_space().
482 libxfs_alloc_file_space(
491 xfs_filblks_t datablocks
;
492 xfs_filblks_t allocated_fsb
;
493 xfs_filblks_t allocatesize_fsb
;
494 xfs_fsblock_t firstfsb
;
495 struct xfs_defer_ops free_list
;
496 xfs_bmbt_irec_t
*imapp
;
497 xfs_bmbt_irec_t imaps
[1];
500 xfs_fileoff_t startoffset_fsb
;
512 xfs_bmapi_flags
= alloc_type
? XFS_BMAPI_PREALLOC
: 0;
514 startoffset_fsb
= XFS_B_TO_FSBT(mp
, offset
);
515 allocatesize_fsb
= XFS_B_TO_FSB(mp
, count
);
517 /* allocate file space until done or until there is an error */
518 while (allocatesize_fsb
&& !error
) {
519 datablocks
= allocatesize_fsb
;
521 resblks
= (uint
)XFS_DIOSTRAT_SPACE_RES(mp
, datablocks
);
522 error
= xfs_trans_alloc(mp
, &M_RES(mp
)->tr_write
, resblks
,
525 * Check for running out of space
528 ASSERT(error
== -ENOSPC
);
531 xfs_trans_ijoin(tp
, ip
, 0);
533 xfs_defer_init(&free_list
, &firstfsb
);
534 error
= xfs_bmapi_write(tp
, ip
, startoffset_fsb
, allocatesize_fsb
,
535 xfs_bmapi_flags
, &firstfsb
, 0, imapp
,
536 &reccount
, &free_list
);
541 /* complete the transaction */
542 error
= xfs_defer_finish(&tp
, &free_list
, ip
);
546 error
= xfs_trans_commit(tp
);
550 allocated_fsb
= imapp
->br_blockcount
;
554 startoffset_fsb
+= allocated_fsb
;
555 allocatesize_fsb
-= allocated_fsb
;
559 error0
: /* Cancel bmap, cancel trans */
560 xfs_defer_cancel(&free_list
);
561 xfs_trans_cancel(tp
);
/*
 * Return the smallest exponent e for which (1 << e) >= i, i.e. the base-2
 * logarithm of i rounded up.  Both 0 and 1 yield 0.  When i exceeds the
 * largest power of two that fits in an unsigned int, no exponent satisfies
 * the test and the bit width of unsigned int is returned instead.
 */
unsigned int
libxfs_log2_roundup(unsigned int i)
{
	unsigned int	rval;

	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
		/*
		 * Shift an unsigned one: "1 << 31" overflows a signed int,
		 * which is undefined behaviour, and the comparison against
		 * the unsigned argument would mix signedness.
		 */
		if ((1U << rval) >= i)
			break;
	}
	return rval;
}
578 * Wrapper around call to libxfs_ialloc. Takes care of committing and
579 * allocating a new transaction as needed.
581 * Originally there were two copies of this code - one in mkfs, the
582 * other in repair - now there is just the one.
595 xfs_buf_t
*ialloc_context
;
599 ialloc_context
= (xfs_buf_t
*)0;
600 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
, fsx
,
601 1, &ialloc_context
, &ip
);
606 if (!ialloc_context
&& !ip
) {
611 if (ialloc_context
) {
613 xfs_trans_bhold(*tp
, ialloc_context
);
615 error
= xfs_trans_roll(tp
, NULL
);
617 fprintf(stderr
, _("%s: cannot duplicate transaction: %s\n"),
618 progname
, strerror(error
));
621 xfs_trans_bjoin(*tp
, ialloc_context
);
622 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
,
623 fsx
, 1, &ialloc_context
, &ip
);
635 * Userspace versions of common diagnostic routines (varargs fun).
638 libxfs_fs_repair_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
643 vfprintf(stderr
, fmt
, ap
);
644 fprintf(stderr
, " This is a bug.\n");
645 fprintf(stderr
, "%s version %s\n", progname
, VERSION
);
646 fprintf(stderr
, "Please capture the filesystem metadata with "
647 "xfs_metadump and\nreport it to xfs@oss.sgi.com.\n");
652 libxfs_fs_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
657 vfprintf(stderr
, fmt
, ap
);
663 cmn_err(int level
, char *fmt
, ...)
668 vfprintf(stderr
, fmt
, ap
);
674 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
675 * values, and omit the stack trace unless the error level is tuned high.
681 xfs_alert(NULL
, "Metadata %s detected at %s block 0x%llx/0x%x",
682 bp
->b_error
== -EFSBADCRC
? "CRC error" : "corruption",
683 bp
->b_ops
->name
, bp
->b_bn
, BBTOB(bp
->b_length
));
687 * This is called from I/O verifiers on v5 superblock filesystems. In the
688 * kernel, it validates the metadata LSN parameter against the current LSN of
689 * the active log. We don't have an active log in userspace so this kind of
690 * validation is not required. Therefore, this function always returns true in
693 * xfs_repair piggybacks off this mechanism to help track the largest metadata
694 * LSN in use on a filesystem. Keep a record of the largest LSN seen such that
695 * repair can validate it against the state of the log.
697 xfs_lsn_t libxfs_max_lsn
= 0;
698 pthread_mutex_t libxfs_max_lsn_lock
= PTHREAD_MUTEX_INITIALIZER
;
702 struct xfs_mount
*mp
,
705 int cycle
= CYCLE_LSN(lsn
);
706 int block
= BLOCK_LSN(lsn
);
710 if (lsn
== NULLCOMMITLSN
)
713 pthread_mutex_lock(&libxfs_max_lsn_lock
);
715 max_cycle
= CYCLE_LSN(libxfs_max_lsn
);
716 max_block
= BLOCK_LSN(libxfs_max_lsn
);
718 if ((cycle
> max_cycle
) ||
719 (cycle
== max_cycle
&& block
> max_block
))
720 libxfs_max_lsn
= lsn
;
722 pthread_mutex_unlock(&libxfs_max_lsn_lock
);
727 static struct xfs_buftarg
*
728 xfs_find_bdev_for_inode(
729 struct xfs_inode
*ip
)
731 struct xfs_mount
*mp
= ip
->i_mount
;
733 if (XFS_IS_REALTIME_INODE(ip
))
734 return mp
->m_rtdev_targp
;
735 return mp
->m_ddev_targp
;
739 xfs_fsb_to_db(struct xfs_inode
*ip
, xfs_fsblock_t fsb
)
741 if (XFS_IS_REALTIME_INODE(ip
))
742 return XFS_FSB_TO_BB(ip
->i_mount
, fsb
);
743 return XFS_FSB_TO_DADDR(ip
->i_mount
, (fsb
));
748 struct xfs_inode
*ip
,
749 xfs_fsblock_t start_fsb
,
752 xfs_daddr_t sector
= xfs_fsb_to_db(ip
, start_fsb
);
753 ssize_t size
= XFS_FSB_TO_BB(ip
->i_mount
, count_fsb
);
755 return libxfs_device_zero(xfs_find_bdev_for_inode(ip
), sector
, size
);