2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 #include "libxfs_priv.h"
20 #include "libxfs_io.h"
23 #include "xfs_shared.h"
24 #include "xfs_format.h"
25 #include "xfs_log_format.h"
26 #include "xfs_trans_resv.h"
27 #include "xfs_mount.h"
28 #include "xfs_inode_buf.h"
29 #include "xfs_inode_fork.h"
30 #include "xfs_inode.h"
31 #include "xfs_trans.h"
33 #include "xfs_bmap_btree.h"
34 #include "xfs_trans_space.h"
35 #include "xfs_ialloc.h"
36 #include "xfs_alloc.h"
40 * Calculate the worst case log unit reservation for a given superblock
41 * configuration. Copied and munged from the kernel code, and assumes a
42 * worst-case header usage (maximum log buffer sizes).
45 xfs_log_calc_unit_res(
50 int iclog_header_size
;
54 if (xfs_sb_version_haslogv2(&mp
->m_sb
)) {
55 iclog_size
= XLOG_MAX_RECORD_BSIZE
;
56 iclog_header_size
= BBTOB(iclog_size
/ XLOG_HEADER_CYCLE_SIZE
);
58 iclog_size
= XLOG_BIG_RECORD_BSIZE
;
59 iclog_header_size
= BBSIZE
;
63 * Permanent reservations have up to 'cnt'-1 active log operations
64 * in the log. A unit in this case is the amount of space for one
65 * of these log operations. Normal reservations have a cnt of 1
66 * and their unit amount is the total amount of space required.
68 * The following lines of code account for non-transaction data
69 * which occupy space in the on-disk log.
71 * Normal form of a transaction is:
72 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
73 * and then there are LR hdrs, split-recs and roundoff at end of syncs.
75 * We need to account for all the leadup data and trailer data
76 * around the transaction data.
77 * And then we need to account for the worst case in terms of using more space.
79 * The worst case will happen if:
80 * - the placement of the transaction happens to be such that the
81 * roundoff is at its maximum
82 * - the transaction data is synced before the commit record is synced
83 * i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
84 * Therefore the commit record is in its own Log Record.
85 * This can happen as the commit record is called with its
86 * own region to xlog_write().
87 * This then means that in the worst case, roundoff can happen for
88 * the commit-rec as well.
89 * The commit-rec is smaller than padding in this scenario and so it is
90 * not added separately.
93 /* for trans header */
94 unit_bytes
+= sizeof(xlog_op_header_t
);
95 unit_bytes
+= sizeof(xfs_trans_header_t
);
98 unit_bytes
+= sizeof(xlog_op_header_t
);
101 * for LR headers - the space for data in an iclog is the size minus
102 * the space used for the headers. If we use the iclog size, then we
103 * undercalculate the number of headers required.
105 * Furthermore - the addition of op headers for split-recs might
106 * increase the space required enough to require more log and op
107 * headers, so take that into account too.
109 * IMPORTANT: This reservation makes the assumption that if this
110 * transaction is the first in an iclog and hence has the LR headers
111 * accounted to it, then the remaining space in the iclog is
112 * exclusively for this transaction. i.e. if the transaction is larger
113 * than the iclog, it will be the only thing in that iclog.
114 * Fundamentally, this means we must pass the entire log vector to
115 * xlog_write to guarantee this.
117 iclog_space
= iclog_size
- iclog_header_size
;
118 num_headers
= howmany(unit_bytes
, iclog_space
);
120 /* for split-recs - ophdrs added when data split over LRs */
121 unit_bytes
+= sizeof(xlog_op_header_t
) * num_headers
;
123 /* add extra header reservations if we overrun */
124 while (!num_headers
||
125 howmany(unit_bytes
, iclog_space
) > num_headers
) {
126 unit_bytes
+= sizeof(xlog_op_header_t
);
129 unit_bytes
+= iclog_header_size
* num_headers
;
131 /* for commit-rec LR header - note: padding will subsume the ophdr */
132 unit_bytes
+= iclog_header_size
;
134 /* for roundoff padding for transaction data and one for commit record */
135 if (xfs_sb_version_haslogv2(&mp
->m_sb
) && mp
->m_sb
.sb_logsunit
> 1) {
136 /* log su roundoff */
137 unit_bytes
+= 2 * mp
->m_sb
.sb_logsunit
;
140 unit_bytes
+= 2 * BBSIZE
;
147 * Change the requested timestamp in the given inode.
149 * This was once shared with the kernel, but has diverged to the point
150 * where it's no longer worth the hassle of maintaining common code.
153 libxfs_trans_ichgtime(
154 struct xfs_trans
*tp
,
155 struct xfs_inode
*ip
,
161 gettimeofday(&stv
, (struct timezone
*)0);
162 tv
.tv_sec
= stv
.tv_sec
;
163 tv
.tv_nsec
= stv
.tv_usec
* 1000;
164 if (flags
& XFS_ICHGTIME_MOD
)
165 VFS_I(ip
)->i_mtime
= tv
;
166 if (flags
& XFS_ICHGTIME_CHG
)
167 VFS_I(ip
)->i_ctime
= tv
;
168 if (flags
& XFS_ICHGTIME_CREATE
) {
169 ip
->i_d
.di_crtime
.t_sec
= (__int32_t
)tv
.tv_sec
;
170 ip
->i_d
.di_crtime
.t_nsec
= (__int32_t
)tv
.tv_nsec
;
175 * Allocate an inode on disk and return a copy of its in-core version.
176 * Set mode, nlink, and rdev appropriately within the inode.
177 * The uid and gid for the inode are set according to the contents of
178 * the given cred structure.
180 * This was once shared with the kernel, but has diverged to the point
181 * where it's no longer worth the hassle of maintaining common code.
193 xfs_buf_t
**ialloc_context
,
202 * Call the space management code to pick
203 * the on-disk inode to be allocated.
205 error
= xfs_dialloc(tp
, pip
? pip
->i_ino
: 0, mode
, okalloc
,
206 ialloc_context
, &ino
);
209 if (*ialloc_context
|| ino
== NULLFSINO
) {
213 ASSERT(*ialloc_context
== NULL
);
215 error
= xfs_trans_iget(tp
->t_mountp
, tp
, ino
, 0, 0, &ip
);
220 VFS_I(ip
)->i_mode
= mode
;
221 set_nlink(VFS_I(ip
), nlink
);
222 ip
->i_d
.di_uid
= cr
->cr_uid
;
223 ip
->i_d
.di_gid
= cr
->cr_gid
;
224 xfs_set_projid(&ip
->i_d
, pip
? 0 : fsx
->fsx_projid
);
225 xfs_trans_ichgtime(tp
, ip
, XFS_ICHGTIME_CHG
| XFS_ICHGTIME_MOD
);
228 * We only support filesystems that understand v2 format inodes. So if
229 * this is currently an old format inode, then change the inode version
230 * number now. This way we only do the conversion here rather than here
231 * and in the flush/logging code.
233 if (ip
->i_d
.di_version
== 1) {
234 ip
->i_d
.di_version
= 2;
236 * old link count, projid_lo/hi field, pad field
241 if (pip
&& (VFS_I(pip
)->i_mode
& S_ISGID
)) {
242 ip
->i_d
.di_gid
= pip
->i_d
.di_gid
;
243 if ((VFS_I(pip
)->i_mode
& S_ISGID
) && (mode
& S_IFMT
) == S_IFDIR
)
244 VFS_I(ip
)->i_mode
|= S_ISGID
;
248 ip
->i_d
.di_nextents
= 0;
249 ASSERT(ip
->i_d
.di_nblocks
== 0);
250 ip
->i_d
.di_extsize
= pip
? 0 : fsx
->fsx_extsize
;
251 ip
->i_d
.di_dmevmask
= 0;
252 ip
->i_d
.di_dmstate
= 0;
253 ip
->i_d
.di_flags
= pip
? 0 : fsx
->fsx_xflags
;
255 if (ip
->i_d
.di_version
== 3) {
256 ASSERT(ip
->i_d
.di_ino
== ino
);
257 ASSERT(uuid_equal(&ip
->i_d
.di_uuid
, &mp
->m_sb
.sb_meta_uuid
));
258 VFS_I(ip
)->i_version
= 1;
259 ip
->i_d
.di_flags2
= 0;
260 ip
->i_d
.di_crtime
.t_sec
= (__int32_t
)VFS_I(ip
)->i_mtime
.tv_sec
;
261 ip
->i_d
.di_crtime
.t_nsec
= (__int32_t
)VFS_I(ip
)->i_mtime
.tv_nsec
;
264 flags
= XFS_ILOG_CORE
;
265 switch (mode
& S_IFMT
) {
268 /* doesn't make sense to set an rdev for these */
273 ip
->i_d
.di_format
= XFS_DINODE_FMT_DEV
;
274 ip
->i_df
.if_u2
.if_rdev
= rdev
;
275 flags
|= XFS_ILOG_DEV
;
279 if (pip
&& (pip
->i_d
.di_flags
& XFS_DIFLAG_ANY
)) {
282 if ((mode
& S_IFMT
) == S_IFDIR
) {
283 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
)
284 di_flags
|= XFS_DIFLAG_RTINHERIT
;
285 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
286 di_flags
|= XFS_DIFLAG_EXTSZINHERIT
;
287 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
290 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
) {
291 di_flags
|= XFS_DIFLAG_REALTIME
;
293 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
294 di_flags
|= XFS_DIFLAG_EXTSIZE
;
295 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
298 if (pip
->i_d
.di_flags
& XFS_DIFLAG_PROJINHERIT
)
299 di_flags
|= XFS_DIFLAG_PROJINHERIT
;
300 ip
->i_d
.di_flags
|= di_flags
;
304 ip
->i_d
.di_format
= XFS_DINODE_FMT_EXTENTS
;
305 ip
->i_df
.if_flags
= XFS_IFEXTENTS
;
306 ip
->i_df
.if_bytes
= ip
->i_df
.if_real_bytes
= 0;
307 ip
->i_df
.if_u1
.if_extents
= NULL
;
312 /* Attribute fork settings for new inode. */
313 ip
->i_d
.di_aformat
= XFS_DINODE_FMT_EXTENTS
;
314 ip
->i_d
.di_anextents
= 0;
317 * set up the inode ops structure that the libxfs code relies on
320 ip
->d_ops
= ip
->i_mount
->m_dir_inode_ops
;
322 ip
->d_ops
= ip
->i_mount
->m_nondir_inode_ops
;
325 * Log the new values stuffed into the inode.
327 xfs_trans_log_inode(tp
, ip
, flags
);
336 struct xfs_icdinode
*dip
;
337 xfs_bmbt_rec_host_t
*ep
;
339 xfs_extnum_t nextents
;
341 printf("Inode %lx\n", (unsigned long)ip
);
342 printf(" i_ino %llx\n", (unsigned long long)ip
->i_ino
);
344 if (ip
->i_df
.if_flags
& XFS_IFEXTENTS
)
347 printf(" i_df.if_bytes %d\n", ip
->i_df
.if_bytes
);
348 printf(" i_df.if_u1.if_extents/if_data %lx\n",
349 (unsigned long)ip
->i_df
.if_u1
.if_extents
);
350 if (ip
->i_df
.if_flags
& XFS_IFEXTENTS
) {
351 nextents
= ip
->i_df
.if_bytes
/ (uint
)sizeof(*ep
);
352 for (ep
= ip
->i_df
.if_u1
.if_extents
, i
= 0; i
< nextents
;
356 xfs_bmbt_get_all(ep
, &rec
);
357 printf("\t%d: startoff %llu, startblock 0x%llx,"
358 " blockcount %llu, state %d\n",
359 i
, (unsigned long long)rec
.br_startoff
,
360 (unsigned long long)rec
.br_startblock
,
361 (unsigned long long)rec
.br_blockcount
,
365 printf(" i_df.if_broot %lx\n", (unsigned long)ip
->i_df
.if_broot
);
366 printf(" i_df.if_broot_bytes %x\n", ip
->i_df
.if_broot_bytes
);
369 printf("\nOn disk portion\n");
370 printf(" di_mode %o\n", VFS_I(ip
)->i_mode
);
371 printf(" di_version %x\n", (uint
)dip
->di_version
);
372 switch (ip
->i_d
.di_format
) {
373 case XFS_DINODE_FMT_LOCAL
:
374 printf(" Inline inode\n");
376 case XFS_DINODE_FMT_EXTENTS
:
377 printf(" Extents inode\n");
379 case XFS_DINODE_FMT_BTREE
:
380 printf(" B-tree inode\n");
383 printf(" Other inode\n");
386 printf(" di_nlink %x\n", VFS_I(ip
)->i_nlink
);
387 printf(" di_uid %d\n", dip
->di_uid
);
388 printf(" di_gid %d\n", dip
->di_gid
);
389 printf(" di_nextents %d\n", dip
->di_nextents
);
390 printf(" di_size %llu\n", (unsigned long long)dip
->di_size
);
391 printf(" di_gen %x\n", VFS_I(ip
)->i_generation
);
392 printf(" di_extsize %d\n", dip
->di_extsize
);
393 printf(" di_flags %x\n", dip
->di_flags
);
394 printf(" di_nblocks %llu\n", (unsigned long long)dip
->di_nblocks
);
398 * Writes a modified inode's changes out to the inode's on disk home.
399 * Originally based on xfs_iflush_int() from xfs_inode.c in the kernel.
402 libxfs_iflush_int(xfs_inode_t
*ip
, xfs_buf_t
*bp
)
404 xfs_inode_log_item_t
*iip
;
408 ASSERT(XFS_BUF_FSPRIVATE(bp
, void *) != NULL
);
409 ASSERT(ip
->i_d
.di_format
!= XFS_DINODE_FMT_BTREE
||
410 ip
->i_d
.di_nextents
> ip
->i_df
.if_ext_max
);
411 ASSERT(ip
->i_d
.di_version
> 1);
416 /* set *dip = inode's place in the buffer */
417 dip
= xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
);
419 ASSERT(ip
->i_d
.di_magic
== XFS_DINODE_MAGIC
);
421 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
422 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) );
423 } else if (XFS_ISDIR(ip
)) {
424 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
425 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) ||
426 (ip
->i_d
.di_format
== XFS_DINODE_FMT_LOCAL
) );
428 ASSERT(ip
->i_d
.di_nextents
+ip
->i_d
.di_anextents
<= ip
->i_d
.di_nblocks
);
429 ASSERT(ip
->i_d
.di_forkoff
<= mp
->m_sb
.sb_inodesize
);
431 /* bump the change count on v3 inodes */
432 if (ip
->i_d
.di_version
== 3)
433 VFS_I(ip
)->i_version
++;
436 * Copy the dirty parts of the inode into the on-disk
437 * inode. We always copy out the core of the inode,
438 * because if the inode is dirty at all the core must be.
441 xfs_inode_to_disk(ip
, dip
, iip
->ili_item
.li_lsn
);
443 xfs_iflush_fork(ip
, dip
, iip
, XFS_DATA_FORK
);
445 xfs_iflush_fork(ip
, dip
, iip
, XFS_ATTR_FORK
);
447 /* generate the checksum. */
448 xfs_dinode_calc_crc(mp
, dip
);
454 libxfs_mod_incore_sb(
455 struct xfs_mount
*mp
,
460 long long lcounter
; /* long counter for 64 bit fields */
463 case XFS_TRANS_SB_FDBLOCKS
:
464 lcounter
= (long long)mp
->m_sb
.sb_fdblocks
;
468 mp
->m_sb
.sb_fdblocks
= lcounter
;
478 struct xfs_trans
**tp
,
479 struct xfs_bmap_free
*flist
,
480 struct xfs_inode
*ip
)
482 xfs_bmap_free_item_t
*free
; /* free extent list item */
483 xfs_bmap_free_item_t
*next
; /* next item on free list */
486 if (flist
->xbf_count
== 0)
489 for (free
= flist
->xbf_first
; free
!= NULL
; free
= next
) {
490 next
= free
->xbfi_next
;
491 error
= xfs_free_extent(*tp
, free
->xbfi_startblock
,
492 free
->xbfi_blockcount
);
495 xfs_bmap_del_free(flist
, NULL
, free
);
501 * This routine allocates disk space for the given file.
502 * Originally derived from xfs_alloc_file_space().
505 libxfs_alloc_file_space(
514 xfs_filblks_t datablocks
;
515 xfs_filblks_t allocated_fsb
;
516 xfs_filblks_t allocatesize_fsb
;
517 xfs_fsblock_t firstfsb
;
518 xfs_bmap_free_t free_list
;
519 xfs_bmbt_irec_t
*imapp
;
520 xfs_bmbt_irec_t imaps
[1];
523 xfs_fileoff_t startoffset_fsb
;
535 xfs_bmapi_flags
= alloc_type
? XFS_BMAPI_PREALLOC
: 0;
537 startoffset_fsb
= XFS_B_TO_FSBT(mp
, offset
);
538 allocatesize_fsb
= XFS_B_TO_FSB(mp
, count
);
540 /* allocate file space until done or until there is an error */
541 while (allocatesize_fsb
&& !error
) {
542 datablocks
= allocatesize_fsb
;
544 tp
= xfs_trans_alloc(mp
, XFS_TRANS_DIOSTRAT
);
545 resblks
= (uint
)XFS_DIOSTRAT_SPACE_RES(mp
, datablocks
);
546 error
= xfs_trans_reserve(tp
, &M_RES(mp
)->tr_write
,
549 * Check for running out of space
553 * Free the transaction structure.
555 ASSERT(error
== -ENOSPC
);
556 xfs_trans_cancel(tp
);
559 xfs_trans_ijoin(tp
, ip
, 0);
561 xfs_bmap_init(&free_list
, &firstfsb
);
562 error
= xfs_bmapi_write(tp
, ip
, startoffset_fsb
, allocatesize_fsb
,
563 xfs_bmapi_flags
, &firstfsb
, 0, imapp
,
564 &reccount
, &free_list
);
569 /* complete the transaction */
570 error
= xfs_bmap_finish(&tp
, &free_list
, ip
);
574 error
= xfs_trans_commit(tp
);
578 allocated_fsb
= imapp
->br_blockcount
;
582 startoffset_fsb
+= allocated_fsb
;
583 allocatesize_fsb
-= allocated_fsb
;
587 error0
: /* Cancel bmap, cancel trans */
588 xfs_bmap_cancel(&free_list
);
589 xfs_trans_cancel(tp
);
/*
 * Return the smallest exponent r for which 2^r >= i, i.e. log2(i) rounded
 * up.  Inputs of 0 and 1 both yield 0.  When no power of two representable
 * in an unsigned int is >= i, the result is the bit width of unsigned int.
 */
unsigned int
libxfs_log2_roundup(unsigned int i)
{
	unsigned int	rval;

	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
		/*
		 * Shift an unsigned one: shifting a plain int 1 into the sign
		 * bit (rval == width - 1) is undefined behaviour, and the
		 * comparison against the unsigned argument would otherwise
		 * mix signed and unsigned operands.
		 */
		if ((1U << rval) >= i)
			break;
	}
	return rval;
}
606 * Wrapper around call to libxfs_ialloc. Takes care of committing and
607 * allocating a new transaction as needed.
609 * Originally there were two copies of this code - one in mkfs, the
610 * other in repair - now there is just the one.
623 xfs_buf_t
*ialloc_context
;
627 ialloc_context
= (xfs_buf_t
*)0;
628 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
, fsx
,
629 1, &ialloc_context
, &ip
);
634 if (!ialloc_context
&& !ip
) {
639 if (ialloc_context
) {
641 xfs_trans_bhold(*tp
, ialloc_context
);
643 error
= xfs_trans_roll(tp
, NULL
);
645 fprintf(stderr
, _("%s: cannot duplicate transaction: %s\n"),
646 progname
, strerror(error
));
649 xfs_trans_bjoin(*tp
, ialloc_context
);
650 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
,
651 fsx
, 1, &ialloc_context
, &ip
);
663 * Userspace versions of common diagnostic routines (varargs fun).
666 libxfs_fs_repair_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
671 vfprintf(stderr
, fmt
, ap
);
672 fprintf(stderr
, " This is a bug.\n");
673 fprintf(stderr
, "%s version %s\n", progname
, VERSION
);
674 fprintf(stderr
, "Please capture the filesystem metadata with "
675 "xfs_metadump and\nreport it to xfs@oss.sgi.com.\n");
680 libxfs_fs_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
685 vfprintf(stderr
, fmt
, ap
);
691 cmn_err(int level
, char *fmt
, ...)
696 vfprintf(stderr
, fmt
, ap
);
702 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
703 * values, and omit the stack trace unless the error level is tuned high.
709 xfs_alert(NULL
, "Metadata %s detected at %s block 0x%llx/0x%x",
710 bp
->b_error
== -EFSBADCRC
? "CRC error" : "corruption",
711 bp
->b_ops
->name
, bp
->b_bn
, BBTOB(bp
->b_length
));
715 * This is called from I/O verifiers on v5 superblock filesystems. In the
716 * kernel, it validates the metadata LSN parameter against the current LSN of
717 * the active log. We don't have an active log in userspace so this kind of
718 * validation is not required. Therefore, this function always returns true in userspace.
721 * xfs_repair piggybacks off this mechanism to help track the largest metadata
722 * LSN in use on a filesystem. Keep a record of the largest LSN seen such that
723 * repair can validate it against the state of the log.
725 xfs_lsn_t libxfs_max_lsn
= 0;
726 pthread_mutex_t libxfs_max_lsn_lock
= PTHREAD_MUTEX_INITIALIZER
;
730 struct xfs_mount
*mp
,
733 int cycle
= CYCLE_LSN(lsn
);
734 int block
= BLOCK_LSN(lsn
);
738 if (lsn
== NULLCOMMITLSN
)
741 pthread_mutex_lock(&libxfs_max_lsn_lock
);
743 max_cycle
= CYCLE_LSN(libxfs_max_lsn
);
744 max_block
= BLOCK_LSN(libxfs_max_lsn
);
746 if ((cycle
> max_cycle
) ||
747 (cycle
== max_cycle
&& block
> max_block
))
748 libxfs_max_lsn
= lsn
;
750 pthread_mutex_unlock(&libxfs_max_lsn_lock
);
755 static struct xfs_buftarg
*
756 xfs_find_bdev_for_inode(
757 struct xfs_inode
*ip
)
759 struct xfs_mount
*mp
= ip
->i_mount
;
761 if (XFS_IS_REALTIME_INODE(ip
))
762 return mp
->m_rtdev_targp
;
763 return mp
->m_ddev_targp
;
767 xfs_fsb_to_db(struct xfs_inode
*ip
, xfs_fsblock_t fsb
)
769 if (XFS_IS_REALTIME_INODE(ip
))
770 return XFS_FSB_TO_BB(ip
->i_mount
, fsb
);
771 return XFS_FSB_TO_DADDR(ip
->i_mount
, (fsb
));
776 struct xfs_inode
*ip
,
777 xfs_fsblock_t start_fsb
,
780 xfs_daddr_t sector
= xfs_fsb_to_db(ip
, start_fsb
);
781 ssize_t size
= XFS_FSB_TO_BB(ip
->i_mount
, count_fsb
);
783 return libxfs_device_zero(xfs_find_bdev_for_inode(ip
), sector
, size
);