2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 #include "libxfs_priv.h"
22 #include "xfs_shared.h"
23 #include "xfs_format.h"
24 #include "xfs_log_format.h"
25 #include "xfs_trans_resv.h"
26 #include "xfs_mount.h"
27 #include "xfs_inode_buf.h"
28 #include "xfs_inode_fork.h"
29 #include "xfs_inode.h"
30 #include "xfs_trans.h"
32 #include "xfs_bmap_btree.h"
33 #include "xfs_trans_space.h"
34 #include "xfs_ialloc.h"
35 #include "xfs_alloc.h"
38 * Calculate the worst case log unit reservation for a given superblock
39 * configuration. Copied and munged from the kernel code, and assumes a
40 * worst case header usage (maximum log buffer sizes)
43 xfs_log_calc_unit_res(
48 int iclog_header_size
;
52 if (xfs_sb_version_haslogv2(&mp
->m_sb
)) {
53 iclog_size
= XLOG_MAX_RECORD_BSIZE
;
54 iclog_header_size
= BBTOB(iclog_size
/ XLOG_HEADER_CYCLE_SIZE
);
56 iclog_size
= XLOG_BIG_RECORD_BSIZE
;
57 iclog_header_size
= BBSIZE
;
61 * Permanent reservations have up to 'cnt'-1 active log operations
62 * in the log. A unit in this case is the amount of space for one
63 * of these log operations. Normal reservations have a cnt of 1
64 * and their unit amount is the total amount of space required.
66 * The following lines of code account for non-transaction data
67 * which occupy space in the on-disk log.
69 * Normal form of a transaction is:
70 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
71 * and then there are LR hdrs, split-recs and roundoff at end of syncs.
73 * We need to account for all the leadup data and trailer data
74 * around the transaction data.
75 * And then we need to account for the worst case in terms of using more space.
77 * The worst case will happen if:
78 * - the placement of the transaction happens to be such that the
79 * roundoff is at its maximum
80 * - the transaction data is synced before the commit record is synced
81 * i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
82 * Therefore the commit record is in its own Log Record.
83 * This can happen as the commit record is called with its
84 * own region to xlog_write().
85 * This then means that in the worst case, roundoff can happen for
86 * the commit-rec as well.
87 * The commit-rec is smaller than padding in this scenario and so it is
88 * not added separately.
91 /* for trans header */
92 unit_bytes
+= sizeof(xlog_op_header_t
);
93 unit_bytes
+= sizeof(xfs_trans_header_t
);
96 unit_bytes
+= sizeof(xlog_op_header_t
);
99 * for LR headers - the space for data in an iclog is the size minus
100 * the space used for the headers. If we use the iclog size, then we
101 * undercalculate the number of headers required.
103 * Furthermore - the addition of op headers for split-recs might
104 * increase the space required enough to require more log and op
105 * headers, so take that into account too.
107 * IMPORTANT: This reservation makes the assumption that if this
108 * transaction is the first in an iclog and hence has the LR headers
109 * accounted to it, then the remaining space in the iclog is
110 * exclusively for this transaction. i.e. if the transaction is larger
111 * than the iclog, it will be the only thing in that iclog.
112 * Fundamentally, this means we must pass the entire log vector to
113 * xlog_write to guarantee this.
115 iclog_space
= iclog_size
- iclog_header_size
;
116 num_headers
= howmany(unit_bytes
, iclog_space
);
118 /* for split-recs - ophdrs added when data split over LRs */
119 unit_bytes
+= sizeof(xlog_op_header_t
) * num_headers
;
121 /* add extra header reservations if we overrun */
122 while (!num_headers
||
123 howmany(unit_bytes
, iclog_space
) > num_headers
) {
124 unit_bytes
+= sizeof(xlog_op_header_t
);
127 unit_bytes
+= iclog_header_size
* num_headers
;
129 /* for commit-rec LR header - note: padding will subsume the ophdr */
130 unit_bytes
+= iclog_header_size
;
132 /* for roundoff padding for transaction data and one for commit record */
133 if (xfs_sb_version_haslogv2(&mp
->m_sb
) && mp
->m_sb
.sb_logsunit
> 1) {
134 /* log su roundoff */
135 unit_bytes
+= 2 * mp
->m_sb
.sb_logsunit
;
138 unit_bytes
+= 2 * BBSIZE
;
145 * Change the requested timestamp in the given inode.
147 * This was once shared with the kernel, but has diverged to the point
148 * where it's no longer worth the hassle of maintaining common code.
151 libxfs_trans_ichgtime(
152 struct xfs_trans
*tp
,
153 struct xfs_inode
*ip
,
159 gettimeofday(&stv
, (struct timezone
*)0);
160 tv
.tv_sec
= stv
.tv_sec
;
161 tv
.tv_nsec
= stv
.tv_usec
* 1000;
162 if (flags
& XFS_ICHGTIME_MOD
) {
163 ip
->i_d
.di_mtime
.t_sec
= (__int32_t
)tv
.tv_sec
;
164 ip
->i_d
.di_mtime
.t_nsec
= (__int32_t
)tv
.tv_nsec
;
166 if (flags
& XFS_ICHGTIME_CHG
) {
167 ip
->i_d
.di_ctime
.t_sec
= (__int32_t
)tv
.tv_sec
;
168 ip
->i_d
.di_ctime
.t_nsec
= (__int32_t
)tv
.tv_nsec
;
170 if (flags
& XFS_ICHGTIME_CREATE
) {
171 ip
->i_d
.di_crtime
.t_sec
= (__int32_t
)tv
.tv_sec
;
172 ip
->i_d
.di_crtime
.t_nsec
= (__int32_t
)tv
.tv_nsec
;
177 * Allocate an inode on disk and return a copy of its in-core version.
178 * Set mode, nlink, and rdev appropriately within the inode.
179 * The uid and gid for the inode are set according to the contents of
180 * the given cred structure.
182 * This was once shared with the kernel, but has diverged to the point
183 * where it's no longer worth the hassle of maintaining common code.
195 xfs_buf_t
**ialloc_context
,
204 * Call the space management code to pick
205 * the on-disk inode to be allocated.
207 error
= xfs_dialloc(tp
, pip
? pip
->i_ino
: 0, mode
, okalloc
,
208 ialloc_context
, &ino
);
211 if (*ialloc_context
|| ino
== NULLFSINO
) {
215 ASSERT(*ialloc_context
== NULL
);
217 error
= xfs_trans_iget(tp
->t_mountp
, tp
, ino
, 0, 0, &ip
);
222 ip
->i_d
.di_mode
= (__uint16_t
)mode
;
223 ip
->i_d
.di_onlink
= 0;
224 ip
->i_d
.di_nlink
= nlink
;
225 ASSERT(ip
->i_d
.di_nlink
== nlink
);
226 ip
->i_d
.di_uid
= cr
->cr_uid
;
227 ip
->i_d
.di_gid
= cr
->cr_gid
;
228 xfs_set_projid(&ip
->i_d
, pip
? 0 : fsx
->fsx_projid
);
229 memset(&(ip
->i_d
.di_pad
[0]), 0, sizeof(ip
->i_d
.di_pad
));
230 xfs_trans_ichgtime(tp
, ip
, XFS_ICHGTIME_CHG
| XFS_ICHGTIME_MOD
);
233 * We only support filesystems that understand v2 format inodes. So if
234 * this is currently an old format inode, then change the inode version
235 * number now. This way we only do the conversion here rather than here
236 * and in the flush/logging code.
238 if (ip
->i_d
.di_version
== 1) {
239 ip
->i_d
.di_version
= 2;
241 * old link count, projid_lo/hi field, pad field already zeroed
246 if (pip
&& (pip
->i_d
.di_mode
& S_ISGID
)) {
247 ip
->i_d
.di_gid
= pip
->i_d
.di_gid
;
248 if ((pip
->i_d
.di_mode
& S_ISGID
) && (mode
& S_IFMT
) == S_IFDIR
)
249 ip
->i_d
.di_mode
|= S_ISGID
;
253 ip
->i_d
.di_nextents
= 0;
254 ASSERT(ip
->i_d
.di_nblocks
== 0);
256 * di_gen will have been taken care of in xfs_iread.
258 ip
->i_d
.di_extsize
= pip
? 0 : fsx
->fsx_extsize
;
259 ip
->i_d
.di_dmevmask
= 0;
260 ip
->i_d
.di_dmstate
= 0;
261 ip
->i_d
.di_flags
= pip
? 0 : fsx
->fsx_xflags
;
263 if (ip
->i_d
.di_version
== 3) {
264 ASSERT(ip
->i_d
.di_ino
== ino
);
265 ASSERT(uuid_equal(&ip
->i_d
.di_uuid
, &mp
->m_sb
.sb_meta_uuid
));
267 ip
->i_d
.di_changecount
= 1;
269 ip
->i_d
.di_flags2
= 0;
270 memset(&(ip
->i_d
.di_pad2
[0]), 0, sizeof(ip
->i_d
.di_pad2
));
271 ip
->i_d
.di_crtime
= ip
->i_d
.di_mtime
;
274 flags
= XFS_ILOG_CORE
;
275 switch (mode
& S_IFMT
) {
278 /* doesn't make sense to set an rdev for these */
283 ip
->i_d
.di_format
= XFS_DINODE_FMT_DEV
;
284 ip
->i_df
.if_u2
.if_rdev
= rdev
;
285 flags
|= XFS_ILOG_DEV
;
289 if (pip
&& (pip
->i_d
.di_flags
& XFS_DIFLAG_ANY
)) {
292 if ((mode
& S_IFMT
) == S_IFDIR
) {
293 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
)
294 di_flags
|= XFS_DIFLAG_RTINHERIT
;
295 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
296 di_flags
|= XFS_DIFLAG_EXTSZINHERIT
;
297 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
300 if (pip
->i_d
.di_flags
& XFS_DIFLAG_RTINHERIT
) {
301 di_flags
|= XFS_DIFLAG_REALTIME
;
303 if (pip
->i_d
.di_flags
& XFS_DIFLAG_EXTSZINHERIT
) {
304 di_flags
|= XFS_DIFLAG_EXTSIZE
;
305 ip
->i_d
.di_extsize
= pip
->i_d
.di_extsize
;
308 if (pip
->i_d
.di_flags
& XFS_DIFLAG_PROJINHERIT
)
309 di_flags
|= XFS_DIFLAG_PROJINHERIT
;
310 ip
->i_d
.di_flags
|= di_flags
;
314 ip
->i_d
.di_format
= XFS_DINODE_FMT_EXTENTS
;
315 ip
->i_df
.if_flags
= XFS_IFEXTENTS
;
316 ip
->i_df
.if_bytes
= ip
->i_df
.if_real_bytes
= 0;
317 ip
->i_df
.if_u1
.if_extents
= NULL
;
322 /* Attribute fork settings for new inode. */
323 ip
->i_d
.di_aformat
= XFS_DINODE_FMT_EXTENTS
;
324 ip
->i_d
.di_anextents
= 0;
327 * set up the inode ops structure that the libxfs code relies on
329 if (S_ISDIR(ip
->i_d
.di_mode
))
330 ip
->d_ops
= ip
->i_mount
->m_dir_inode_ops
;
332 ip
->d_ops
= ip
->i_mount
->m_nondir_inode_ops
;
335 * Log the new values stuffed into the inode.
337 xfs_trans_log_inode(tp
, ip
, flags
);
347 xfs_bmbt_rec_host_t
*ep
;
349 xfs_extnum_t nextents
;
351 printf("Inode %lx\n", (unsigned long)ip
);
352 printf(" i_ino %llx\n", (unsigned long long)ip
->i_ino
);
354 if (ip
->i_df
.if_flags
& XFS_IFEXTENTS
)
357 printf(" i_df.if_bytes %d\n", ip
->i_df
.if_bytes
);
358 printf(" i_df.if_u1.if_extents/if_data %lx\n",
359 (unsigned long)ip
->i_df
.if_u1
.if_extents
);
360 if (ip
->i_df
.if_flags
& XFS_IFEXTENTS
) {
361 nextents
= ip
->i_df
.if_bytes
/ (uint
)sizeof(*ep
);
362 for (ep
= ip
->i_df
.if_u1
.if_extents
, i
= 0; i
< nextents
;
366 xfs_bmbt_get_all(ep
, &rec
);
367 printf("\t%d: startoff %llu, startblock 0x%llx,"
368 " blockcount %llu, state %d\n",
369 i
, (unsigned long long)rec
.br_startoff
,
370 (unsigned long long)rec
.br_startblock
,
371 (unsigned long long)rec
.br_blockcount
,
375 printf(" i_df.if_broot %lx\n", (unsigned long)ip
->i_df
.if_broot
);
376 printf(" i_df.if_broot_bytes %x\n", ip
->i_df
.if_broot_bytes
);
379 printf("\nOn disk portion\n");
380 printf(" di_magic %x\n", dip
->di_magic
);
381 printf(" di_mode %o\n", dip
->di_mode
);
382 printf(" di_version %x\n", (uint
)dip
->di_version
);
383 switch (ip
->i_d
.di_format
) {
384 case XFS_DINODE_FMT_LOCAL
:
385 printf(" Inline inode\n");
387 case XFS_DINODE_FMT_EXTENTS
:
388 printf(" Extents inode\n");
390 case XFS_DINODE_FMT_BTREE
:
391 printf(" B-tree inode\n");
394 printf(" Other inode\n");
397 printf(" di_nlink %x\n", dip
->di_nlink
);
398 printf(" di_uid %d\n", dip
->di_uid
);
399 printf(" di_gid %d\n", dip
->di_gid
);
400 printf(" di_nextents %d\n", dip
->di_nextents
);
401 printf(" di_size %llu\n", (unsigned long long)dip
->di_size
);
402 printf(" di_gen %x\n", dip
->di_gen
);
403 printf(" di_extsize %d\n", dip
->di_extsize
);
404 printf(" di_flags %x\n", dip
->di_flags
);
405 printf(" di_nblocks %llu\n", (unsigned long long)dip
->di_nblocks
);
409 * Writes a modified inode's changes out to the inode's on disk home.
410 * Originally based on xfs_iflush_int() from xfs_inode.c in the kernel.
413 libxfs_iflush_int(xfs_inode_t
*ip
, xfs_buf_t
*bp
)
415 xfs_inode_log_item_t
*iip
;
419 ASSERT(XFS_BUF_FSPRIVATE(bp
, void *) != NULL
);
420 ASSERT(ip
->i_d
.di_format
!= XFS_DINODE_FMT_BTREE
||
421 ip
->i_d
.di_nextents
> ip
->i_df
.if_ext_max
);
422 ASSERT(ip
->i_d
.di_version
> 1);
427 /* set *dip = inode's place in the buffer */
428 dip
= xfs_buf_offset(bp
, ip
->i_imap
.im_boffset
);
430 ASSERT(ip
->i_d
.di_magic
== XFS_DINODE_MAGIC
);
431 if ((ip
->i_d
.di_mode
& S_IFMT
) == S_IFREG
) {
432 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
433 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) );
435 else if ((ip
->i_d
.di_mode
& S_IFMT
) == S_IFDIR
) {
436 ASSERT( (ip
->i_d
.di_format
== XFS_DINODE_FMT_EXTENTS
) ||
437 (ip
->i_d
.di_format
== XFS_DINODE_FMT_BTREE
) ||
438 (ip
->i_d
.di_format
== XFS_DINODE_FMT_LOCAL
) );
440 ASSERT(ip
->i_d
.di_nextents
+ip
->i_d
.di_anextents
<= ip
->i_d
.di_nblocks
);
441 ASSERT(ip
->i_d
.di_forkoff
<= mp
->m_sb
.sb_inodesize
);
443 /* bump the change count on v3 inodes */
444 if (ip
->i_d
.di_version
== 3)
445 ip
->i_d
.di_changecount
++;
448 * Copy the dirty parts of the inode into the on-disk
449 * inode. We always copy out the core of the inode,
450 * because if the inode is dirty at all the core must be.
453 xfs_dinode_to_disk(dip
, &ip
->i_d
);
455 xfs_iflush_fork(ip
, dip
, iip
, XFS_DATA_FORK
);
457 xfs_iflush_fork(ip
, dip
, iip
, XFS_ATTR_FORK
);
459 /* update the lsn in the on disk inode if required */
460 if (ip
->i_d
.di_version
== 3)
461 dip
->di_lsn
= cpu_to_be64(iip
->ili_item
.li_lsn
);
463 /* generate the checksum. */
464 xfs_dinode_calc_crc(mp
, dip
);
470 libxfs_mod_incore_sb(
471 struct xfs_mount
*mp
,
476 long long lcounter
; /* long counter for 64 bit fields */
479 case XFS_TRANS_SB_FDBLOCKS
:
480 lcounter
= (long long)mp
->m_sb
.sb_fdblocks
;
484 mp
->m_sb
.sb_fdblocks
= lcounter
;
495 xfs_bmap_free_t
*flist
,
498 xfs_bmap_free_item_t
*free
; /* free extent list item */
499 xfs_bmap_free_item_t
*next
; /* next item on free list */
502 if (flist
->xbf_count
== 0) {
507 for (free
= flist
->xbf_first
; free
!= NULL
; free
= next
) {
508 next
= free
->xbfi_next
;
509 if ((error
= xfs_free_extent(*tp
, free
->xbfi_startblock
,
510 free
->xbfi_blockcount
)))
512 xfs_bmap_del_free(flist
, NULL
, free
);
519 * This routine allocates disk space for the given file.
520 * Originally derived from xfs_alloc_file_space().
523 libxfs_alloc_file_space(
532 xfs_filblks_t datablocks
;
533 xfs_filblks_t allocated_fsb
;
534 xfs_filblks_t allocatesize_fsb
;
535 xfs_fsblock_t firstfsb
;
536 xfs_bmap_free_t free_list
;
537 xfs_bmbt_irec_t
*imapp
;
538 xfs_bmbt_irec_t imaps
[1];
541 xfs_fileoff_t startoffset_fsb
;
554 xfs_bmapi_flags
= alloc_type
? XFS_BMAPI_PREALLOC
: 0;
556 startoffset_fsb
= XFS_B_TO_FSBT(mp
, offset
);
557 allocatesize_fsb
= XFS_B_TO_FSB(mp
, count
);
559 /* allocate file space until done or until there is an error */
560 while (allocatesize_fsb
&& !error
) {
561 datablocks
= allocatesize_fsb
;
563 tp
= xfs_trans_alloc(mp
, XFS_TRANS_DIOSTRAT
);
564 resblks
= (uint
)XFS_DIOSTRAT_SPACE_RES(mp
, datablocks
);
565 error
= xfs_trans_reserve(tp
, &M_RES(mp
)->tr_write
,
568 * Check for running out of space
572 * Free the transaction structure.
574 ASSERT(error
== -ENOSPC
);
575 xfs_trans_cancel(tp
);
578 xfs_trans_ijoin(tp
, ip
, 0);
580 xfs_bmap_init(&free_list
, &firstfsb
);
581 error
= xfs_bmapi_write(tp
, ip
, startoffset_fsb
, allocatesize_fsb
,
582 xfs_bmapi_flags
, &firstfsb
, 0, imapp
,
583 &reccount
, &free_list
);
588 /* complete the transaction */
589 error
= xfs_bmap_finish(&tp
, &free_list
, &committed
);
593 error
= xfs_trans_commit(tp
);
597 allocated_fsb
= imapp
->br_blockcount
;
601 startoffset_fsb
+= allocated_fsb
;
602 allocatesize_fsb
-= allocated_fsb
;
606 error0
: /* Cancel bmap, cancel trans */
607 xfs_bmap_cancel(&free_list
);
608 xfs_trans_cancel(tp
);
/*
 * Return the base-2 logarithm of i, rounded up: the smallest exponent e
 * for which 2^e >= i.  Inputs of 0 and 1 both yield 0.  If i is larger
 * than the highest power of two representable in an unsigned int, the
 * loop runs to completion and the bit width of unsigned int is returned.
 */
int
libxfs_log2_roundup(unsigned int i)
{
	unsigned int	rval;

	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
		/*
		 * Shift an unsigned one: shifting a signed 1 into the sign
		 * bit (rval == 31 for a 32-bit int) is undefined behaviour.
		 */
		if ((1U << rval) >= i)
			break;
	}
	return rval;
}
625 * Wrapper around call to libxfs_ialloc. Takes care of committing and
626 * allocating a new transaction as needed.
628 * Originally there were two copies of this code - one in mkfs, the
629 * other in repair - now there is just the one.
642 xfs_buf_t
*ialloc_context
;
646 ialloc_context
= (xfs_buf_t
*)0;
647 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
, fsx
,
648 1, &ialloc_context
, &ip
);
653 if (!ialloc_context
&& !ip
) {
658 if (ialloc_context
) {
660 xfs_trans_bhold(*tp
, ialloc_context
);
662 error
= xfs_trans_roll(tp
, NULL
);
664 fprintf(stderr
, _("%s: cannot duplicate transaction: %s\n"),
665 progname
, strerror(error
));
668 xfs_trans_bjoin(*tp
, ialloc_context
);
669 error
= libxfs_ialloc(*tp
, pip
, mode
, nlink
, rdev
, cr
,
670 fsx
, 1, &ialloc_context
, &ip
);
682 * Userspace versions of common diagnostic routines (varargs fun).
685 libxfs_fs_repair_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
690 vfprintf(stderr
, fmt
, ap
);
691 fprintf(stderr
, " This is a bug.\n");
692 fprintf(stderr
, "%s version %s\n", progname
, VERSION
);
693 fprintf(stderr
, "Please capture the filesystem metadata with "
694 "xfs_metadump and\nreport it to xfs@oss.sgi.com.\n");
699 libxfs_fs_cmn_err(int level
, xfs_mount_t
*mp
, char *fmt
, ...)
704 vfprintf(stderr
, fmt
, ap
);
710 cmn_err(int level
, char *fmt
, ...)
715 vfprintf(stderr
, fmt
, ap
);
721 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
722 * values, and omit the stack trace unless the error level is tuned high.
728 xfs_alert(NULL
, "Metadata %s detected at block 0x%llx/0x%x",
729 bp
->b_error
== -EFSBADCRC
? "CRC error" : "corruption",
730 bp
->b_bn
, BBTOB(bp
->b_length
));
735 struct xfs_mount
*mp
,