#include "libfrog/util.h"
#include "libxfs.h"
#include <ctype.h>
+#include <linux/blkzoned.h>
+#include "libxfs/xfs_zones.h"
#include "xfs_multidisk.h"
#include "libxcmd.h"
#include "libfrog/fsgeom.h"
R_RGCOUNT,
R_RGSIZE,
R_CONCURRENCY,
+ R_ZONED,
+ R_START,
+ R_RESERVED,
R_MAX_OPTS,
};
[R_RGCOUNT] = "rgcount",
[R_RGSIZE] = "rgsize",
[R_CONCURRENCY] = "concurrency",
+ [R_ZONED] = "zoned",
+ [R_START] = "start",
+ [R_RESERVED] = "reserved",
[R_MAX_OPTS] = NULL,
},
.subopt_params = {
.maxval = INT_MAX,
.defaultval = 1,
},
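+	/*
+	 * zoned=0|1 toggles zoned RT mode; as with other boolean suboptions,
+	 * a bare "zoned" takes the defaultval of 1.
+	 */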
+ { .index = R_ZONED,
+ .conflicts = { { &ropts, R_EXTSIZE },
+ { NULL, LAST_CONFLICT } },
+ .minval = 0,
+ .maxval = 1,
+ .defaultval = 1,
+ },
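+	/* start/reserved take byte values; convert=true accepts unit suffixes */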
+ { .index = R_START,
+ .conflicts = { { &ropts, R_DEV },
+ { NULL, LAST_CONFLICT } },
+ .convert = true,
+ .minval = 0,
+ .maxval = LLONG_MAX,
+ .defaultval = SUBOPT_NEEDS_VAL,
+ },
+ { .index = R_RESERVED,
+ .conflicts = { { NULL, LAST_CONFLICT } },
+ .convert = true,
+ .minval = 0,
+ .maxval = LLONG_MAX,
+ .defaultval = SUBOPT_NEEDS_VAL,
+ },
},
};
bool nortalign;
bool nrext64;
bool exchrange; /* XFS_SB_FEAT_INCOMPAT_EXCHRANGE */
+ bool zoned;
+ bool zone_gaps;
uint16_t qflags;
};
char *lsu;
char *rtextsize;
char *rtsize;
+	char		*rtstart;	/* unparsed CLI value */
+	uint64_t	rtreserved;	/* in bytes */
/* parameters where 0 is a valid CLI value */
int dsunit;
char *label;
struct sb_feat_args sb_feat;
+	uint64_t	rtstart;	/* in 512-byte sectors */
+	uint64_t	rtreserved;	/* in bytes */
};
/*
/* prototype file */ [-p fname]\n\
/* quiet */ [-q]\n\
/* realtime subvol */ [-r extsize=num,size=num,rtdev=xxx,rgcount=n,rgsize=n,\n\
- concurrency=num]\n\
+ concurrency=num,zoned=0|1,start=n,reserved=n]\n\
/* sectorsize */ [-s size=num]\n\
/* version */ [-V]\n\
devicename\n\
printf("Done.\n");
}
+static void
+reset_zones(
+ struct mkfs_params *cfg,
+ int fd,
+ uint64_t start_sector,
+ uint64_t nsectors,
+ int quiet)
+{
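+	/* BLKRESETZONE ranges are expressed in 512-byte sectors */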
+ struct blk_zone_range range = {
+ .sector = start_sector,
+ .nr_sectors = nsectors,
+ };
+
+ if (!quiet) {
+ printf("Resetting zones...");
+ fflush(stdout);
+ }
+
+ if (ioctl(fd, BLKRESETZONE, &range) < 0) {
+ if (!quiet)
+ printf(" FAILED (%d)\n", -errno);
+ exit(1);
+ }
+
+ if (!quiet)
+ printf("Done.\n");
+}
+
static __attribute__((noreturn)) void
illegal_option(
const char *value,
case R_CONCURRENCY:
set_rtvol_concurrency(opts, subopt, cli, value);
break;
+ case R_ZONED:
+ cli->sb_feat.zoned = getnum(value, opts, subopt);
+ break;
+ case R_START:
+ cli->rtstart = getstr(value, opts, subopt);
+ break;
+ case R_RESERVED:
+ cli->rtreserved = getnum(value, opts, subopt);
+ break;
default:
return -EINVAL;
}
_("log stripe unit specified, using v2 logs\n"));
cli->sb_feat.log_version = 2;
}
+}
+
+struct zone_info {
+ /* number of zones, conventional or sequential */
+ unsigned int nr_zones;
+ /* number of conventional zones */
+ unsigned int nr_conv_zones;
+
+ /* size of the address space for a zone, in 512b blocks */
+ xfs_daddr_t zone_size;
+ /* write capacity of a zone, in 512b blocks */
+ xfs_daddr_t zone_capacity;
+};
+
+struct zone_topology {
+ struct zone_info data;
+ struct zone_info rt;
+ struct zone_info log;
+};
+
+/* arbitrary batch size that allows efficient processing */
+#define ZONES_PER_IOCTL 16384
+
+static void
+report_zones(
+ const char *name,
+ struct zone_info *zi)
+{
+ struct blk_zone_report *rep;
+ bool found_seq = false;
+ int fd, ret = 0;
+ uint64_t device_size;
+ uint64_t sector = 0;
+ size_t rep_size;
+	unsigned int i, n = 0;
+	uint32_t zone_size = 0;
+ struct stat st;
+
+ fd = open(name, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, _("Failed to open RT device: %d.\n"), -errno);
+ exit(1);
+ }
+
+ if (fstat(fd, &st) < 0) {
+ ret = -EIO;
+ goto out_close;
+ }
+ if (!S_ISBLK(st.st_mode))
+ goto out_close;
+
+ if (ioctl(fd, BLKGETSIZE64, &device_size)) {
+ fprintf(stderr, _("Failed to get block size: %d.\n"), -errno);
+ exit(1);
+ }
+	/* BLKGETZONESZ reports a 32-bit sector count */
+	if (ioctl(fd, BLKGETZONESZ, &zone_size) || !zone_size)
+		goto out_close; /* not zoned */
+	zi->zone_size = zone_size;
+
+ /* BLKGETSIZE64 reports a byte value */
+ device_size = BTOBB(device_size);
+ zi->nr_zones = device_size / zi->zone_size;
+ zi->nr_conv_zones = 0;
+
+ rep_size = sizeof(struct blk_zone_report) +
+ sizeof(struct blk_zone) * ZONES_PER_IOCTL;
+ rep = malloc(rep_size);
+ if (!rep) {
+ fprintf(stderr,
+_("Failed to allocate memory for zone reporting.\n"));
+ exit(1);
+ }
+
+ while (n < zi->nr_zones) {
+ struct blk_zone *zones = (struct blk_zone *)(rep + 1);
+
+ memset(rep, 0, rep_size);
+ rep->sector = sector;
+ rep->nr_zones = ZONES_PER_IOCTL;
+
+ ret = ioctl(fd, BLKREPORTZONE, rep);
+ if (ret) {
+ fprintf(stderr,
+_("ioctl(BLKREPORTZONE) failed: %d!\n"), -errno);
+ exit(1);
+ }
+ if (!rep->nr_zones)
+ break;
+
+ for (i = 0; i < rep->nr_zones; i++) {
+ if (n >= zi->nr_zones)
+ break;
+
+ if (zones[i].len != zi->zone_size) {
+ fprintf(stderr,
+_("Inconsistent zone size!\n"));
+ exit(1);
+ }
+
+ switch (zones[i].type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ /*
+ * We can only use the conventional space at the
+ * start of the device for metadata, so don't
+ * count later conventional zones. This is
+ * not an error because we can use them for data
+ * just fine.
+ */
+ if (!found_seq)
+ zi->nr_conv_zones++;
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ found_seq = true;
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ fprintf(stderr,
+_("Sequential write preferred zones not supported.\n"));
+ exit(1);
+ default:
+ fprintf(stderr,
+_("Unknown zone type (0x%x) found.\n"), zones[i].type);
+ exit(1);
+ }
+
+ if (!n) {
+ zi->zone_capacity = zones[i].capacity;
+ if (zi->zone_capacity > zi->zone_size) {
+ fprintf(stderr,
+_("Zone capacity larger than zone size!\n"));
+ exit(1);
+ }
+ } else if (zones[i].capacity != zi->zone_capacity) {
+ fprintf(stderr,
+_("Inconsistent zone capacity!\n"));
+ exit(1);
+ }
+
+ n++;
+ }
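+		/* resume the next report right after the last returned zone */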
+ sector = zones[rep->nr_zones - 1].start +
+ zones[rep->nr_zones - 1].len;
+ }
+
+ free(rep);
+out_close:
+ close(fd);
+}
+
+static void
+validate_zoned(
+ struct mkfs_params *cfg,
+ struct cli_params *cli,
+ struct mkfs_default_params *dft,
+ struct zone_topology *zt)
+{
+ if (!cli->xi->data.isfile) {
+ report_zones(cli->xi->data.name, &zt->data);
+ if (zt->data.nr_zones) {
+ if (!zt->data.nr_conv_zones) {
+ fprintf(stderr,
+_("Data devices requires conventional zones.\n"));
+ usage();
+ }
+ if (zt->data.zone_capacity != zt->data.zone_size) {
+ fprintf(stderr,
+_("Zone capacity equal to Zone size required for conventional zones.\n"));
+ usage();
+ }
+
+ cli->sb_feat.zoned = true;
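+			/*
+			 * The RT section starts right after the conventional
+			 * zones, in 512-byte sectors.
+			 */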
+ cfg->rtstart =
+ zt->data.nr_conv_zones * zt->data.zone_capacity;
+ }
+ }
+
+ if (cli->xi->rt.name && !cli->xi->rt.isfile) {
+ report_zones(cli->xi->rt.name, &zt->rt);
+ if (zt->rt.nr_zones && !cli->sb_feat.zoned)
+ cli->sb_feat.zoned = true;
+ if (zt->rt.zone_size != zt->rt.zone_capacity)
+ cli->sb_feat.zone_gaps = true;
+ }
+
+ if (cli->xi->log.name && !cli->xi->log.isfile) {
+ report_zones(cli->xi->log.name, &zt->log);
+ if (zt->log.nr_zones) {
+ fprintf(stderr,
+_("Zoned devices not supported as log device!\n"));
+ usage();
+ }
+ }
+ if (cli->rtstart) {
+		/*
+		 * For zoned devices with conventional zones, cfg->rtstart was
+		 * set above to the start of the first sequential write
+		 * required zone.  Don't allow the user to override it, as
+		 * that cannot work.
+		 */
+ if (cfg->rtstart) {
+ fprintf(stderr,
+_("rtstart override not allowed on zoned devices.\n"));
+ usage();
+ }
+		cfg->rtstart = getnum(cli->rtstart, &ropts, R_START) / 512; /* bytes to 512b sectors */
+ }
+
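+	/* rtreserved was already converted to bytes by getnum() */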
+ if (cli->rtreserved)
+ cfg->rtreserved = cli->rtreserved;
}
/*
cli->sb_feat.inobtcnt = false;
}
- if (cli->xi->rt.name) {
+ if (cli->sb_feat.zoned) {
+ if (!cli->sb_feat.metadir) {
+ if (cli_opt_set(&mopts, M_METADIR)) {
+ fprintf(stderr,
+_("zoned realtime device not supported without metadir support\n"));
+ usage();
+ }
+ cli->sb_feat.metadir = true;
+ }
+ if (cli->rtextsize) {
+ if (cli_opt_set(&ropts, R_EXTSIZE)) {
+ fprintf(stderr,
+_("rt extent size not supported on realtime devices with zoned mode\n"));
+ usage();
+ }
+			cli->rtextsize = NULL;
+ }
+ } else {
+ if (cli->rtstart) {
+ fprintf(stderr,
+_("internal RT section only supported in zoned mode\n"));
+ usage();
+ }
+ if (cli->rtreserved) {
+ fprintf(stderr,
+_("reserved RT blocks only supported in zoned mode\n"));
+ usage();
+ }
+ }
+
+ if (cli->xi->rt.name || cfg->rtstart) {
if (cli->rtextsize && cli->sb_feat.reflink) {
if (cli_opt_set(&mopts, M_REFLINK)) {
fprintf(stderr,
usage();
}
cfg->rtextblocks = 1;
+ } else if (cli->sb_feat.zoned) {
+		/* zoned mode only supports an rtextsize of 1 */
+ cfg->rtextblocks = 1;
} else {
/*
* If realtime extsize has not been specified by the user,
static void
open_devices(
struct mkfs_params *cfg,
- struct libxfs_init *xi)
+ struct libxfs_init *xi,
+ struct zone_topology *zt)
{
uint64_t sector_mask;
usage();
}
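+	/*
+	 * With an internal RT section on a zoned data device, the RT
+	 * zones inherit the data device's zone geometry.
+	 */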
+ if (zt->data.nr_zones) {
+ zt->rt.zone_size = zt->data.zone_size;
+ zt->rt.zone_capacity = zt->data.zone_capacity;
+ zt->rt.nr_zones = zt->data.nr_zones - zt->data.nr_conv_zones;
+ } else if (cfg->sb_feat.zoned && !cfg->rtstart && !xi->rt.dev) {
+		/*
+		 * By default reserve 1% of the total capacity (rounded up to
+		 * the next power of two) for metadata, but no less than the
+		 * minimum we enforce elsewhere.  This matches what SMR HDDs
+		 * provide.
+		 */
+ uint64_t rt_target_size = max((xi->data.size + 99) / 100,
+ BTOBB(300 * 1024 * 1024));
+
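+		/* round the data section size up to the next power of two */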
+ cfg->rtstart = 1;
+ while (cfg->rtstart < rt_target_size)
+ cfg->rtstart <<= 1;
+ }
+
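+	/*
+	 * With an internal RT section, the start of the device up to
+	 * rtstart backs the data device and the remainder becomes the
+	 * RT device.
+	 */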
+ if (cfg->rtstart) {
+ if (cfg->rtstart >= xi->data.size) {
+ fprintf(stderr,
+ _("device size %lld too small for zoned allocator\n"), xi->data.size);
+ usage();
+ }
+ xi->rt.size = xi->data.size - cfg->rtstart;
+ xi->data.size = cfg->rtstart;
+ }
+
/*
* Ok, Linux only has a 1024-byte resolution on device _size_,
* and the sizes below are in basic 512-byte blocks,
static void
discard_devices(
+ struct mkfs_params *cfg,
struct libxfs_init *xi,
+ struct zone_topology *zt,
int quiet)
{
/*
* This function has to be called after libxfs has been initialized.
*/
- if (!xi->data.isfile)
- discard_blocks(xi->data.fd, xi->data.size, quiet);
- if (xi->rt.dev && !xi->rt.isfile)
- discard_blocks(xi->rt.fd, xi->rt.size, quiet);
+ if (!xi->data.isfile) {
+ uint64_t nsectors = xi->data.size;
+
+ if (cfg->rtstart && zt->data.nr_zones) {
+			/*
+			 * Note that the zone reset here also covers the LBA
+			 * range of the data device.
+			 *
+			 * This is because a single reset spanning the entire
+			 * device (which the kernel turns into a "reset all"
+			 * operation for a full device range) is a lot faster
+			 * than resetting each zone individually, and resetting
+			 * the conventional zones used for the data device is a
+			 * no-op anyway.
+			 */
+ reset_zones(cfg, xi->data.fd, 0,
+ cfg->rtstart + xi->rt.size, quiet);
+ nsectors -= cfg->rtstart;
+ }
+ discard_blocks(xi->data.fd, nsectors, quiet);
+ }
+ if (xi->rt.dev && !xi->rt.isfile) {
+ if (zt->rt.nr_zones)
+ reset_zones(cfg, xi->rt.fd, 0, xi->rt.size, quiet);
+ else
+ discard_blocks(xi->rt.fd, xi->rt.size, quiet);
+ }
if (xi->log.dev && xi->log.dev != xi->data.dev && !xi->log.isfile)
discard_blocks(xi->log.fd, xi->log.size, quiet);
}
static void
validate_rtdev(
struct mkfs_params *cfg,
- struct cli_params *cli)
+ struct cli_params *cli,
+ struct zone_topology *zt)
{
struct libxfs_init *xi = cli->xi;
- if (!xi->rt.dev) {
+ if (!xi->rt.dev && !cfg->rtstart) {
if (cli->rtsize) {
fprintf(stderr,
_("size specified for non-existent rt subvolume\n"));
if (cli->rtsize) {
if (cfg->rtblocks > DTOBT(xi->rt.size, cfg->blocklog)) {
fprintf(stderr,
-_("size %s specified for rt subvolume is too large, maxi->um is %lld blocks\n"),
+_("size %s specified for rt subvolume is too large, maximum is %lld blocks\n"),
cli->rtsize,
(long long)DTOBT(xi->rt.size, cfg->blocklog));
usage();
reported by the device (%u).\n"),
cfg->sectorsize, xi->rt.bsize);
}
+ } else if (zt->rt.nr_zones) {
+ cfg->rtblocks = DTOBT(zt->rt.nr_zones * zt->rt.zone_capacity,
+ cfg->blocklog);
} else {
/* grab volume size */
cfg->rtblocks = DTOBT(xi->rt.size, cfg->blocklog);
NBBY * (cfg->blocksize - sizeof(struct xfs_rtbuf_blkinfo)));
}
+static void
+calculate_zone_geometry(
+ struct mkfs_params *cfg,
+ struct cli_params *cli,
+ struct libxfs_init *xi,
+ struct zone_topology *zt)
+{
+ if (cfg->rtblocks == 0) {
+ fprintf(stderr,
+_("empty zoned realtime device not supported.\n"));
+ usage();
+ }
+
+	if (zt->rt.nr_zones) {
+		/* The RT device has hardware zones */
+		if (cli->rgsize) {
+			fprintf(stderr,
+_("rgsize (%s) may not be specified when the rt device is zoned\n"),
+				cli->rgsize);
+			usage();
+		}
+
+		cfg->rgsize = zt->rt.zone_capacity * 512;
+		if (cfg->rgsize % cfg->blocksize) {
+			fprintf(stderr,
+_("zone capacity (%llu bytes) not a multiple of fs blk size (%d)\n"),
+				(unsigned long long)cfg->rgsize,
+				cfg->blocksize);
+			usage();
+		}
+
+		cfg->rgsize /= cfg->blocksize;
+		cfg->rgcount = howmany(cfg->rtblocks, cfg->rgsize);
+
+ if (cli->rgcount > cfg->rgcount) {
+ fprintf(stderr,
+_("rgcount (%llu) is larger than hardware zone count (%llu)\n"),
+ (unsigned long long)cli->rgcount,
+ (unsigned long long)cfg->rgcount);
+ usage();
+ } else if (cli->rgcount && cli->rgcount < cfg->rgcount) {
+ /* constrain the rt device to the given rgcount */
+ cfg->rgcount = cli->rgcount;
+ }
+ } else {
+ /* No hardware zones */
+ if (cli->rgsize) {
+ /* User-specified rtgroup size */
+ cfg->rgsize = getnum(cli->rgsize, &ropts, R_RGSIZE);
+
+			/* Check the specified rgsize is a multiple of the blocksize. */
+ if (cfg->rgsize % cfg->blocksize) {
+ fprintf(stderr,
+_("rgsize (%s) not a multiple of fs blk size (%d)\n"),
+ cli->rgsize, cfg->blocksize);
+ usage();
+ }
+ cfg->rgsize /= cfg->blocksize;
+			cfg->rgcount = howmany(cfg->rtblocks, cfg->rgsize);
+ } else if (cli->rgcount) {
+ /* User-specified rtgroup count */
+ cfg->rgcount = cli->rgcount;
+			cfg->rgsize = howmany(cfg->rtblocks, cfg->rgcount);
+ } else {
+ /* 256MB zones just like typical SMR HDDs */
+ cfg->rgsize = MEGABYTES(256, cfg->blocklog);
+			cfg->rgcount = howmany(cfg->rtblocks, cfg->rgsize);
+ }
+ }
+
+ if (cfg->rgcount < XFS_MIN_ZONES) {
+ fprintf(stderr,
+_("realtime group count (%llu) must be greater than the minimum zone count (%u)\n"),
+ (unsigned long long)cfg->rgcount,
+ XFS_MIN_ZONES);
+ usage();
+ }
+
+ validate_rtgroup_geometry(cfg);
+
+ /* Zoned RT devices don't use the rtbitmap, and have no bitmap blocks */
+ cfg->rtbmblocks = 0;
+}
+
static void
calculate_imaxpct(
struct mkfs_params *cfg,
sbp->sb_rgblklog = libxfs_compute_rgblklog(sbp->sb_rgextents,
cfg->rtextblocks);
}
+
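+	/*
+	 * sb_rtstart and sb_rtreserved are stored in filesystem blocks;
+	 * cfg->rtstart is in 512-byte sectors, cfg->rtreserved in bytes.
+	 */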
+ if (fp->zoned) {
+ sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_ZONED;
+ sbp->sb_rtstart = (cfg->rtstart * 512) / cfg->blocksize;
+ sbp->sb_rtreserved = cfg->rtreserved / cfg->blocksize;
+ }
+ if (fp->zone_gaps)
+ sbp->sb_features_incompat |= XFS_SB_FEAT_INCOMPAT_ZONE_GAPS;
}
/*
(xfs_extlen_t)XFS_FSB_TO_BB(mp, cfg->logblocks),
&sbp->sb_uuid, cfg->sb_feat.log_version,
lsunit, XLOG_FMT, XLOG_INIT_CYCLE, false);
-
/* finally, check we can write the last block in the realtime area */
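+	/*
+	 * Skip the check for zoned RT devices, where an out of order write
+	 * to the last block would fail, and for an internal RT section that
+	 * shares the data device.
+	 */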
- if (mp->m_rtdev_targp->bt_bdev && cfg->rtblocks > 0) {
+ if (mp->m_rtdev_targp->bt_bdev &&
+ mp->m_rtdev_targp != mp->m_ddev_targp &&
+ cfg->rtblocks > 0 &&
+ !xfs_has_zoned(mp)) {
buf = alloc_write_buf(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, cfg->rtblocks - 1LL),
BTOBB(cfg->blocksize));
*/
},
};
-
+ struct zone_topology zt = {};
struct list_head buffer_list;
int error;
sectorsize = cfg.sectorsize;
validate_log_sectorsize(&cfg, &cli, &dft, &ft);
+ validate_zoned(&cfg, &cli, &dft, &zt);
validate_sb_features(&cfg, &cli);
/*
/*
* Open and validate the device configurations
*/
- open_devices(&cfg, &xi);
+ open_devices(&cfg, &xi, &zt);
validate_overwrite(xi.data.name, force_overwrite);
validate_datadev(&cfg, &cli);
validate_logdev(&cfg, &cli);
- validate_rtdev(&cfg, &cli);
+ validate_rtdev(&cfg, &cli, &zt);
calc_stripe_factors(&cfg, &cli, &ft);
/*
*/
calculate_initial_ag_geometry(&cfg, &cli, &xi);
align_ag_geometry(&cfg);
- calculate_rtgroup_geometry(&cfg, &cli, &xi);
+ if (cfg.sb_feat.zoned)
+ calculate_zone_geometry(&cfg, &cli, &xi, &zt);
+ else
+ calculate_rtgroup_geometry(&cfg, &cli, &xi);
calculate_imaxpct(&cfg, &cli);
/*
* All values have been validated, discard the old device layout.
*/
+ if (cli.sb_feat.zoned && !discard) {
+ fprintf(stderr,
+ _("-K not support for zoned file systems.\n"));
+ return 1;
+ }
if (discard && !dry_run)
- discard_devices(&xi, quiet);
+ discard_devices(&cfg, &xi, &zt, quiet);
/*
* we need the libxfs buffer cache from here on in.