static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
-/*
- * iterates through all used mddevs in the system.
- * We take care to grab the all_mddevs_lock whenever navigating
- * the list, and to always hold a refcount when unlocked.
- * Any code which breaks out of this loop while own
- * a reference to the current mddev and must mddev_put it.
- */
-#define for_each_mddev(_mddev,_tmp) \
- \
- for (({ spin_lock(&all_mddevs_lock); \
- _tmp = all_mddevs.next; \
- _mddev = NULL;}); \
- ({ if (_tmp != &all_mddevs) \
- mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
- spin_unlock(&all_mddevs_lock); \
- if (_mddev) mddev_put(_mddev); \
- _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
- _tmp != &all_mddevs;}); \
- ({ spin_lock(&all_mddevs_lock); \
- _tmp = _tmp->next;}) \
- )
-
/* Rather than calling directly into the personality make_request function,
* IO requests come here first so that we can check if the device is
* being suspended pending a reconfiguration.
void mddev_init(struct mddev *mddev)
{
- kobject_init(&mddev->kobj, &md_ktype);
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->bitmap_info.mutex);
return ERR_PTR(error);
}
+static void mddev_free(struct mddev *mddev)
+{
+ spin_lock(&all_mddevs_lock);
+ list_del(&mddev->all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+
+ kfree(mddev);
+}
+
static const struct attribute_group md_redundancy_group;
void mddev_unlock(struct mddev *mddev)
}
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
- struct page *page, int op, int op_flags, bool metadata_op)
+ struct page *page, blk_opf_t opf, bool metadata_op)
{
struct bio bio;
struct bio_vec bvec;
if (metadata_op && rdev->meta_bdev)
- bio_init(&bio, rdev->meta_bdev, &bvec, 1, op | op_flags);
+ bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
else
- bio_init(&bio, rdev->bdev, &bvec, 1, op | op_flags);
+ bio_init(&bio, rdev->bdev, &bvec, 1, opf);
if (metadata_op)
bio.bi_iter.bi_sector = sector + rdev->sb_start;
if (rdev->sb_loaded)
return 0;
- if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
goto fail;
rdev->sb_loaded = 1;
return 0;
return -EINVAL;
bb_sector = (long long)offset;
if (!sync_page_io(rdev, bb_sector, sectors << 9,
- rdev->bb_page, REQ_OP_READ, 0, true))
+ rdev->bb_page, REQ_OP_READ, true))
return -EIO;
bbp = (__le64 *)page_address(rdev->bb_page);
rdev->badblocks.shift = sb->bblog_shift;
mdname(mddev), mddev->max_disks);
return -EBUSY;
}
- bdevname(rdev->bdev,b);
+ snprintf(b, sizeof(b), "%pg", rdev->bdev);
strreplace(b, '/', '!');
rdev->mddev = mddev;
return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}
-static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
+static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
{
/* check if two start/length pairs overlap */
- if (s1+l1 <= s2)
- return 0;
- if (s2+l2 <= s1)
- return 0;
- return 1;
+ if (a->data_offset + a->sectors <= b->data_offset)
+ return false;
+ if (b->data_offset + b->sectors <= a->data_offset)
+ return false;
+ return true;
+}
+
+static bool md_rdev_overlaps(struct md_rdev *rdev)
+{
+ struct mddev *mddev;
+ struct md_rdev *rdev2;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
+ rdev_for_each(rdev2, mddev) {
+ if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
+ md_rdevs_overlap(rdev, rdev2)) {
+ spin_unlock(&all_mddevs_lock);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&all_mddevs_lock);
+ return false;
}
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
return -EINVAL; /* component must fit device */
rdev->sectors = sectors;
- if (sectors > oldsectors && my_mddev->external) {
- /* Need to check that all other rdevs with the same
- * ->bdev do not overlap. 'rcu' is sufficient to walk
- * the rdev lists safely.
- * This check does not provide a hard guarantee, it
- * just helps avoid dangerous mistakes.
- */
- struct mddev *mddev;
- int overlap = 0;
- struct list_head *tmp;
- rcu_read_lock();
- for_each_mddev(mddev, tmp) {
- struct md_rdev *rdev2;
-
- rdev_for_each(rdev2, mddev)
- if (rdev->bdev == rdev2->bdev &&
- rdev != rdev2 &&
- overlaps(rdev->data_offset, rdev->sectors,
- rdev2->data_offset,
- rdev2->sectors)) {
- overlap = 1;
- break;
- }
- if (overlap) {
- mddev_put(mddev);
- break;
- }
- }
- rcu_read_unlock();
- if (overlap) {
- /* Someone else could have slipped in a size
- * change here, but doing so is just silly.
- * We put oldsectors back because we *know* it is
- * safe, and trust userspace not to race with
- * itself
- */
- rdev->sectors = oldsectors;
- return -EBUSY;
- }
+ /*
+ * Check that all other rdevs with the same bdev do not overlap. This
+ * check does not provide a hard guarantee, it just helps avoid
+ * dangerous mistakes.
+ */
+ if (sectors > oldsectors && my_mddev->external &&
+ md_rdev_overlaps(rdev)) {
+ /*
+ * Someone else could have slipped in a size change here, but
+ * doing so is just silly. We put oldsectors back because we
+ * know it is safe, and trust userspace not to race with itself.
+ */
+ rdev->sectors = oldsectors;
+ return -EBUSY;
}
return len;
}
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
+ sector_t save_rp = mddev->reshape_position;
+
+ mddev_unlock(mddev);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
+ mddev_lock_nointr(mddev);
+ /*
+ * set RECOVERY_INTR again and restore reshape
+ * position in case others changed them after
+ * got lock, eg, reshape_position_store and
+ * md_check_recovery.
+ */
+ mddev->reshape_position = save_rp;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
sync_speed_show(struct mddev *mddev, char *page)
{
unsigned long resync, dt, db;
- if (mddev->curr_resync == 0)
+ if (mddev->curr_resync == MD_RESYNC_NONE)
return sprintf(page, "none\n");
resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
dt = (jiffies - mddev->resync_mark) / HZ;
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
- if (mddev->curr_resync == 1 ||
- mddev->curr_resync == 2)
+ if (mddev->curr_resync == MD_RESYNC_YIELDED ||
+ mddev->curr_resync == MD_RESYNC_DELAYED)
return sprintf(page, "delayed\n");
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
return rv;
}
-static void md_free(struct kobject *ko)
+static void md_kobj_release(struct kobject *ko)
{
struct mddev *mddev = container_of(ko, struct mddev, kobj);
if (mddev->sysfs_level)
sysfs_put(mddev->sysfs_level);
- if (mddev->gendisk) {
- del_gendisk(mddev->gendisk);
- blk_cleanup_disk(mddev->gendisk);
- }
- percpu_ref_exit(&mddev->writes_pending);
-
- bioset_exit(&mddev->bio_set);
- bioset_exit(&mddev->sync_set);
- kfree(mddev);
+ del_gendisk(mddev->gendisk);
+ put_disk(mddev->gendisk);
}
static const struct sysfs_ops md_sysfs_ops = {
.store = md_attr_store,
};
static struct kobj_type md_ktype = {
- .release = md_free,
+ .release = md_kobj_release,
.sysfs_ops = &md_sysfs_ops,
.default_groups = md_attr_groups,
};
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
- kobject_del(&mddev->kobj);
kobject_put(&mddev->kobj);
}
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
-static int md_alloc(dev_t dev, char *name)
+int md_alloc(dev_t dev, char *name)
{
/*
* If dev is zero, name is the name of a device to allocate with
mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev);
if (IS_ERR(mddev)) {
- mutex_unlock(&disks_mutex);
- return PTR_ERR(mddev);
+ error = PTR_ERR(mddev);
+ goto out_unlock;
}
partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
strcmp(mddev2->gendisk->disk_name, name) == 0) {
spin_unlock(&all_mddevs_lock);
error = -EEXIST;
- goto out_unlock_disks_mutex;
+ goto out_free_mddev;
}
spin_unlock(&all_mddevs_lock);
}
error = -ENOMEM;
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
- goto out_unlock_disks_mutex;
+ goto out_free_mddev;
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
mddev->gendisk = disk;
error = add_disk(disk);
if (error)
- goto out_cleanup_disk;
+ goto out_put_disk;
+ kobject_init(&mddev->kobj, &md_ktype);
error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
- if (error)
- goto out_del_gendisk;
+ if (error) {
+ /*
+ * The disk is already live at this point. Clear the hold flag
+ * and let mddev_put take care of the deletion, as it isn't any
+ * different from a normal close on last release now.
+ */
+ mddev->hold_active = 0;
+ goto done;
+ }
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
- goto out_unlock_disks_mutex;
-out_del_gendisk:
- del_gendisk(disk);
-out_cleanup_disk:
- blk_cleanup_disk(disk);
-out_unlock_disks_mutex:
+done:
mutex_unlock(&disks_mutex);
mddev_put(mddev);
return error;
+
+out_put_disk:
+ put_disk(disk);
+out_free_mddev:
+ mddev_free(mddev);
+out_unlock:
+ mutex_unlock(&disks_mutex);
+ return error;
}
static void md_probe(dev_t dev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
return ret;
}
+static void md_free_disk(struct gendisk *disk)
+{
+ struct mddev *mddev = disk->private_data;
+
+ percpu_ref_exit(&mddev->writes_pending);
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
+
+ kfree(mddev);
+}
+
const struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
.getgeo = md_getgeo,
.check_events = md_check_events,
.set_read_only = md_set_read_only,
+ .free_disk = md_free_disk,
};
static int md_thread(void *arg)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync;
- if (resync <= 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
- } else if (resync > max_sectors)
+ } else if (resync > max_sectors) {
resync = max_sectors;
- else
+ } else {
resync -= atomic_read(&mddev->recovery_active);
+ if (resync < MD_RESYNC_ACTIVE) {
+ /*
+ * Resync has started, but the subtraction has
+ * yielded one of the special values. Force it
+ * to active to ensure the status reports an
+ * active resync.
+ */
+ resync = MD_RESYNC_ACTIVE;
+ }
+ }
- if (resync == 0) {
+ if (resync == MD_RESYNC_NONE) {
if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
struct md_rdev *rdev;
}
return 0;
}
- if (resync < 3) {
+ if (resync < MD_RESYNC_ACTIVE) {
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
unsigned long update_time;
sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
- struct list_head *tmp;
sector_t last_check;
int skipped = 0;
struct md_rdev *rdev;
mddev->last_sync_action = action ?: desc;
- /* we overload curr_resync somewhat here.
- * 0 == not engaged in resync at all
- * 2 == checking that there is no conflict with another sync
- * 1 == like 2, but have yielded to allow conflicting resync to
- * commence
- * other == active in resync - this many blocks
- *
+ /*
* Before starting a resync we must have set curr_resync to
* 2, and then checked that every "conflicting" array has curr_resync
* less than ours. When we find one that is the same or higher
do {
int mddev2_minor = -1;
- mddev->curr_resync = 2;
+ mddev->curr_resync = MD_RESYNC_DELAYED;
try_again:
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip;
- for_each_mddev(mddev2, tmp) {
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
if (mddev2 == mddev)
continue;
if (!mddev->parallel_resync
&& mddev2->curr_resync
&& match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq);
- if (mddev < mddev2 && mddev->curr_resync == 2) {
+ if (mddev < mddev2 &&
+ mddev->curr_resync == MD_RESYNC_DELAYED) {
/* arbitrarily yield */
- mddev->curr_resync = 1;
+ mddev->curr_resync = MD_RESYNC_YIELDED;
wake_up(&resync_wait);
}
- if (mddev > mddev2 && mddev->curr_resync == 1)
+ if (mddev > mddev2 &&
+ mddev->curr_resync == MD_RESYNC_YIELDED)
/* no need to wait here, we can wait the next
* time 'round when curr_resync == 2
*/
desc, mdname(mddev),
mdname(mddev2));
}
- mddev_put(mddev2);
+ spin_unlock(&all_mddevs_lock);
+
if (signal_pending(current))
flush_signals(current);
schedule();
finish_wait(&resync_wait, &wq);
}
}
- } while (mddev->curr_resync < 2);
+ spin_unlock(&all_mddevs_lock);
+ } while (mddev->curr_resync < MD_RESYNC_DELAYED);
j = 0;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
desc, mdname(mddev));
mddev->curr_resync = j;
} else
- mddev->curr_resync = 3; /* no longer delayed */
+ mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
mddev->curr_resync_completed = j;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event();
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- mddev->curr_resync > 3) {
+ mddev->curr_resync >= MD_RESYNC_ACTIVE) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
- mddev->curr_resync > 3) {
+ mddev->curr_resync >= MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->curr_resync >= mddev->recovery_cp) {
} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
mddev->resync_min = mddev->curr_resync_completed;
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
- mddev->curr_resync = 0;
+ mddev->curr_resync = MD_RESYNC_NONE;
spin_unlock(&mddev->lock);
wake_up(&resync_wait);
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
if (mddev->sync_thread) {
+ md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
goto unlock;
}
sector_t old_dev_sectors = mddev->dev_sectors;
bool is_reshaped = false;
- /* resync has finished, collect result */
- md_unregister_thread(&mddev->sync_thread);
+ /* sync_thread should be unregistered, collect result */
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) {
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_completed);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
if (mddev->event_work.func)
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
- struct list_head *tmp;
- struct mddev *mddev;
+ struct mddev *mddev, *n;
int need_delay = 0;
- for_each_mddev(mddev, tmp) {
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
if (mddev_trylock(mddev)) {
if (mddev->pers)
__md_stop_writes(mddev);
mddev_unlock(mddev);
}
need_delay = 1;
+ mddev_put(mddev);
+ spin_lock(&all_mddevs_lock);
}
+ spin_unlock(&all_mddevs_lock);
+
/*
* certain more exotic SCSI devices are known to be
* volatile wrt too early system reboots. While the
static __exit void md_exit(void)
{
- struct mddev *mddev;
- struct list_head *tmp;
+ struct mddev *mddev, *n;
int delay = 1;
unregister_blkdev(MD_MAJOR,"md");
}
remove_proc_entry("mdstat", NULL);
- for_each_mddev(mddev, tmp) {
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
export_array(mddev);
mddev->ctime = 0;
mddev->hold_active = 0;
/*
- * for_each_mddev() will call mddev_put() at the end of each
- * iteration. As the mddev is now fully clear, this will
- * schedule the mddev for destruction by a workqueue, and the
+ * As the mddev is now fully clear, mddev_put will schedule
+ * the mddev for destruction by a workqueue, and the
* destroy_workqueue() below will wait for that to complete.
*/
+ mddev_put(mddev);
+ spin_lock(&all_mddevs_lock);
}
+ spin_unlock(&all_mddevs_lock);
+
destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);