util.c

   1 /*
   2  * mdadm - manage Linux "md" devices aka RAID arrays.
   3  *
   4  * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
   5  *
   6  *
   7  *    This program is free software; you can redistribute it and/or modify
   8  *    it under the terms of the GNU General Public License as published by
   9  *    the Free Software Foundation; either version 2 of the License, or
  10  *    (at your option) any later version.
  11  *
  12  *    This program is distributed in the hope that it will be useful,
  13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *    GNU General Public License for more details.
  16  *
  17  *    You should have received a copy of the GNU General Public License
  18  *    along with this program; if not, write to the Free Software
  19  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  *    Author: Neil Brown
  22  *    Email: <neilb@suse.de>
  23  */
  24
  25 #include        "mdadm.h"
  26 #include        "md_p.h"
  27 #include        "xmalloc.h"
  28
  29 #include        <sys/socket.h>
  30 #include        <sys/utsname.h>
  31 #include        <sys/wait.h>
  32 #include        <sys/un.h>
  33 #include        <sys/resource.h>
  34 #include        <sys/vfs.h>
  35 #include        <sys/mman.h>
  36 #include        <linux/magic.h>
  37 #include        <poll.h>
  38 #include        <ctype.h>
  39 #include        <dirent.h>
  40 #include        <dlfcn.h>
  41 #include        <limits.h>
  42
  43 /*
  44  * following taken from linux/blkpg.h because they aren't
  45  * anywhere else and it isn't safe to #include linux/ * stuff.
  46  */
  47
  48 #define BLKPG      _IO(0x12,105)
  49
  50 /* The argument structure */
  51 struct blkpg_ioctl_arg {
  52         int op;
  53         int flags;
  54         int datalen;
  55         void *data;
  56 };
  57
  58 /* The subfunctions (for the op field) */
  59 #define BLKPG_ADD_PARTITION     1
  60 #define BLKPG_DEL_PARTITION     2
  61
  62 /* Sizes of name fields. Unused at present. */
  63 #define BLKPG_DEVNAMELTH        64
  64 #define BLKPG_VOLNAMELTH        64
  65
  66 /* The data structure for ADD_PARTITION and DEL_PARTITION */
  67 struct blkpg_partition {
  68         long long start;                /* starting offset in bytes */
  69         long long length;               /* length in bytes */
  70         int pno;                        /* partition number */
  71         char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2,
  72                                            to be used in kernel messages */
  73         char volname[BLKPG_VOLNAMELTH]; /* volume label */
  74 };
  75
  76 #include "part.h"
  77
  78 /* Force a compilation error if condition is true */
  79 #define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
  80
  81 /* Force a compilation error if condition is true, but also produce a
  82    result (of value 0 and type size_t), so the expression can be used
  83    e.g. in a structure initializer (or where-ever else comma expressions
  84    aren't permitted). */
  85 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
  86
  87 static int is_dlm_hooks_ready = 0;
  88
  89 int dlm_funs_ready(void)
  90 {
  91         return is_dlm_hooks_ready ? 1 : 0;
  92 }
  93
  94 static struct dlm_hooks *dlm_hooks = NULL;
  95 struct dlm_lock_resource *dlm_lock_res = NULL;
  96 static int ast_called = 0;
  97
  98 struct dlm_lock_resource {
  99         dlm_lshandle_t *ls;
 100         struct dlm_lksb lksb;
 101 };
 102
 103 /* Using poll(2) to wait for and dispatch ASTs */
 104 static int poll_for_ast(dlm_lshandle_t ls)
 105 {
 106         struct pollfd pfd;
 107
 108         pfd.fd = dlm_hooks->ls_get_fd(ls);
 109         pfd.events = POLLIN;
 110
 111         while (!ast_called)
 112         {
 113                 if (poll(&pfd, 1, 0) < 0)
 114                 {
 115                         perror("poll");
 116                         return -1;
 117                 }
 118                 dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls));
 119         }
 120         ast_called = 0;
 121
 122         return 0;
 123 }
 124
 125 static void dlm_ast(void *arg)
 126 {
 127         ast_called = 1;
 128 }
 129
 130 static char *cluster_name = NULL;
 131 /* Create the lockspace, take bitmapXXX locks on all the bitmaps. */
 132 int cluster_get_dlmlock(void)
 133 {
 134         int ret = -1;
 135         char str[64];
 136         int flags = LKF_NOQUEUE;
 137         int retry_count = 0;
 138
 139         if (!dlm_funs_ready()) {
 140                 pr_err("Something wrong with dlm library\n");
 141                 return -1;
 142         }
 143
 144         ret = get_cluster_name(&cluster_name);
 145         if (ret) {
 146                 pr_err("The md can't get cluster name\n");
 147                 return -1;
 148         }
 149
 150         dlm_lock_res = xmalloc(sizeof(struct dlm_lock_resource));
 151         dlm_lock_res->ls = dlm_hooks->open_lockspace(cluster_name);
 152         if (!dlm_lock_res->ls) {
 153                 dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR);
 154                 if (!dlm_lock_res->ls) {
 155                         pr_err("%s failed to create lockspace\n", cluster_name);
 156                         return -ENOMEM;
 157                 }
 158         } else {
 159                 pr_err("open existed %s lockspace\n", cluster_name);
 160         }
 161
 162         snprintf(str, 64, "bitmap%s", cluster_name);
 163 retry:
 164         ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE,
 165                                  &dlm_lock_res->lksb, flags, str, strlen(str),
 166                                  0, dlm_ast, dlm_lock_res, NULL, NULL);
 167         if (ret) {
 168                 pr_err("error %d when get PW mode on lock %s\n", errno, str);
 169                 /* let's try several times if EAGAIN happened */
 170                 if (dlm_lock_res->lksb.sb_status == EAGAIN && retry_count < 10) {
 171                         sleep_for(10, 0, true);
 172                         retry_count++;
 173                         goto retry;
 174                 }
 175                 dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
 176                 return ret;
 177         }
 178
 179         /* Wait for it to complete */
 180         poll_for_ast(dlm_lock_res->ls);
 181
 182         if (dlm_lock_res->lksb.sb_status) {
 183                 pr_err("failed to lock cluster\n");
 184                 return -1;
 185         }
 186         return 1;
 187 }
 188
 189 int cluster_release_dlmlock(void)
 190 {
 191         int ret = -1;
 192
 193         if (!cluster_name)
 194                 goto out;
 195
 196         if (!dlm_lock_res->lksb.sb_lkid)
 197                 goto out;
 198
 199         ret = dlm_hooks->ls_unlock_wait(dlm_lock_res->ls,
 200                                         dlm_lock_res->lksb.sb_lkid, 0,
 201                                         &dlm_lock_res->lksb);
 202         if (ret) {
 203                 pr_err("error %d happened when unlock\n", errno);
 204                 /* XXX make sure the lock is unlocked eventually */
 205                 goto out;
 206         }
 207
 208         /* Wait for it to complete */
 209         poll_for_ast(dlm_lock_res->ls);
 210
 211         errno = dlm_lock_res->lksb.sb_status;
 212         if (errno != EUNLOCK) {
 213                 pr_err("error %d happened in ast when unlock lockspace\n",
 214                        errno);
 215                 /* XXX make sure the lockspace is unlocked eventually */
 216                 goto out;
 217         }
 218
 219         ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
 220         if (ret) {
 221                 pr_err("error %d happened when release lockspace\n", errno);
 222                 /* XXX make sure the lockspace is released eventually */
 223                 goto out;
 224         }
 225         free(dlm_lock_res);
 226
 227 out:
 228         return ret;
 229 }
 230
 231 int md_array_valid(int fd)
 232 {
 233         struct mdinfo *sra;
 234         int ret;
 235
 236         sra = sysfs_read(fd, NULL, GET_ARRAY_STATE);
 237         if (sra) {
 238                 if (sra->array_state != ARRAY_UNKNOWN_STATE)
 239                         ret = 0;
 240                 else
 241                         ret = -ENODEV;
 242
 243                 free(sra);
 244         } else {
 245                 /*
 246                  * GET_ARRAY_INFO doesn't provide access to the proper state
 247                  * information, so fallback to a basic check for raid_disks != 0
 248                  */
 249                 ret = ioctl(fd, RAID_VERSION);
 250         }
 251
 252         return !ret;
 253 }
 254
 255 int md_array_active(int fd)
 256 {
 257         struct mdinfo *sra;
 258         struct mdu_array_info_s array;
 259         int ret = 0;
 260
 261         sra = sysfs_read(fd, NULL, GET_ARRAY_STATE);
 262         if (sra) {
 263                 if (!md_array_is_active(sra))
 264                         ret = -ENODEV;
 265
 266                 free(sra);
 267         } else {
 268                 /*
 269                  * GET_ARRAY_INFO doesn't provide access to the proper state
 270                  * information, so fallback to a basic check for raid_disks != 0
 271                  */
 272                 ret = md_get_array_info(fd, &array);
 273         }
 274
 275         return !ret;
 276 }
 277
 278 int md_array_is_active(struct mdinfo *info)
 279 {
 280         return (info->array_state != ARRAY_CLEAR &&
 281                 info->array_state != ARRAY_INACTIVE &&
 282                 info->array_state != ARRAY_UNKNOWN_STATE);
 283 }
 284
 285 /*
 286  * Get array info from the kernel. Longer term we want to deprecate the
 287  * ioctl and get it from sysfs.
 288  */
 289 int md_get_array_info(int fd, struct mdu_array_info_s *array)
 290 {
 291         return ioctl(fd, GET_ARRAY_INFO, array);
 292 }
 293
 294 /*
 295  * Set array info
 296  */
 297 int md_set_array_info(int fd, struct mdu_array_info_s *array)
 298 {
 299         return ioctl(fd, SET_ARRAY_INFO, array);
 300 }
 301
 302 /*
 303  * Get disk info from the kernel.
 304  */
 305 int md_get_disk_info(int fd, struct mdu_disk_info_s *disk)
 306 {
 307         return ioctl(fd, GET_DISK_INFO, disk);
 308 }
 309
 310 int get_linux_version()
 311 {
 312         struct utsname name;
 313         char *cp;
 314         int a = 0, b = 0,c = 0;
 315         if (uname(&name) <0)
 316                 return -1;
 317
 318         cp = name.release;
 319         a = strtoul(cp, &cp, 10);
 320         if (*cp == '.')
 321                 b = strtoul(cp+1, &cp, 10);
 322         if (*cp == '.')
 323                 c = strtoul(cp+1, &cp, 10);
 324
 325         return (a*1000000)+(b*1000)+c;
 326 }
 327
 328 int mdadm_version(char *version)
 329 {
 330         int a, b, c;
 331         char *cp;
 332
 333         if (!version)
 334                 version = Version;
 335
 336         cp = strchr(version, '-');
 337         if (!cp || *(cp+1) != ' ' || *(cp+2) != 'v')
 338                 return -1;
 339         cp += 3;
 340         a = strtoul(cp, &cp, 10);
 341         if (*cp != '.')
 342                 return -1;
 343         b = strtoul(cp+1, &cp, 10);
 344         if (*cp == '.')
 345                 c = strtoul(cp+1, &cp, 10);
 346         else
 347                 c = 0;
 348         if (*cp != ' ' && *cp != '-')
 349                 return -1;
 350         return (a*1000000)+(b*1000)+c;
 351 }
 352
 353 unsigned long long parse_size(char *size)
 354 {
 355         /* parse 'size' which should be a number optionally
 356          * followed by 'K', 'M'. 'G' or 'T'.
 357          * Without a suffix, K is assumed.
 358          * Number returned is in sectors (half-K)
 359          * INVALID_SECTORS returned on error.
 360          */
 361         char *c;
 362         long long s = strtoll(size, &c, 10);
 363         if (s > 0) {
 364                 switch (*c) {
 365                 case 'K':
 366                         c++;
 367                 default:
 368                         s *= 2;
 369                         break;
 370                 case 'M':
 371                         c++;
 372                         s *= 1024 * 2;
 373                         break;
 374                 case 'G':
 375                         c++;
 376                         s *= 1024 * 1024 * 2;
 377                         break;
 378                 case 'T':
 379                         c++;
 380                         s *= 1024 * 1024 * 1024 * 2LL;
 381                         break;
 382                 case 's': /* sectors */
 383                         c++;
 384                         break;
 385                 }
 386         } else
 387                 s = INVALID_SECTORS;
 388         if (*c)
 389                 s = INVALID_SECTORS;
 390         return s;
 391 }
 392
 393 int is_near_layout_10(int layout)
 394 {
 395         int fc, fo;
 396
 397         fc = (layout >> 8) & 255;
 398         fo = layout & (1 << 16);
 399         if (fc > 1 || fo > 0)
 400                 return 0;
 401         return 1;
 402 }
 403
 404 int parse_layout_10(char *layout)
 405 {
 406         int copies, rv;
 407         char *cp;
 408         /* Parse the layout string for raid10 */
 409         /* 'f', 'o' or 'n' followed by a number <= raid_disks */
 410         if ((layout[0] !=  'n' && layout[0] != 'f' && layout[0] != 'o') ||
 411             (copies = strtoul(layout+1, &cp, 10)) < 1 ||
 412             copies > 200 ||
 413             *cp)
 414                 return -1;
 415         if (layout[0] == 'n')
 416                 rv = 256 + copies;
 417         else if (layout[0] == 'o')
 418                 rv = 0x10000 + (copies<<8) + 1;
 419         else
 420                 rv = 1 + (copies<<8);
 421         return rv;
 422 }
 423
 424 int parse_layout_faulty(char *layout)
 425 {
 426         int ln, mode;
 427         char *m;
 428
 429         if (!layout)
 430                 return -1;
 431
 432         /* Parse the layout string for 'faulty' */
 433         ln = strcspn(layout, "0123456789");
 434         m = xstrdup(layout);
 435         m[ln] = 0;
 436         mode = map_name(faultylayout, m);
 437         free(m);
 438
 439         if (mode == UnSet)
 440                 return -1;
 441
 442         return mode | (atoi(layout+ln)<< ModeShift);
 443 }
 444
 445 int parse_cluster_confirm_arg(char *input, char **devname, int *slot)
 446 {
 447         char *dev;
 448         *slot = strtoul(input, &dev, 10);
 449         if (dev == input || dev[0] != ':')
 450                 return -1;
 451         *devname = dev+1;
 452         return 0;
 453 }
 454
 455 void remove_partitions(int fd)
 456 {
 457         /* remove partitions from this block devices.
 458          * This is used for components added to an array
 459          */
 460 #ifdef BLKPG_DEL_PARTITION
 461         struct blkpg_ioctl_arg a;
 462         struct blkpg_partition p;
 463
 464         a.op = BLKPG_DEL_PARTITION;
 465         a.data = (void*)&p;
 466         a.datalen = sizeof(p);
 467         a.flags = 0;
 468         memset(a.data, 0, a.datalen);
 469         for (p.pno = 0; p.pno < 16; p.pno++)
 470                 ioctl(fd, BLKPG, &a);
 471 #endif
 472 }
 473
 474 int test_partition(int fd)
 475 {
 476         /* Check if fd is a whole-disk or a partition.
 477          * BLKPG will return EINVAL on a partition, and BLKPG_DEL_PARTITION
 478          * will return ENXIO on an invalid partition number.
 479          */
 480         struct blkpg_ioctl_arg a;
 481         struct blkpg_partition p;
 482         a.op = BLKPG_DEL_PARTITION;
 483         a.data = (void*)&p;
 484         a.datalen = sizeof(p);
 485         a.flags = 0;
 486         memset(a.data, 0, a.datalen);
 487         p.pno = 1<<30;
 488         if (ioctl(fd, BLKPG, &a) == 0)
 489                 /* Very unlikely, but not a partition */
 490                 return 0;
 491         if (errno == ENXIO || errno == ENOTTY)
 492                 /* not a partition */
 493                 return 0;
 494
 495         return 1;
 496 }
 497
 498 int test_partition_from_id(dev_t id)
 499 {
 500         char buf[20];
 501         int fd, rv;
 502
 503         sprintf(buf, "%d:%d", major(id), minor(id));
 504         fd = dev_open(buf, O_RDONLY);
 505         if (fd < 0)
 506                 return -1;
 507         rv = test_partition(fd);
 508         close(fd);
 509         return rv;
 510 }
 511
 512 int enough(int level, int raid_disks, int layout, int clean, char *avail)
 513 {
 514         int copies, first;
 515         int i;
 516         int avail_disks = 0;
 517
 518         if (raid_disks <= 0)
 519                 return 0;
 520
 521         for (i = 0; i < raid_disks; i++)
 522                 avail_disks += !!avail[i];
 523
 524         switch (level) {
 525         case 10:
 526                 /* This is the tricky one - we need to check
 527                  * which actual disks are present.
 528                  */
 529                 copies = (layout & 255) * ((layout >> 8) & 255);
 530                 first = 0;
 531                 do {
 532                         /* there must be one of the 'copies' form 'first' */
 533                         int n = copies;
 534                         int cnt = 0;
 535                         int this = first;
 536                         while (n--) {
 537                                 if (avail[this])
 538                                         cnt++;
 539                                 this = (this + 1) % raid_disks;
 540                         }
 541                         if (cnt == 0)
 542                                 return 0;
 543                         first = (first + (layout & 255)) % raid_disks;
 544                 } while (first != 0);
 545                 return 1;
 546
 547         case LEVEL_MULTIPATH:
 548                 return avail_disks >= 1;
 549         case LEVEL_LINEAR:
 550         case 0:
 551                 return avail_disks == raid_disks;
 552         case 1:
 553                 return avail_disks >= 1;
 554         case 4:
 555                 if (avail_disks == raid_disks - 1 &&
 556                     !avail[raid_disks - 1])
 557                         /* If just the parity device is missing, then we
 558                          * have enough, even if not clean
 559                          */
 560                         return 1;
 561                 /* FALL THROUGH */
 562         case 5:
 563                 if (clean)
 564                         return avail_disks >= raid_disks - 1;
 565                 else
 566                         return avail_disks >= raid_disks;
 567         case 6:
 568                 if (clean)
 569                         return avail_disks >= raid_disks - 2;
 570                 else
 571                         return avail_disks >= raid_disks;
 572         default:
 573                 return 0;
 574         }
 575 }
 576
 577 char *__fname_from_uuid(int id[4], int swap, char *buf, char sep)
 578 {
 579         int i, j;
 580         char uuid[16];
 581         char *c = buf;
 582         strcpy(c, "UUID-");
 583         c += strlen(c);
 584         copy_uuid(uuid, id, swap);
 585         for (i = 0; i < 4; i++) {
 586                 if (i)
 587                         *c++ = sep;
 588                 for (j = 3; j >= 0; j--) {
 589                         sprintf(c,"%02x", (unsigned char) uuid[j+4*i]);
 590                         c+= 2;
 591                 }
 592         }
 593         return buf;
 594
 595 }
 596
 597 /**
 598  * fname_from_uuid() - generate uuid string. Should not be used with super1.
 599  * @info: info with uuid
 600  * @buf: buf to fill.
 601  *
 602  * This routine should not be used with super1. See detail_fname_from_uuid() for details. It does
 603  * not use superswitch swapuuid as it should be 0 but it has to do UUID conversion if host is big
 604  * endian- left for backward compatibility.
 605  */
 606 char *fname_from_uuid(struct mdinfo *info, char *buf)
 607 {
 608 #if __BYTE_ORDER == BIG_ENDIAN
 609         return __fname_from_uuid(info->uuid, true, buf, ':');
 610 #else
 611         return __fname_from_uuid(info->uuid, false, buf, ':');
 612 #endif
 613 }
 614
 615 int check_ext2(int fd, char *name)
 616 {
 617         /*
 618          * Check for an ext2fs file system.
 619          * Superblock is always 1K at 1K offset
 620          *
 621          * s_magic is le16 at 56 == 0xEF53
 622          * report mtime - le32 at 44
 623          * blocks - le32 at 4
 624          * logblksize - le32 at 24
 625          */
 626         unsigned char sb[1024];
 627         time_t mtime;
 628         unsigned long long size;
 629         int bsize;
 630         if (lseek(fd, 1024,0)!= 1024)
 631                 return 0;
 632         if (read(fd, sb, 1024)!= 1024)
 633                 return 0;
 634         if (sb[56] != 0x53 || sb[57] != 0xef)
 635                 return 0;
 636
 637         mtime = sb[44]|(sb[45]|(sb[46]|sb[47]<<8)<<8)<<8;
 638         bsize = sb[24]|(sb[25]|(sb[26]|sb[27]<<8)<<8)<<8;
 639         size = sb[4]|(sb[5]|(sb[6]|sb[7]<<8)<<8)<<8;
 640         size <<= bsize;
 641         pr_info("%s appears to contain an ext2fs file system\n",
 642                 name);
 643         pr_info("size=%lluK  mtime=%s", size, ctime(&mtime));
 644         return 1;
 645 }
 646
 647 int check_reiser(int fd, char *name)
 648 {
 649         /*
 650          * superblock is at 64K
 651          * size is 1024;
 652          * Magic string "ReIsErFs" or "ReIsEr2Fs" at 52
 653          *
 654          */
 655         unsigned char sb[1024];
 656         unsigned long long size;
 657         if (lseek(fd, 64*1024, 0) != 64*1024)
 658                 return 0;
 659         if (read(fd, sb, 1024) != 1024)
 660                 return 0;
 661         if (strncmp((char*)sb+52, "ReIsErFs",8) != 0 &&
 662             strncmp((char*)sb+52, "ReIsEr2Fs",9) != 0)
 663                 return 0;
 664         pr_err("%s appears to contain a reiserfs file system\n",name);
 665         size = sb[0]|(sb[1]|(sb[2]|sb[3]<<8)<<8)<<8;
 666         cont_err("size = %lluK\n", size*4);
 667
 668         return 1;
 669 }
 670
 671 int check_raid(int fd, char *name)
 672 {
 673         struct mdinfo info;
 674         time_t crtime;
 675         char *level;
 676         struct supertype *st = guess_super(fd);
 677
 678         if (!st)
 679                 return 0;
 680         if (st->ss->add_to_super != NULL) {
 681                 st->ss->load_super(st, fd, name);
 682                 /* Looks like a raid array .. */
 683                 pr_err("%s appears to be part of a raid array:\n", name);
 684                 st->ss->getinfo_super(st, &info, NULL);
 685                 st->ss->free_super(st);
 686                 crtime = info.array.ctime;
 687                 level = map_num(pers, info.array.level);
 688                 if (!level)
 689                         level = "-unknown-";
 690                 cont_err("level=%s devices=%d ctime=%s",
 691                         level, info.array.raid_disks, ctime(&crtime));
 692         } else {
 693                 /* Looks like GPT or MBR */
 694                 pr_err("partition table exists on %s\n", name);
 695         }
 696
 697         free(st);
 698         return 1;
 699 }
 700
 701 int fstat_is_blkdev(int fd, char *devname, dev_t *rdev)
 702 {
 703         struct stat stb;
 704
 705         if (fstat(fd, &stb) != 0) {
 706                 pr_err("fstat failed for %s: %s\n", devname, strerror(errno));
 707                 return 0;
 708         }
 709         if ((S_IFMT & stb.st_mode) != S_IFBLK) {
 710                 pr_err("%s is not a block device.\n", devname);
 711                 return 0;
 712         }
 713         if (rdev)
 714                 *rdev = stb.st_rdev;
 715         return 1;
 716 }
 717
 718 int stat_is_blkdev(char *devname, dev_t *rdev)
 719 {
 720         struct stat stb;
 721
 722         if (stat(devname, &stb) != 0) {
 723                 pr_err("stat failed for %s: %s\n", devname, strerror(errno));
 724                 return 0;
 725         }
 726         if ((S_IFMT & stb.st_mode) != S_IFBLK) {
 727                 pr_err("%s is not a block device.\n", devname);
 728                 return 0;
 729         }
 730         if (rdev)
 731                 *rdev = stb.st_rdev;
 732         return 1;
 733 }
 734
 735 /**
 736  * ask() - prompt user for "yes/no" dialog.
 737  * @mesg: message to be printed, without '?' sign.
 738  * Returns: 1 if 'Y/y', 0 otherwise.
 739  *
 740  * The default value is 'N/n', thus the caps on "N" on prompt.
 741  */
 742 int ask(char *mesg)
 743 {
 744         char buf[3] = {0};
 745
 746         fprintf(stderr, "%s [y/N]? ", mesg);
 747         fflush(stderr);
 748         if (fgets(buf, 3, stdin) == NULL)
 749                 return 0;
 750         if (strlen(buf) == 1) {
 751                 pr_err("assuming no.\n");
 752                 return 0;
 753         }
 754         if (buf[1] != '\n')
 755                 goto bad_option;
 756         if (toupper(buf[0]) == 'Y')
 757                 return 1;
 758         if (toupper(buf[0]) == 'N')
 759                 return 0;
 760 bad_option:
 761         pr_err("bad option.\n");
 762         return 0;
 763 }
 764
 765 unsigned long calc_csum(void *super, int bytes)
 766 {
 767         unsigned long long newcsum = 0;
 768         int i;
 769         unsigned int csum;
 770         unsigned int *superc = (unsigned int*) super;
 771
 772         for(i = 0; i < bytes/4; i++)
 773                 newcsum += superc[i];
 774         csum = (newcsum& 0xffffffff) + (newcsum>>32);
 775 #ifdef __alpha__
 776 /* The in-kernel checksum calculation is always 16bit on
 777  * the alpha, though it is 32 bit on i386...
 778  * I wonder what it is elsewhere... (it uses an API in
 779  * a way that it shouldn't).
 780  */
 781         csum = (csum & 0xffff) + (csum >> 16);
 782         csum = (csum & 0xffff) + (csum >> 16);
 783 #endif
 784         return csum;
 785 }
 786
 787 char *human_size(long long bytes)
 788 {
 789         static char buf[47];
 790
 791         /* We convert bytes to either centi-M{ega,ibi}bytes,
 792          * centi-G{igi,ibi}bytes or centi-T{era,ebi}bytes
 793          * with appropriate rounding, and then print
 794          * 1/100th of those as a decimal.
 795          * We allow upto 2048Megabytes before converting to
 796          * gigabytes and 2048Gigabytes before converting to
 797          * terabytes, as that shows more precision and isn't
 798          * too large a number.
 799          */
 800
 801         if (bytes < 5000*1024)
 802                 buf[0] = 0;
 803         else if (bytes < 2*1024LL*1024LL*1024LL) {
 804                 long cMiB = (bytes * 200LL / (1LL<<20) + 1) / 2;
 805                 long cMB  = (bytes / ( 1000000LL / 200LL ) +1) /2;
 806                 snprintf(buf, sizeof(buf), " (%ld.%02ld MiB %ld.%02ld MB)",
 807                         cMiB/100, cMiB % 100, cMB/100, cMB % 100);
 808         } else if (bytes < 2*1024LL*1024LL*1024LL*1024LL) {
 809                 long cGiB = (bytes * 200LL / (1LL<<30) +1) / 2;
 810                 long cGB  = (bytes / (1000000000LL/200LL ) +1) /2;
 811                 snprintf(buf, sizeof(buf), " (%ld.%02ld GiB %ld.%02ld GB)",
 812                         cGiB/100, cGiB % 100, cGB/100, cGB % 100);
 813         } else {
 814                 long cTiB = (bytes * 200LL / (1LL<<40) + 1) / 2;
 815                 long cTB  = (bytes / (1000000000000LL / 200LL) + 1) / 2;
 816                 snprintf(buf, sizeof(buf), " (%ld.%02ld TiB %ld.%02ld TB)",
 817                         cTiB/100, cTiB % 100, cTB/100, cTB % 100);
 818         }
 819         return buf;
 820 }
 821
 822 char *human_size_brief(long long bytes, int prefix)
 823 {
 824         static char buf[30];
 825
 826         /* We convert bytes to either centi-M{ega,ibi}bytes,
 827          * centi-G{igi,ibi}bytes or centi-T{era,ebi}bytes
 828          * with appropriate rounding, and then print
 829          * 1/100th of those as a decimal.
 830          * We allow upto 2048Megabytes before converting to
 831          * gigabytes and 2048Gigabytes before converting to
 832          * terabytes, as that shows more precision and isn't
 833          * too large a number.
 834          *
 835          * If prefix == IEC, we mean prefixes like kibi,mebi,gibi etc.
 836          * If prefix == JEDEC, we mean prefixes like kilo,mega,giga etc.
 837          */
 838
 839         if (bytes < 5000*1024)
 840                 buf[0] = 0;
 841         else if (prefix == IEC) {
 842                 if (bytes < 2*1024LL*1024LL*1024LL) {
 843                         long cMiB = (bytes * 200LL / (1LL<<20) +1) /2;
 844                         snprintf(buf, sizeof(buf), "%ld.%02ldMiB",
 845                                  cMiB/100, cMiB % 100);
 846                 } else if (bytes < 2*1024LL*1024LL*1024LL*1024LL) {
 847                         long cGiB = (bytes * 200LL / (1LL<<30) +1) /2;
 848                         snprintf(buf, sizeof(buf), "%ld.%02ldGiB",
 849                                  cGiB/100, cGiB % 100);
 850                 } else {
 851                         long cTiB = (bytes * 200LL / (1LL<<40) + 1) / 2;
 852                         snprintf(buf, sizeof(buf), "%ld.%02ldTiB",
 853                                  cTiB/100, cTiB % 100);
 854                 }
 855         }
 856         else if (prefix == JEDEC) {
 857                 if (bytes < 2*1024LL*1024LL*1024LL) {
 858                         long cMB  = (bytes / ( 1000000LL / 200LL ) +1) /2;
 859                         snprintf(buf, sizeof(buf), "%ld.%02ldMB",
 860                                  cMB/100, cMB % 100);
 861                 } else if (bytes < 2*1024LL*1024LL*1024LL*1024LL) {
 862                         long cGB  = (bytes / (1000000000LL/200LL ) +1) /2;
 863                         snprintf(buf, sizeof(buf), "%ld.%02ldGB",
 864                                  cGB/100, cGB % 100);
 865                 } else {
 866                         long cTB  = (bytes / (1000000000000LL / 200LL) + 1) / 2;
 867                         snprintf(buf, sizeof(buf), "%ld.%02ldTB",
 868                                  cTB/100, cTB % 100);
 869                 }
 870         }
 871         else
 872                 buf[0] = 0;
 873
 874         return buf;
 875 }
 876
 877 void print_r10_layout(int layout)
 878 {
 879         int near = layout & 255;
 880         int far = (layout >> 8) & 255;
 881         int offset = (layout&0x10000);
 882         char *sep = "";
 883
 884         if (near != 1) {
 885                 printf("%s near=%d", sep, near);
 886                 sep = ",";
 887         }
 888         if (far != 1)
 889                 printf("%s %s=%d", sep, offset?"offset":"far", far);
 890         if (near*far == 1)
 891                 printf("NO REDUNDANCY");
 892 }
 893
 894 unsigned long long calc_array_size(int level, int raid_disks, int layout,
 895                                    int chunksize, unsigned long long devsize)
 896 {
 897         if (level == 1)
 898                 return devsize;
 899         devsize &= ~(unsigned long long)((chunksize>>9)-1);
 900         return get_data_disks(level, layout, raid_disks) * devsize;
 901 }
 902
 903 int get_data_disks(int level, int layout, int raid_disks)
 904 {
 905         int data_disks = 0;
 906         switch (level) {
 907         case 0: data_disks = raid_disks;
 908                 break;
 909         case 1: data_disks = 1;
 910                 break;
 911         case 4:
 912         case 5: data_disks = raid_disks - 1;
 913                 break;
 914         case 6: data_disks = raid_disks - 2;
 915                 break;
 916         case 10: data_disks = raid_disks / (layout & 255) / ((layout>>8)&255);
 917                 break;
 918         }
 919
 920         return data_disks;
 921 }
 922
 923 dev_t devnm2devid(char *devnm)
 924 {
 925         /* First look in /sys/block/$DEVNM/dev for %d:%d
 926          * If that fails, try parsing out a number
 927          */
 928         char path[PATH_MAX];
 929         char *ep;
 930         int fd;
 931         int mjr,mnr;
 932
 933         snprintf(path, sizeof(path), "/sys/block/%s/dev", devnm);
 934         fd = open(path, O_RDONLY);
 935         if (fd >= 0) {
 936                 char buf[20];
 937                 int n = read(fd, buf, sizeof(buf));
 938                 close(fd);
 939                 if (n > 0)
 940                         buf[n] = 0;
 941                 if (n > 0 && sscanf(buf, "%d:%d\n", &mjr, &mnr) == 2)
 942                         return makedev(mjr, mnr);
 943         }
 944         if (strncmp(devnm, "md_d", 4) == 0 &&
 945             isdigit(devnm[4]) &&
 946             (mnr = strtoul(devnm+4, &ep, 10)) >= 0 &&
 947             ep > devnm && *ep == 0)
 948                 return makedev(get_mdp_major(), mnr << MdpMinorShift);
 949
 950         if (strncmp(devnm, "md", 2) == 0 &&
 951             isdigit(devnm[2]) &&
 952             (mnr = strtoul(devnm+2, &ep, 10)) >= 0 &&
 953             ep > devnm && *ep == 0)
 954                 return makedev(MD_MAJOR, mnr);
 955
 956         return 0;
 957 }
 958
 959 /**
 960  * is_devname_numbered() - helper for numbered devname verification.
 961  * @devname: path or name to check.
 962  * @pref: expected devname prefix.
 963  * @pref_len: prefix len.
 964  */
 965 static bool is_devname_numbered(const char *devname, const char *pref, const int pref_len)
 966 {
 967         int val;
 968
 969         assert(devname && pref);
 970
 971         if (strncmp(devname, pref, pref_len) != 0)
 972                 return false;
 973
 974         if (parse_num(&val, devname + pref_len) != 0)
 975                 return false;
 976
 977         /* Allow any number that represents a valid minor number */
 978         if (val >= (1 << 20))
 979                 return false;
 980
 981         return true;
 982 }
 983
 984 /**
 985  * is_devname_md_numbered() - check if &devname is numbered MD device (md).
 986  * @devname: path or name to check.
 987  */
 988 bool is_devname_md_numbered(const char *devname)
 989 {
 990         return is_devname_numbered(devname, DEV_NUM_PREF, DEV_NUM_PREF_LEN);
 991 }
 992
 993 /**
 994  * is_devname_md_d_numbered() - check if &devname is secondary numbered MD device (md_d).
 995  * @devname: path or name to check.
 996  */
 997 bool is_devname_md_d_numbered(const char *devname)
 998 {
 999         static const char d_dev[] = DEV_NUM_PREF "_d";
1000
1001         return is_devname_numbered(devname, d_dev, sizeof(d_dev) - 1);
1002 }
1003
1004 /**
1005  * get_md_name() - Get main dev node of the md device.
1006  * @devnm: Md device name or path.
1007  *
1008  * Function checks if the full name was passed and returns md name
1009  * if it is the MD device.
1010  *
1011  * Return: Main dev node of the md device or NULL if not found.
1012  */
1013 char *get_md_name(char *devnm)
1014 {
1015         static char devname[NAME_MAX];
1016         struct stat stb;
1017
1018         if (strncmp(devnm, "/dev/", 5) == 0)
1019                 snprintf(devname, sizeof(devname), "%s", devnm);
1020         else
1021                 snprintf(devname, sizeof(devname), "/dev/%s", devnm);
1022
1023         if (!is_mddev(devname))
1024                 return NULL;
1025         if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK)
1026                 return devname;
1027
1028         return NULL;
1029 }
1030
/*
 * Remove @name from the filesystem, but only if it begins with the
 * temporary device-node prefix "/dev/.tmp.md"; any other name is left
 * untouched.
 */
void put_md_name(char *name)
{
	static const char tmp_prefix[] = "/dev/.tmp.md";

	if (strncmp(name, tmp_prefix, sizeof(tmp_prefix) - 1) != 0)
		return;

	unlink(name);
}
1036
/*
 * Parse a "major:minor" string.
 *
 * @dev:   text to parse; each number is read with strtoul() base 0.
 * @major: receives the major number.
 * @minor: receives the minor number.
 *
 * Returns 1 if @dev consists of exactly two numbers separated by ':' and
 * both fit in a non-negative int, 0 otherwise.  The outputs are only
 * meaningful when 1 is returned.
 *
 * Values are range-checked before being narrowed to int; previously an
 * oversized minor such as "8:4294967296" was silently truncated and
 * accepted, and a negative major was accepted as well.
 */
int get_maj_min(char *dev, int *major, int *minor)
{
	char *end;
	unsigned long val;

	val = strtoul(dev, &end, 0);
	if (end == dev || *end != ':' || end[1] == '\0' || val > INT_MAX)
		return 0;
	*major = (int)val;

	val = strtoul(end + 1, &end, 0);
	if (*end != '\0' || val > INT_MAX)
		return 0;
	*minor = (int)val;

	return 1;
}
1045
/**
 * is_bit_set() - get bit value by index.
 * @val: value.
 * @index: index of the bit (LSB numbering).
 *
 * An @index at or beyond the width of *@val is reported as not set; the
 * shift is done on an unsigned copy so that no shift count or result is
 * out of range for the type.
 *
 * Return: bit value.
 */
bool is_bit_set(int *val, unsigned char index)
{
	if (index >= sizeof(*val) * CHAR_BIT)
		return false;

	return (((unsigned int)*val >> index) & 1U) != 0;
}
1059
1060 int dev_open(char *dev, int flags)
1061 {
1062         /* like 'open', but if 'dev' matches %d:%d, create a temp
1063          * block device and open that
1064          */
1065         int fd = -1;
1066         char devname[32];
1067         int major;
1068         int minor;
1069
1070         if (!dev)
1071                 return -1;
1072         flags |= O_DIRECT;
1073
1074         if (get_maj_min(dev, &major, &minor)) {
1075                 snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d",
1076                          (int)getpid(), major, minor);
1077                 if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) {
1078                         fd = open(devname, flags);
1079                         unlink(devname);
1080                 }
1081         } else
1082                 fd = open(dev, flags);
1083         return fd;
1084 }
1085
/*
 * Open the device named @devnm with the given open() @flags.
 *
 * The name is converted with devnm2devid() and the resulting number is
 * handed to dev_open() as a "major:minor" string.  Returns whatever
 * dev_open() returns.
 */
int open_dev_flags(char *devnm, int flags)
{
	dev_t devid = devnm2devid(devnm);
	/* two full-width ints, a colon and the terminator fit in 32 bytes */
	char buf[32];

	snprintf(buf, sizeof(buf), "%d:%d",
		 (int)major(devid), (int)minor(devid));
	return dev_open(buf, flags);
}
1095
/* Open the device named @devnm read-only; see open_dev_flags(). */
int open_dev(char *devnm)
{
	const int fd = open_dev_flags(devnm, O_RDONLY);

	return fd;
}
1100
/*
 * Open the device named @devnm with O_EXCL, retrying while it is busy.
 *
 * The first attempts use O_RDWR; after an EACCES failure the mode is
 * switched to O_RDONLY.  An EBUSY failure is retried after a delay that
 * starts at 1 and doubles while it is below 200 (milliseconds, going by
 * MSEC_TO_NSEC); any other failure is returned immediately.  At most 25
 * attempts are made.
 *
 * Returns the file descriptor, or a negative value on failure.
 */
int open_dev_excl(char *devnm)
{
	/* two full-width ints, a colon and the terminator fit in 32 bytes */
	char buf[32];
	int i;
	int flags = O_RDWR;
	dev_t devid = devnm2devid(devnm);
	unsigned int delay = 1; /* milliseconds */

	snprintf(buf, sizeof(buf), "%d:%d",
		 (int)major(devid), (int)minor(devid));
	for (i = 0; i < 25; i++) {
		int fd = dev_open(buf, flags|O_EXCL);

		if (fd >= 0)
			return fd;
		if (errno == EACCES && flags == O_RDWR) {
			flags = O_RDONLY;
			continue;
		}
		if (errno != EBUSY)
			return fd;
		sleep_for(0, MSEC_TO_NSEC(delay), true);
		if (delay < 200)
			delay *= 2;
	}
	return -1;
}
1126
1127 int same_dev(char *one, char *two)
1128 {
1129         struct stat st1, st2;
1130         if (stat(one, &st1) != 0)
1131                 return 0;
1132         if (stat(two, &st2) != 0)
1133                 return 0;
1134         if ((st1.st_mode & S_IFMT) != S_IFBLK)
1135                 return 0;
1136         if ((st2.st_mode & S_IFMT) != S_IFBLK)
1137                 return 0;
1138         return st1.st_rdev == st2.st_rdev;
1139 }
1140
1141 void wait_for(char *dev, int fd)
1142 {
1143         int i;
1144         struct stat stb_want;
1145         unsigned int delay = 1; // miliseconds
1146
1147         if (fstat(fd, &stb_want) != 0 ||
1148             (stb_want.st_mode & S_IFMT) != S_IFBLK)
1149                 return;
1150
1151         for (i = 0; i < 25; i++) {
1152                 struct stat stb;
1153                 if (stat(dev, &stb) == 0 &&
1154                     (stb.st_mode & S_IFMT) == S_IFBLK &&
1155                     (stb.st_rdev == stb_want.st_rdev))
1156                         return;
1157                 sleep_for(0, MSEC_TO_NSEC(delay), true);
1158                 if (delay < 200)
1159                         delay *= 2;
1160         }
1161         if (i == 25)
1162                 pr_err("timeout waiting for %s\n", dev);
1163 }
1164
1165 struct superswitch *superlist[] =
1166 {
1167         &super0, &super1,
1168         &super_ddf, &super_imsm,
1169         &mbr, &gpt,
1170         NULL
1171 };
1172
1173 struct supertype *super_by_fd(int fd, char **subarrayp)
1174 {
1175         mdu_array_info_t array;
1176         int vers;
1177         int minor;
1178         struct supertype *st = NULL;
1179         struct mdinfo *sra;
1180         char *verstr;
1181         char version[20];
1182         int i;
1183         char *subarray = NULL;
1184         char container[32] = "";
1185         char *devnm = NULL;
1186
1187         devnm = fd2devnm(fd);
1188         if (!devnm)
1189                 return NULL;
1190
1191         sra = sysfs_read(fd, NULL, GET_VERSION);
1192
1193         if (sra) {
1194                 vers = sra->array.major_version;
1195                 minor = sra->array.minor_version;
1196                 verstr = sra->text_version;
1197         } else {
1198                 if (md_get_array_info(fd, &array))
1199                         array.major_version = array.minor_version = 0;
1200                 vers = array.major_version;
1201                 minor = array.minor_version;
1202                 verstr = "";
1203         }
1204
1205         if (vers != -1) {
1206                 sprintf(version, "%d.%d", vers, minor);
1207                 verstr = version;
1208         }
1209         if (minor == -2 && is_subarray(verstr)) {
1210                 char *dev = verstr+1;
1211
1212                 subarray = strchr(dev, '/');
1213                 if (subarray) {
1214                         *subarray++ = '\0';
1215                         subarray = xstrdup(subarray);
1216                 }
1217                 snprintf(container, sizeof(container), "%s", dev);
1218                 sysfs_free(sra);
1219                 sra = sysfs_read(-1, container, GET_VERSION);
1220                 if (sra && sra->text_version[0])
1221                         verstr = sra->text_version;
1222                 else
1223                         verstr = "-no-metadata-";
1224         }
1225
1226         for (i = 0; st == NULL && superlist[i]; i++)
1227                 st = superlist[i]->match_metadata_desc(verstr);
1228
1229         sysfs_free(sra);
1230         if (st) {
1231                 st->sb = NULL;
1232                 if (subarrayp)
1233                         *subarrayp = subarray;
1234                 strcpy(st->container_devnm, container);
1235                 strncpy(st->devnm, devnm, MD_NAME_MAX - 1);
1236         } else
1237                 free(subarray);
1238
1239         return st;
1240 }
1241
1242 struct supertype *dup_super(struct supertype *orig)
1243 {
1244         struct supertype *st;
1245
1246         if (!orig)
1247                 return orig;
1248         st = xcalloc(1, sizeof(*st));
1249         st->ss = orig->ss;
1250         st->max_devs = orig->max_devs;
1251         st->minor_version = orig->minor_version;
1252         st->ignore_hw_compat = orig->ignore_hw_compat;
1253         st->data_offset = orig->data_offset;
1254         st->sb = NULL;
1255         st->info = NULL;
1256         return st;
1257 }
1258
1259 struct supertype *guess_super_type(int fd, enum guess_types guess_type)
1260 {
1261         /* try each load_super to find the best match,
1262          * and return the best superswitch
1263          */
1264         struct superswitch  *ss;
1265         struct supertype *st;
1266         unsigned int besttime = 0;
1267         int bestsuper = -1;
1268         int i;
1269
1270         st = xcalloc(1, sizeof(*st));
1271         st->container_devnm[0] = 0;
1272
1273         for (i = 0; superlist[i]; i++) {
1274                 int rv;
1275                 ss = superlist[i];
1276                 if (guess_type == guess_array && ss->add_to_super == NULL)
1277                         continue;
1278                 if (guess_type == guess_partitions && ss->add_to_super != NULL)
1279                         continue;
1280                 memset(st, 0, sizeof(*st));
1281                 st->ignore_hw_compat = 1;
1282                 rv = ss->load_super(st, fd, NULL);
1283                 if (rv == 0) {
1284                         struct mdinfo info;
1285                         st->ss->getinfo_super(st, &info, NULL);
1286                         if (bestsuper == -1 ||
1287                             besttime < info.array.ctime) {
1288                                 bestsuper = i;
1289                                 besttime = info.array.ctime;
1290                         }
1291                         ss->free_super(st);
1292                 }
1293         }
1294         if (bestsuper != -1) {
1295                 int rv;
1296                 memset(st, 0, sizeof(*st));
1297                 st->ignore_hw_compat = 1;
1298                 rv = superlist[bestsuper]->load_super(st, fd, NULL);
1299                 if (rv == 0) {
1300                         superlist[bestsuper]->free_super(st);
1301                         return st;
1302                 }
1303         }
1304         free(st);
1305         return NULL;
1306 }
1307
1308 /* Return size of device in bytes */
1309 int get_dev_size(int fd, char *dname, unsigned long long *sizep)
1310 {
1311         unsigned long long ldsize;
1312         struct stat st;
1313
1314         if (fstat(fd, &st) != -1 && S_ISREG(st.st_mode))
1315                 ldsize = (unsigned long long)st.st_size;
1316         else
1317 #ifdef BLKGETSIZE64
1318         if (ioctl(fd, BLKGETSIZE64, &ldsize) != 0)
1319 #endif
1320         {
1321                 unsigned long dsize;
1322                 if (ioctl(fd, BLKGETSIZE, &dsize) == 0) {
1323                         ldsize = dsize;
1324                         ldsize <<= 9;
1325                 } else {
1326                         if (dname)
1327                                 pr_err("Cannot get size of %s: %s\n",
1328                                         dname, strerror(errno));
1329                         return 0;
1330                 }
1331         }
1332         *sizep = ldsize;
1333         return 1;
1334 }
1335
1336 /* Return sector size of device in bytes */
1337 int get_dev_sector_size(int fd, char *dname, unsigned int *sectsizep)
1338 {
1339         unsigned int sectsize;
1340
1341         if (ioctl(fd, BLKSSZGET, &sectsize) != 0) {
1342                 if (dname)
1343                         pr_err("Cannot get sector size of %s: %s\n",
1344                                 dname, strerror(errno));
1345                 return 0;
1346         }
1347
1348         *sectsizep = sectsize;
1349         return 1;
1350 }
1351
1352 /* Return true if this can only be a container, not a member device.
1353  * i.e. is and md device and size is zero
1354  */
1355 int must_be_container(int fd)
1356 {
1357         struct mdinfo *mdi;
1358         unsigned long long size;
1359
1360         mdi = sysfs_read(fd, NULL, GET_VERSION);
1361         if (!mdi)
1362                 return 0;
1363         sysfs_free(mdi);
1364
1365         if (get_dev_size(fd, NULL, &size) == 0)
1366                 return 1;
1367         if (size == 0)
1368                 return 1;
1369         return 0;
1370 }
1371
1372 /* Sets endofpart parameter to the last block used by the last GPT partition on the device.
1373  * Returns: 1 if successful
1374  *         -1 for unknown partition type
1375  *          0 for other errors
1376  */
1377 static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart)
1378 {
1379         struct GPT gpt;
1380         unsigned char empty_gpt_entry[16]= {0};
1381         struct GPT_part_entry *part;
1382         char buf[512];
1383         unsigned long long curr_part_end;
1384         unsigned all_partitions, entry_size;
1385         unsigned part_nr;
1386         unsigned int sector_size = 0;
1387
1388         *endofpart = 0;
1389
1390         BUILD_BUG_ON(sizeof(gpt) != 512);
1391         /* skip protective MBR */
1392         if (!get_dev_sector_size(fd, NULL, &sector_size))
1393                 return 0;
1394         if (lseek(fd, sector_size, SEEK_SET) == -1L)
1395                 return 0;
1396         /* read GPT header */
1397         if (read(fd, &gpt, 512) != 512)
1398                 return 0;
1399
1400         /* get the number of partition entries and the entry size */
1401         all_partitions = __le32_to_cpu(gpt.part_cnt);
1402         entry_size = __le32_to_cpu(gpt.part_size);
1403
1404         /* Check GPT signature*/
1405         if (gpt.magic != GPT_SIGNATURE_MAGIC)
1406                 return -1;
1407
1408         /* sanity checks */
1409         if (all_partitions > 1024 ||
1410             entry_size > sizeof(buf))
1411                 return -1;
1412
1413         part = (struct GPT_part_entry *)buf;
1414
1415         /* set offset to third block (GPT entries) */
1416         if (lseek(fd, sector_size*2, SEEK_SET) == -1L)
1417                 return 0;
1418         for (part_nr = 0; part_nr < all_partitions; part_nr++) {
1419                 /* read partition entry */
1420                 if (read(fd, buf, entry_size) != (ssize_t)entry_size)
1421                         return 0;
1422
1423                 /* is this valid partition? */
1424                 if (memcmp(part->type_guid, empty_gpt_entry, 16) != 0) {
1425                         /* check the last lba for the current partition */
1426                         curr_part_end = __le64_to_cpu(part->ending_lba);
1427                         if (curr_part_end > *endofpart)
1428                                 *endofpart = curr_part_end;
1429                 }
1430
1431         }
1432         return 1;
1433 }
1434
1435 /* Sets endofpart parameter to the last block used by the last partition on the device.
1436  * Returns: 1 if successful
1437  *         -1 for unknown partition type
1438  *          0 for other errors
1439  */
1440 static int get_last_partition_end(int fd, unsigned long long *endofpart)
1441 {
1442         struct MBR boot_sect;
1443         unsigned long long curr_part_end;
1444         unsigned part_nr;
1445         unsigned int sector_size;
1446         int retval = 0;
1447
1448         *endofpart = 0;
1449
1450         BUILD_BUG_ON(sizeof(boot_sect) != 512);
1451         /* read MBR */
1452         if (lseek(fd, 0, 0) == -1L)
1453                 goto abort;
1454         if (read(fd, &boot_sect, 512) != 512)
1455                 goto abort;
1456
1457         /* check MBP signature */
1458         if (boot_sect.magic == MBR_SIGNATURE_MAGIC) {
1459                 retval = 1;
1460                 /* found the correct signature */
1461
1462                 for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) {
1463                         /*
1464                          * Have to make every access through boot_sect rather
1465                          * than using a pointer to the partition table (or an
1466                          * entry), since the entries are not properly aligned.
1467                          */
1468
1469                         /* check for GPT type */
1470                         if (boot_sect.parts[part_nr].part_type ==
1471                             MBR_GPT_PARTITION_TYPE) {
1472                                 retval = get_gpt_last_partition_end(fd, endofpart);
1473                                 break;
1474                         }
1475                         /* check the last used lba for the current partition  */
1476                         curr_part_end =
1477                                 __le32_to_cpu(boot_sect.parts[part_nr].first_sect_lba) +
1478                                 __le32_to_cpu(boot_sect.parts[part_nr].blocks_num);
1479                         if (curr_part_end > *endofpart)
1480                                 *endofpart = curr_part_end;
1481                 }
1482         } else {
1483                 /* Unknown partition table */
1484                 retval = -1;
1485         }
1486         /* calculate number of 512-byte blocks */
1487         if (get_dev_sector_size(fd, NULL, &sector_size))
1488                 *endofpart *= (sector_size / 512);
1489  abort:
1490         return retval;
1491 }
1492
/*
 * Warn if creating an array on the device open on @fd would damage an
 * existing partition table.
 *
 * @dname:    device name used in the messages.
 * @freesize: compared against the end of the last partition; 0 means
 *            the partitions will not be visible at all.
 * @size:     array size; ignored when 0.
 *
 * Returns 1 after printing a warning, 0 if no partition table was found
 * or nothing is at risk.
 */
int check_partitions(int fd, char *dname, unsigned long long freesize,
			unsigned long long size)
{
	unsigned long long last_end;

	/* No recognisable partition table: nothing to warn about */
	if (get_last_partition_end(fd, &last_end) <= 0)
		return 0;

	if (freesize == 0) {
		/* partitions will not be visible in new device */
		pr_err("partition table exists on %s but will be lost or\n"
		       "       meaningless after creating array\n",
		       dname);
		return 1;
	}

	if (last_end > freesize) {
		/* last partition overlaps metadata */
		pr_err("metadata will over-write last partition on %s.\n",
		       dname);
		return 1;
	}

	if (size && last_end > size) {
		/* partitions will be truncated in new device */
		pr_err("array size is too small to cover all partitions on %s.\n",
		       dname);
		return 1;
	}

	return 0;
}
1523
/*
 * 'fd' is a block device.  Find out if it is in use by a container, and
 * return an open fd on that container.
 *
 * Each entry of /sys/dev/block/<major>:<minor>/holders is examined: its
 * md/metadata_version must start with "external" and must not have '/'
 * at offset 9 (which marks a volume rather than a container).  The
 * holder's "dev" file then gives the device number to open.
 *
 * All paths are built with bounded formatting; a holder whose name would
 * not fit in the path buffer is skipped.
 *
 * Returns a read-only file descriptor, or -1 if no container is found.
 */
int open_container(int fd)
{
	char path[288];
	char *e;
	size_t room;
	DIR *dir;
	struct dirent *de;
	int dfd, n, len;
	char buf[200];
	int major, minor;
	struct stat st;

	if (fstat(fd, &st) != 0)
		return -1;
	len = snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/holders",
		       (int)major(st.st_rdev), (int)minor(st.st_rdev));
	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;
	/* per-holder suffixes are written at 'e', with 'room' bytes left */
	e = path + len;
	room = sizeof(path) - (size_t)len;

	dir = opendir(path);
	if (!dir)
		return -1;
	while ((de = readdir(dir))) {
		if (de->d_ino == 0)
			continue;
		if (de->d_name[0] == '.')
			continue;
		/* Need to make sure it is a container and not a volume */
		len = snprintf(e, room, "/%s/md/metadata_version", de->d_name);
		if (len < 0 || (size_t)len >= room)
			continue;
		dfd = open(path, O_RDONLY);
		if (dfd < 0)
			continue;
		n = read(dfd, buf, sizeof(buf));
		close(dfd);
		if (n <= 0 || (unsigned)n >= sizeof(buf))
			continue;
		buf[n] = 0;
		if (strncmp(buf, "external", 8) != 0 ||
		    n < 10 ||
		    buf[9] == '/')
			continue;
		len = snprintf(e, room, "/%s/dev", de->d_name);
		if (len < 0 || (size_t)len >= room)
			continue;
		dfd = open(path, O_RDONLY);
		if (dfd < 0)
			continue;
		n = read(dfd, buf, sizeof(buf));
		close(dfd);
		if (n <= 0 || (unsigned)n >= sizeof(buf))
			continue;
		buf[n] = 0;
		if (sscanf(buf, "%d:%d", &major, &minor) != 2)
			continue;
		snprintf(buf, sizeof(buf), "%d:%d", major, minor);
		dfd = dev_open(buf, O_RDONLY);
		if (dfd >= 0) {
			closedir(dir);
			return dfd;
		}
	}
	closedir(dir);
	return -1;
}
1587
1588 struct superswitch *version_to_superswitch(char *vers)
1589 {
1590         int i;
1591
1592         for (i = 0; superlist[i]; i++) {
1593                 struct superswitch *ss = superlist[i];
1594
1595                 if (strcmp(vers, ss->name) == 0)
1596                         return ss;
1597         }
1598
1599         return NULL;
1600 }
1601
/*
 * Check if 'devnm' is the container named in 'metadata', which has the
 * form
 *   /containername/componentname  or
 *   -containername/componentname
 *
 * Returns 1 when the text after the leading '/' or '-' is exactly
 * 'devnm' followed by '/', 0 otherwise.
 */
int metadata_container_matches(char *metadata, char *devnm)
{
	const char *name;
	int len;

	if (*metadata != '/' && *metadata != '-')
		return 0;

	name = metadata + 1;
	len = strlen(devnm);

	return strncmp(name, devnm, len) == 0 && name[len] == '/';
}
1619
/*
 * Check if 'devnm' is the subdev named in 'metadata', which has the form
 *   /containername/subdev  or
 *   -containername/subdev
 *
 * Returns 1 when everything after the first '/' that follows the leading
 * character equals 'devnm', 0 otherwise.
 */
int metadata_subdev_matches(char *metadata, char *devnm)
{
	const char *slash;

	if (*metadata != '/' && *metadata != '-')
		return 0;

	slash = strchr(metadata + 1, '/');

	return (slash != NULL && strcmp(slash + 1, devnm) == 0) ? 1 : 0;
}
1637
1638 int is_subarray_active(char *subarray, char *container)
1639 {
1640         struct mdstat_ent *mdstat = mdstat_read(0, 0);
1641         struct mdstat_ent *ent;
1642
1643         for (ent = mdstat; ent; ent = ent->next)
1644                 if (is_container_member(ent, container))
1645                         if (strcmp(to_subarray(ent, container), subarray) == 0)
1646                                 break;
1647
1648         free_mdstat(mdstat);
1649
1650         return ent != NULL;
1651 }
1652
1653 /* open_subarray - opens a subarray in a container
1654  * @dev: container device name
1655  * @st: empty supertype
1656  * @quiet: block reporting errors flag
1657  *
1658  * On success returns an fd to a container and fills in *st
1659  */
1660 int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet)
1661 {
1662         struct mdinfo *mdi;
1663         struct mdinfo *info;
1664         int fd, err = 1;
1665         char *_devnm;
1666
1667         fd = open(dev, O_RDWR|O_EXCL);
1668         if (fd < 0) {
1669                 if (!quiet)
1670                         pr_err("Couldn't open %s, aborting\n",
1671                                 dev);
1672                 return -1;
1673         }
1674
1675         _devnm = fd2devnm(fd);
1676         if (_devnm == NULL) {
1677                 if (!quiet)
1678                         pr_err("Failed to determine device number for %s\n",
1679                                dev);
1680                 goto close_fd;
1681         }
1682         snprintf(st->devnm, sizeof(st->devnm), "%s", _devnm);
1683
1684         mdi = sysfs_read(fd, st->devnm, GET_VERSION|GET_LEVEL);
1685         if (!mdi) {
1686                 if (!quiet)
1687                         pr_err("Failed to read sysfs for %s\n",
1688                                 dev);
1689                 goto close_fd;
1690         }
1691
1692         if (mdi->array.level != UnSet) {
1693                 if (!quiet)
1694                         pr_err("%s is not a container\n", dev);
1695                 goto free_sysfs;
1696         }
1697
1698         st->ss = version_to_superswitch(mdi->text_version);
1699         if (!st->ss) {
1700                 if (!quiet)
1701                         pr_err("Operation not supported for %s metadata\n",
1702                                mdi->text_version);
1703                 goto free_sysfs;
1704         }
1705
1706         if (st->devnm[0] == 0) {
1707                 if (!quiet)
1708                         pr_err("Failed to allocate device name\n");
1709                 goto free_sysfs;
1710         }
1711
1712         if (!st->ss->load_container) {
1713                 if (!quiet)
1714                         pr_err("%s is not a container\n", dev);
1715                 goto free_sysfs;
1716         }
1717
1718         if (st->ss->load_container(st, fd, NULL)) {
1719                 if (!quiet)
1720                         pr_err("Failed to load metadata for %s\n",
1721                                 dev);
1722                 goto free_sysfs;
1723         }
1724
1725         info = st->ss->container_content(st, subarray);
1726         if (!info) {
1727                 if (!quiet)
1728                         pr_err("Failed to find subarray-%s in %s\n",
1729                                 subarray, dev);
1730                 goto free_super;
1731         }
1732         free(info);
1733
1734         err = 0;
1735
1736  free_super:
1737         if (err)
1738                 st->ss->free_super(st);
1739  free_sysfs:
1740         sysfs_free(mdi);
1741  close_fd:
1742         if (err)
1743                 close(fd);
1744
1745         if (err)
1746                 return -1;
1747         else
1748                 return fd;
1749 }
1750
1751 int add_disk(int mdfd, struct supertype *st,
1752              struct mdinfo *sra, struct mdinfo *info)
1753 {
1754         /* Add a device to an array, in one of 2 ways. */
1755         int rv;
1756
1757         if (st->ss->external) {
1758                 if (info->disk.state & (1<<MD_DISK_SYNC))
1759                         info->recovery_start = MaxSector;
1760                 else
1761                         info->recovery_start = 0;
1762                 rv = sysfs_add_disk(sra, info, 0);
1763                 if (! rv) {
1764                         struct mdinfo *sd2;
1765                         for (sd2 = sra->devs; sd2; sd2=sd2->next)
1766                                 if (sd2 == info)
1767                                         break;
1768                         if (sd2 == NULL) {
1769                                 sd2 = xmalloc(sizeof(*sd2));
1770                                 *sd2 = *info;
1771                                 sd2->next = sra->devs;
1772                                 sra->devs = sd2;
1773                         }
1774                 }
1775         } else
1776                 rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk);
1777         return rv;
1778 }
1779
1780 int remove_disk(int mdfd, struct supertype *st,
1781                 struct mdinfo *sra, struct mdinfo *info)
1782 {
1783         int rv;
1784
1785         /* Remove the disk given by 'info' from the array */
1786         if (st->ss->external)
1787                 rv = sysfs_set_str(sra, info, "slot", STR_COMMON_NONE);
1788         else
1789                 rv = ioctl(mdfd, HOT_REMOVE_DISK, makedev(info->disk.major,
1790                                                           info->disk.minor));
1791         return rv;
1792 }
1793
1794 int hot_remove_disk(int mdfd, unsigned long dev, int force)
1795 {
1796         int cnt = force ? 500 : 5;
1797         int ret;
1798
1799         /* HOT_REMOVE_DISK can fail with EBUSY if there are
1800          * outstanding IO requests to the device.
1801          * In this case, it can be helpful to wait a little while,
1802          * up to 5 seconds if 'force' is set, or 50 msec if not.
1803          */
1804         while ((ret = ioctl(mdfd, HOT_REMOVE_DISK, dev)) == -1 &&
1805                errno == EBUSY &&
1806                cnt-- > 0)
1807                 sleep_for(0, MSEC_TO_NSEC(10), true);
1808
1809         return ret;
1810 }
1811
1812 int sys_hot_remove_disk(int statefd, int force)
1813 {
1814         int cnt = force ? 500 : 5;
1815
1816         while (cnt--) {
1817                 int err = 0;
1818                 int ret = sysfs_set_memb_state_fd(statefd, MEMB_STATE_REMOVE, &err);
1819
1820                 if (ret == MDADM_STATUS_SUCCESS)
1821                         return 0;
1822
1823                 if (err != EBUSY)
1824                         break;
1825
1826                 sleep_for(0, MSEC_TO_NSEC(10), true);
1827         }
1828
1829         return -1;
1830 }
1831
1832 int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
1833 {
1834         /* Initialise kernel's knowledge of array.
1835          * This varies between externally managed arrays
1836          * and older kernels
1837          */
1838         mdu_array_info_t inf;
1839         int rv;
1840
1841         if (st->ss->external)
1842                 return sysfs_set_array(info);
1843
1844         memset(&inf, 0, sizeof(inf));
1845         inf.major_version = info->array.major_version;
1846         inf.minor_version = info->array.minor_version;
1847         rv = md_set_array_info(mdfd, &inf);
1848
1849         return rv;
1850 }
1851
1852 unsigned long long min_recovery_start(struct mdinfo *array)
1853 {
1854         /* find the minimum recovery_start in an array for metadata
1855          * formats that only record per-array recovery progress instead
1856          * of per-device
1857          */
1858         unsigned long long recovery_start = MaxSector;
1859         struct mdinfo *d;
1860
1861         for (d = array->devs; d; d = d->next)
1862                 recovery_start = min(recovery_start, d->recovery_start);
1863
1864         return recovery_start;
1865 }
1866
1867 int mdmon_pid(const char *devnm)
1868 {
1869         char path[100];
1870         char pid[10];
1871         int fd;
1872         int n;
1873
1874         sprintf(path, "%s/%s.pid", MDMON_DIR, devnm);
1875
1876         fd = open(path, O_RDONLY | O_NOATIME, 0);
1877
1878         if (fd < 0)
1879                 return -1;
1880         n = read(fd, pid, 9);
1881         close(fd);
1882         if (n <= 0)
1883                 return -1;
1884         return atoi(pid);
1885 }
1886
/*
 * mdmon_running() - check whether the mdmon recorded for @devnm is alive.
 * @devnm: device name handed to mdmon_pid().
 *
 * Return: 1 if a positive pid was found and kill(pid, 0) succeeds,
 * 0 otherwise.
 */
int mdmon_running(const char *devnm)
{
	int pid = mdmon_pid(devnm);

	/* Signal 0 delivers nothing, it only probes the process. */
	return pid > 0 && kill(pid, 0) == 0;
}
1896
1897 /*
1898  * wait_for_mdmon_control_socket() - Waits for mdmon control socket
1899  * to be created within specified time.
1900  * @container_devnm: Device for which mdmon control socket should start.
1901  *
1902  * In foreground mode, when mdadm is trying to connect to control
1903  * socket it is possible that the mdmon has not created it yet.
1904  * Give some time to mdmon to create socket. Timeout set to 2 sec.
1905  *
1906  * Return: MDADM_STATUS_SUCCESS if connect succeed, otherwise return
1907  * error code.
1908  */
1909 mdadm_status_t wait_for_mdmon_control_socket(const char *container_devnm)
1910 {
1911         enum mdadm_status status = MDADM_STATUS_SUCCESS;
1912         int sfd, rv, retry_count = 0;
1913         struct sockaddr_un addr;
1914         char path[PATH_MAX];
1915
1916         snprintf(path, PATH_MAX, "%s/%s.sock", MDMON_DIR, container_devnm);
1917         sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
1918         if (!is_fd_valid(sfd))
1919                 return MDADM_STATUS_ERROR;
1920
1921         addr.sun_family = PF_LOCAL;
1922         strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
1923         addr.sun_path[sizeof(addr.sun_path) - 1] = '\0';
1924
1925         for (retry_count = 0; retry_count < 10; retry_count++) {
1926                 rv = connect(sfd, (struct sockaddr*)&addr, sizeof(addr));
1927                 if (rv < 0) {
1928                         sleep_for(0, MSEC_TO_NSEC(200), true);
1929                         continue;
1930                 }
1931                 break;
1932         }
1933
1934         if (rv < 0) {
1935                 pr_err("Failed to connect to control socket.\n");
1936                 status = MDADM_STATUS_ERROR;
1937         }
1938         close(sfd);
1939         return status;
1940 }
1941
1942 /*
1943  * wait_for_mdmon() - Waits for mdmon within specified time.
1944  * @devnm: Device for which mdmon should start.
1945  *
1946  * Function waits for mdmon to start. It may need few seconds
1947  * to start, we set timeout to 5, it should be sufficient.
1948  * Do not wait if mdmon has been started.
1949  *
1950  * Return: MDADM_STATUS_SUCCESS if mdmon is running, error code otherwise.
1951  */
1952 mdadm_status_t wait_for_mdmon(const char *devnm)
1953 {
1954         const time_t mdmon_timeout = 5;
1955         time_t start_time = time(0);
1956
1957         if (mdmon_running(devnm))
1958                 return MDADM_STATUS_SUCCESS;
1959
1960         pr_info("Waiting for mdmon to start\n");
1961         while (time(0) - start_time < mdmon_timeout) {
1962                 sleep_for(0, MSEC_TO_NSEC(200), true);
1963                 if (mdmon_running(devnm))
1964                         return MDADM_STATUS_SUCCESS;
1965         };
1966
1967         pr_err("Timeout waiting for mdmon\n");
1968         return MDADM_STATUS_ERROR;
1969 }
1970
1971 int start_mdmon(char *devnm)
1972 {
1973         int i;
1974         int len;
1975         pid_t pid;
1976         int status;
1977         char *prefix = in_initrd() ? "initrd-" : "";
1978         char pathbuf[1024];
1979         char *paths[4] = {
1980                 pathbuf,
1981                 BINDIR "/mdmon",
1982                 "./mdmon",
1983                 NULL
1984         };
1985
1986         if (check_env("MDADM_NO_MDMON"))
1987                 return 0;
1988         if (continue_via_systemd(devnm, MDMON_SERVICE, prefix) == MDADM_STATUS_SUCCESS)
1989                 return 0;
1990
1991         /* That failed, try running mdmon directly */
1992         len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)-1);
1993         if (len > 0) {
1994                 char *sl;
1995                 pathbuf[len] = 0;
1996                 sl = strrchr(pathbuf, '/');
1997                 if (sl)
1998                         sl++;
1999                 else
2000                         sl = pathbuf;
2001                 strcpy(sl, "mdmon");
2002         } else
2003                 pathbuf[0] = '\0';
2004
2005         switch(fork()) {
2006         case 0:
2007                 manage_fork_fds(1);
2008                 for (i = 0; paths[i]; i++)
2009                         if (paths[i][0]) {
2010                                 execl(paths[i], paths[i],
2011                                       devnm, NULL);
2012                         }
2013                 exit(1);
2014         case -1: pr_err("cannot run mdmon. Array remains readonly\n");
2015                 return -1;
2016         default: /* parent - good */
2017                 pid = wait(&status);
2018                 if (pid < 0 || status != 0) {
2019                         pr_err("failed to launch mdmon. Array remains readonly\n");
2020                         return -1;
2021                 }
2022         }
2023         return 0;
2024 }
2025
/*
 * random32() - produce a 32-bit random value.
 *
 * Four bytes are read from /dev/urandom; if the device cannot be opened or
 * the read does not deliver 4 bytes, random() is used instead.
 *
 * Return: the random value.
 */
__u32 random32(void)
{
	__u32 value;
	int fd = open("/dev/urandom", O_RDONLY);

	if (fd < 0)
		return random();

	if (read(fd, &value, 4) != 4)
		value = random();
	close(fd);

	return value;
}
2036
/*
 * random_uuid() - fill a 16-byte buffer with random data.
 * @buf: destination, 16 bytes are written.
 *
 * The bytes come from /dev/urandom. If the device cannot be opened or does
 * not deliver all 16 bytes, the buffer is filled from four random() calls
 * instead.
 */
void random_uuid(__u8 *buf)
{
	__u32 words[4];
	int fd = open("/dev/urandom", O_RDONLY);
	int i;

	if (fd >= 0) {
		int got = read(fd, buf, 16);

		close(fd);
		if (got == 16)
			return;
	}

	/* Fallback path */
	for (i = 0; i < 4; i++)
		words[i] = random();
	memcpy(buf, words, 16);
}
2057
2058 int flush_metadata_updates(struct supertype *st)
2059 {
2060         int sfd;
2061         if (!st->updates) {
2062                 st->update_tail = NULL;
2063                 return -1;
2064         }
2065
2066         sfd = connect_monitor(st->container_devnm);
2067         if (sfd < 0)
2068                 return -1;
2069
2070         while (st->updates) {
2071                 struct metadata_update *mu = st->updates;
2072                 st->updates = mu->next;
2073
2074                 send_message(sfd, mu, 0);
2075                 wait_reply(sfd, 0);
2076                 free(mu->buf);
2077                 free(mu);
2078         }
2079         ack(sfd, 0);
2080         wait_reply(sfd, 0);
2081         close(sfd);
2082         st->update_tail = NULL;
2083         return 0;
2084 }
2085
2086 void append_metadata_update(struct supertype *st, void *buf, int len)
2087 {
2088
2089         struct metadata_update *mu = xmalloc(sizeof(*mu));
2090
2091         mu->buf = buf;
2092         mu->len = len;
2093         mu->space = NULL;
2094         mu->space_list = NULL;
2095         mu->next = NULL;
2096         *st->update_tail = mu;
2097         st->update_tail = &mu->next;
2098 }
2099
#ifdef __TINYC__
/* tinyc doesn't optimize out the size check in ioctl.h, so a reference to
 * this symbol survives compilation; presumably it is defined here so the
 * link still succeeds (NOTE(review): confirm against ioctl.h).
 */
unsigned int __invalid_size_argument_for_IOC = 0;
#endif
2104
2105 /**
2106  * disk_fd_matches_criteria() - check if device matches spare criteria.
2107  * @st: supertype, not NULL.
2108  * @disk_fd: file descriptor of the disk.
2109  * @sc: criteria to test.
2110  *
2111  * Return: true if disk matches criteria, false otherwise.
2112  */
2113 bool disk_fd_matches_criteria(struct supertype *st, int disk_fd, struct spare_criteria *sc)
2114 {
2115         unsigned int dev_sector_size = 0;
2116         unsigned long long dev_size = 0;
2117
2118         if (!sc->criteria_set)
2119                 return true;
2120
2121         if (!get_dev_size(disk_fd, NULL, &dev_size) || dev_size < sc->min_size)
2122                 return false;
2123
2124         if (!get_dev_sector_size(disk_fd, NULL, &dev_sector_size) ||
2125             sc->sector_size != dev_sector_size)
2126                 return false;
2127
2128         if (drive_test_and_add_policies(st, &sc->pols, disk_fd, 0))
2129                 return false;
2130
2131         return true;
2132 }
2133
2134 /**
2135  * devid_matches_criteria() - check if device referenced by devid matches spare criteria.
2136  * @st: supertype, not NULL.
2137  * @devid: devid of the device to check.
2138  * @sc: criteria to test.
2139  *
2140  * Return: true if disk matches criteria, false otherwise.
2141  */
2142 bool devid_matches_criteria(struct supertype *st, dev_t devid, struct spare_criteria *sc)
2143 {
2144         char buf[NAME_MAX];
2145         bool ret;
2146         int fd;
2147
2148         if (!sc->criteria_set)
2149                 return true;
2150
2151         snprintf(buf, NAME_MAX, "%d:%d", major(devid), minor(devid));
2152
2153         fd = dev_open(buf, O_RDONLY);
2154         if (!is_fd_valid(fd))
2155                 return false;
2156
2157         /* Error code inherited */
2158         ret = disk_fd_matches_criteria(st, fd, sc);
2159
2160         close(fd);
2161         return ret;
2162 }
2163
2164 /* Pick all spares matching given criteria from a container
2165  * if min_size == 0 do not check size
2166  * if domlist == NULL do not check domains
2167  * if spare_group given add it to domains of each spare
2168  * metadata allows to test domains using metadata of destination array */
2169 struct mdinfo *container_choose_spares(struct supertype *st,
2170                                        struct spare_criteria *criteria,
2171                                        struct domainlist *domlist,
2172                                        char *spare_group,
2173                                        const char *metadata, int get_one)
2174 {
2175         struct mdinfo *d, **dp, *disks = NULL;
2176
2177         /* get list of all disks in container */
2178         if (st->ss->getinfo_super_disks)
2179                 disks = st->ss->getinfo_super_disks(st);
2180
2181         if (!disks)
2182                 return disks;
2183         /* find spare devices on the list */
2184         dp = &disks->devs;
2185         disks->array.spare_disks = 0;
2186         while (*dp) {
2187                 bool found = false;
2188
2189                 d = *dp;
2190                 if (d->disk.state == 0) {
2191                         dev_t dev = makedev(d->disk.major,d->disk.minor);
2192
2193                         found = devid_matches_criteria(st, dev, criteria);
2194
2195                         /* check if domain matches */
2196                         if (found && domlist) {
2197                                 struct dev_policy *pol = devid_policy(dev);
2198                                 if (spare_group)
2199                                         pol_add(&pol, pol_domain,
2200                                                 spare_group, NULL);
2201                                 if (domain_test(domlist, pol, metadata) != 1)
2202                                         found = false;
2203
2204                                 dev_policy_free(pol);
2205                         }
2206                 }
2207                 if (found) {
2208                         dp = &d->next;
2209                         disks->array.spare_disks++;
2210                         if (get_one) {
2211                                 sysfs_free(*dp);
2212                                 d->next = NULL;
2213                         }
2214                 } else {
2215                         *dp = d->next;
2216                         d->next = NULL;
2217                         sysfs_free(d);
2218                 }
2219         }
2220         return disks;
2221 }
2222
/* Checks whether two paths refer to the same file (same device and inode
 * as reported by stat()).
 * Returns 0 if they do.
 * Returns 1 if they don't.
 * Returns -1 if something went wrong,
 * e.g. a path is NULL or cannot be stat()ed */
int compare_paths (char* path1, char* path2)
{
	struct stat first, second;

	if (!path1 || !path2)
		return -1;

	if (stat(path1, &first) != 0 || stat(path2, &second) != 0)
		return -1;

	if (first.st_dev != second.st_dev || first.st_ino != second.st_ino)
		return 1;

	return 0;
}
2243
/* Make sure we can open as many devices as needed: raise the soft
 * RLIMIT_NOFILE limit to 20 + devices when it is currently lower, bumping
 * the hard limit too if it is below that value. Failures are ignored. */
void enable_fds(int devices)
{
	struct rlimit lim;
	unsigned int wanted = 20 + devices;

	if (getrlimit(RLIMIT_NOFILE, &lim) != 0)
		return;

	/* Already enough descriptors available. */
	if (lim.rlim_cur >= wanted)
		return;

	lim.rlim_cur = wanted;
	if (lim.rlim_max < wanted)
		lim.rlim_max = wanted;

	setrlimit(RLIMIT_NOFILE, &lim);
}
2256
/* Redirect the standard streams to /dev/null and, if requested, close all
 * other open descriptors.
 * For debug purposes (DEBUG defined), STDOUT and STDERR are left untouched.
 *
 * @close_all: when non-zero, every descriptor above 2 listed in
 *	/proc/self/fd is closed.
 *
 * Nothing is returned; if /dev/null or /proc/self/fd cannot be opened the
 * corresponding step is skipped (the latter with an error message).
 */
void manage_fork_fds(int close_all)
{
	DIR *dir;
	struct dirent *dirent;
	int dir_fd;
	int fd = open("/dev/null", O_RDWR);

	if (is_fd_valid(fd)) {
		dup2(fd, 0);
#ifndef DEBUG
		dup2(0, 1);
		dup2(0, 2);
		/* If a standard stream was closed on entry, open() handed
		 * out 0, 1 or 2 itself; closing it would undo the redirect.
		 */
		if (fd > 2)
			close_fd(&fd);
#endif
	}

	if (close_all == 0)
		return;

	dir = opendir("/proc/self/fd");
	if (!dir) {
		pr_err("Cannot open /proc/self/fd directory.\n");
		return;
	}

	/* The directory stream owns a descriptor that shows up in the
	 * listing as well; it must stay open until closedir().
	 */
	dir_fd = dirfd(dir);

	for (dirent = readdir(dir); dirent; dirent = readdir(dir)) {
		int open_fd = -1;

		if ((strcmp(dirent->d_name, ".") == 0) ||
		    (strcmp(dirent->d_name, "..")) == 0)
			continue;

		open_fd = strtol(dirent->d_name, NULL, 10);
		if (open_fd > 2 && open_fd != dir_fd)
			close_fd(&open_fd);
	}
	closedir(dir);
}
2301
2302 /* In a systemd/udev world, it is best to get systemd to
2303  * run daemon rather than running in the background.
2304  * Returns:
2305  *      MDADM_STATUS_SUCCESS - if systemd service has been started.
2306  *      MDADM_STATUS_ERROR - otherwise.
2307  */
2308 mdadm_status_t continue_via_systemd(char *devnm, char *service_name, char *prefix)
2309 {
2310         int pid, status;
2311         char pathbuf[PATH_MAX];
2312
2313         dprintf("Start %s service\n", service_name);
2314         /* Simply return that service cannot be started */
2315         if (check_env("MDADM_NO_SYSTEMCTL"))
2316                 return MDADM_STATUS_ERROR;
2317
2318         /* Fork in attempt to start services */
2319         switch (fork()) {
2320         case -1: /* Fork failed, just do it ourselves. */
2321                 break;
2322         case  0: /* child */
2323                 manage_fork_fds(1);
2324                 snprintf(pathbuf, sizeof(pathbuf), "%s@%s%s.service",
2325                          service_name, prefix ? prefix : "", devnm);
2326
2327                 /* Attempt to start service.
2328                  * On success execl() will "kill" the fork, and return status of systemctl call.
2329                  */
2330                 execl("/usr/bin/systemctl", "systemctl", "restart", pathbuf, NULL);
2331                 execl("/bin/systemctl", "systemctl", "restart", pathbuf, NULL);
2332                 exit(MDADM_STATUS_ERROR);
2333         default: /* parent */
2334                 /* Check if forked process successfully trigered service */
2335                 pid = wait(&status);
2336                 if (pid >= 0 && status == 0)
2337                         return MDADM_STATUS_SUCCESS;
2338         }
2339         return MDADM_STATUS_ERROR;
2340 }
2341
/*
 * in_initrd() - report whether /etc/initrd-release exists.
 *
 * Return: 1 if the file is present, 0 otherwise.
 */
int in_initrd(void)
{
	/* access() yields 0 on success and -1 on failure. */
	if (access("/etc/initrd-release", F_OK) < 0)
		return 0;

	return 1;
}
2346
/*
 * reopen_mddev() - re-open an md device without any O_EXCL while keeping
 * the same descriptor number.
 * @mdfd: descriptor to replace.
 *
 * The device is opened again by name and duplicated onto @mdfd; the
 * temporary descriptor is closed afterwards. Nothing happens if the device
 * cannot be opened.
 */
void reopen_mddev(int mdfd)
{
	int fd = open_dev(fd2devnm(mdfd));

	if (is_fd_valid(fd)) {
		dup2(fd, mdfd);
		close_fd(&fd);
	}
}
2362
/* Table of libcmap entry points, allocated and filled by set_cmap_hooks(). */
static struct cmap_hooks *cmap_hooks = NULL;
/* Set to 1 by set_cmap_hooks() once every entry point has been resolved;
 * get_cluster_name() refuses to run while it is 0.
 */
static int is_cmap_hooks_ready = 0;
2365
2366 void set_cmap_hooks(void)
2367 {
2368         cmap_hooks = xmalloc(sizeof(struct cmap_hooks));
2369         cmap_hooks->cmap_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL);
2370         if (!cmap_hooks->cmap_handle)
2371                 return;
2372
2373         cmap_hooks->initialize =
2374                 dlsym(cmap_hooks->cmap_handle, "cmap_initialize");
2375         cmap_hooks->get_string =
2376                 dlsym(cmap_hooks->cmap_handle, "cmap_get_string");
2377         cmap_hooks->finalize = dlsym(cmap_hooks->cmap_handle, "cmap_finalize");
2378
2379         if (!cmap_hooks->initialize || !cmap_hooks->get_string ||
2380             !cmap_hooks->finalize)
2381                 dlclose(cmap_hooks->cmap_handle);
2382         else
2383                 is_cmap_hooks_ready = 1;
2384 }
2385
2386 int get_cluster_name(char **cluster_name)
2387 {
2388         int rv = -1;
2389         cmap_handle_t handle;
2390
2391         if (!is_cmap_hooks_ready)
2392                 return rv;
2393
2394         rv = cmap_hooks->initialize(&handle);
2395         if (rv != CS_OK)
2396                 goto out;
2397
2398         rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name);
2399         if (rv != CS_OK) {
2400                 free(*cluster_name);
2401                 rv = -1;
2402                 goto name_err;
2403         }
2404
2405         rv = 0;
2406 name_err:
2407         cmap_hooks->finalize(handle);
2408 out:
2409         return rv;
2410 }
2411
2412 void set_dlm_hooks(void)
2413 {
2414         dlm_hooks = xmalloc(sizeof(struct dlm_hooks));
2415         dlm_hooks->dlm_handle = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL);
2416         if (!dlm_hooks->dlm_handle)
2417                 return;
2418
2419         dlm_hooks->open_lockspace =
2420                 dlsym(dlm_hooks->dlm_handle, "dlm_open_lockspace");
2421         dlm_hooks->create_lockspace =
2422                 dlsym(dlm_hooks->dlm_handle, "dlm_create_lockspace");
2423         dlm_hooks->release_lockspace =
2424                 dlsym(dlm_hooks->dlm_handle, "dlm_release_lockspace");
2425         dlm_hooks->ls_lock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_lock");
2426         dlm_hooks->ls_unlock_wait =
2427                 dlsym(dlm_hooks->dlm_handle, "dlm_ls_unlock_wait");
2428         dlm_hooks->ls_get_fd = dlsym(dlm_hooks->dlm_handle, "dlm_ls_get_fd");
2429         dlm_hooks->dispatch = dlsym(dlm_hooks->dlm_handle, "dlm_dispatch");
2430
2431         if (!dlm_hooks->open_lockspace || !dlm_hooks->create_lockspace ||
2432             !dlm_hooks->ls_lock || !dlm_hooks->ls_unlock_wait ||
2433             !dlm_hooks->release_lockspace || !dlm_hooks->ls_get_fd ||
2434             !dlm_hooks->dispatch)
2435                 dlclose(dlm_hooks->dlm_handle);
2436         else
2437                 is_dlm_hooks_ready = 1;
2438 }
2439
/*
 * set_hooks() - set up the dynamically loaded library hooks: the DLM hooks
 * first, then the cmap hooks.
 */
void set_hooks(void)
{
	set_dlm_hooks();

	set_cmap_hooks();
}
2445
/*
 * zero_disk_range() - overwrite a range of 512-byte sectors with zeros.
 * @fd: descriptor of the device to write to; its file offset is moved.
 * @sector: first sector of the range.
 * @count: number of sectors to zero.
 *
 * The zeros come from a read-only private mapping of /dev/zero covering the
 * whole range, which is written out with write(); writes interrupted by a
 * signal and short writes are continued.
 *
 * Return: 0 on success, -1 if /dev/zero cannot be opened, -EINVAL if the
 * byte length does not fit in size_t, -EIO if write() makes no progress,
 * otherwise the negated errno of the failing seek, mmap or write.
 */
int zero_disk_range(int fd, unsigned long long sector, size_t count)
{
	int ret = 0;
	int fd_zero;
	char *addr;
	size_t written = 0;
	size_t len;
	ssize_t n;

	/* count * 512 must not wrap, or only part of the range is zeroed. */
	if (count > (size_t)-1 / 512) {
		pr_err("Range too large for zeroing\n");
		return -EINVAL;
	}
	len = count * 512;

	fd_zero = open("/dev/zero", O_RDONLY);
	if (fd_zero < 0) {
		pr_err("Cannot open /dev/zero\n");
		return -1;
	}

	if (lseek(fd, sector * 512, SEEK_SET) < 0) {
		ret = -errno;
		pr_err("Failed to seek offset for zeroing\n");
		goto out;
	}

	addr = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd_zero, 0);

	if (addr == MAP_FAILED) {
		ret = -errno;
		pr_err("Mapping /dev/zero failed\n");
		goto out;
	}

	while (written < len) {
		n = write(fd, addr + written, len - written);
		if (n < 0) {
			if (errno == EINTR)
				continue;
			ret = -errno;
			pr_err("Zeroing disk range failed\n");
			break;
		}
		if (n == 0) {
			/* No progress: give up instead of spinning. */
			ret = -EIO;
			pr_err("Zeroing disk range failed\n");
			break;
		}
		written += n;
	}

	munmap(addr, len);

out:
	close(fd_zero);
	return ret;
}
2493
/**
 * sleep_for() - Sleeps for specified time.
 * @sec: Seconds to sleep for.
 * @nsec: Nanoseconds to sleep for, has to be less than one second.
 * @wake_after_interrupt: If set, return as soon as the sleep is interrupted;
 *			  if not set, resume sleeping for the remaining time.
 *
 * Function immediately returns (after reporting it) if an error different
 * than EINTR occurs.
 */
void sleep_for(unsigned int sec, long nsec, bool wake_after_interrupt)
{
	struct timespec remaining = {.tv_sec = sec, .tv_nsec = nsec};

	assert(nsec < MSEC_TO_NSEC(1000));

	for (;;) {
		errno = 0;
		/* On interruption the unslept time is written back, so the
		 * next round continues where this one stopped.
		 */
		nanosleep(&remaining, &remaining);

		if (errno == 0)
			return;

		if (errno != EINTR) {
			pr_err("Error sleeping for %us %ldns: %s\n", sec, nsec, strerror(errno));
			return;
		}

		if (wake_after_interrupt)
			return;
	}
}
2517
/* is_directory() - Checks if the given path is a real directory.
 * @path: directory path to be checked
 *
 * lstat() is used, so a symlink (even one pointing to a directory) is not
 * accepted. A failing lstat() is reported with pr_err().
 *
 * Return: true if is a directory, false if not
 */
bool is_directory(const char *path)
{
	struct stat info;

	if (lstat(path, &info) == 0)
		return S_ISDIR(info.st_mode) ? true : false;

	pr_err("%s: %s\n", strerror(errno), path);
	return false;
}
2539
/*
 * is_file() - Checks if the given path is a regular file.
 * @path: file path to be checked
 *
 * lstat() is used, so a symlink (even one pointing to a regular file) is
 * not accepted. A failing lstat() is reported with pr_err().
 *
 * Return: true if is a file, false if not
 */
bool is_file(const char *path)
{
	struct stat info;

	if (lstat(path, &info) == 0)
		return S_ISREG(info.st_mode) ? true : false;

	pr_err("%s: %s\n", strerror(errno), path);
	return false;
}
2562
/*
 * set_md_mod_parameter() - write a value to an md_mod module parameter.
 * @name: parameter name below /sys/module/md_mod/parameters/.
 * @value: string to write.
 *
 * Return: true if the whole value was written, false if the file could not
 * be opened or the write was short or failed (both are reported).
 */
bool set_md_mod_parameter(const char *name, const char *value)
{
	char path[256];
	size_t value_len;
	bool written;
	int fd;

	snprintf(path, sizeof(path), "/sys/module/md_mod/parameters/%s", name);

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		pr_err("Can't open %s\n", path);
		return false;
	}

	value_len = strlen(value);
	written = write(fd, value, value_len) == (ssize_t)value_len;
	if (!written)
		pr_err("Failed to write to %s\n", path);

	close(fd);
	return written;
}
2585
2586 /* Init kernel md_mod parameters here if needed */
2587 bool init_md_mod_param(void)
2588 {
2589         bool ret = true;
2590
2591         /*
2592          * In kernel 9e59d609763f calls del_gendisk in sync way. So device
2593          * node can be removed after stop command. But it can introduce a
2594          * regression which can be fixed by github pr182. New mdadm version
2595          * with pr182 can work well with new kernel. But users who don't
2596          * update mdadm and update to new kernel, they can't assemble array
2597          * anymore. So kernel adds a kernel parameter legacy_async_del_gendisk
2598          * and uses async as default.
2599          * We'll use sync mode since 6.18 rather than async mode. So in future
2600          * the kernel parameter will be removed.
2601          */
2602         if (get_linux_version() >= 6018000)
2603                 ret = set_md_mod_parameter(MD_MOD_ASYNC_DEL_GENDISK, "N");
2604
2605         return ret;
2606 }