drivers/block/drbd/drbd_receiver.c

   1 /*
   2    drbd_receiver.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25
  26 #include <linux/module.h>
  27
  28 #include <linux/uaccess.h>
  29 #include <net/sock.h>
  30
  31 #include <linux/drbd.h>
  32 #include <linux/fs.h>
  33 #include <linux/file.h>
  34 #include <linux/in.h>
  35 #include <linux/mm.h>
  36 #include <linux/memcontrol.h>
  37 #include <linux/mm_inline.h>
  38 #include <linux/slab.h>
  39 #include <uapi/linux/sched/types.h>
  40 #include <linux/sched/signal.h>
  41 #include <linux/pkt_sched.h>
  42 #define __KERNEL_SYSCALLS__
  43 #include <linux/unistd.h>
  44 #include <linux/vmalloc.h>
  45 #include <linux/random.h>
  46 #include <linux/string.h>
  47 #include <linux/scatterlist.h>
  48 #include "drbd_int.h"
  49 #include "drbd_protocol.h"
  50 #include "drbd_req.h"
  51 #include "drbd_vli.h"
  52
  53 #define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
  54
  55 struct packet_info {
  56         enum drbd_packet cmd;
  57         unsigned int size;
  58         unsigned int vnr;
  59         void *data;
  60 };
  61
  62 enum finish_epoch {
  63         FE_STILL_LIVE,
  64         FE_DESTROYED,
  65         FE_RECYCLED,
  66 };
  67
  68 static int drbd_do_features(struct drbd_connection *connection);
  69 static int drbd_do_auth(struct drbd_connection *connection);
  70 static int drbd_disconnected(struct drbd_peer_device *);
  71 static void conn_wait_active_ee_empty(struct drbd_connection *connection);
  72 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
  73 static int e_end_block(struct drbd_work *, int);
  74
  75
  76 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  77
/*
 * Some helper functions to deal with singly linked page lists,
 * page->private being our "next" pointer.
 */
  82
  83 /* If at least n pages are linked at head, get n pages off.
  84  * Otherwise, don't modify head, and return NULL.
  85  * Locking is the responsibility of the caller.
  86  */
  87 static struct page *page_chain_del(struct page **head, int n)
  88 {
  89         struct page *page;
  90         struct page *tmp;
  91
  92         BUG_ON(!n);
  93         BUG_ON(!head);
  94
  95         page = *head;
  96
  97         if (!page)
  98                 return NULL;
  99
 100         while (page) {
 101                 tmp = page_chain_next(page);
 102                 if (--n == 0)
 103                         break; /* found sufficient pages */
 104                 if (tmp == NULL)
 105                         /* insufficient pages, don't use any of them. */
 106                         return NULL;
 107                 page = tmp;
 108         }
 109
 110         /* add end of list marker for the returned list */
 111         set_page_private(page, 0);
 112         /* actual return value, and adjustment of head */
 113         page = *head;
 114         *head = tmp;
 115         return page;
 116 }
 117
 118 /* may be used outside of locks to find the tail of a (usually short)
 119  * "private" page chain, before adding it back to a global chain head
 120  * with page_chain_add() under a spinlock. */
 121 static struct page *page_chain_tail(struct page *page, int *len)
 122 {
 123         struct page *tmp;
 124         int i = 1;
 125         while ((tmp = page_chain_next(page)))
 126                 ++i, page = tmp;
 127         if (len)
 128                 *len = i;
 129         return page;
 130 }
 131
 132 static int page_chain_free(struct page *page)
 133 {
 134         struct page *tmp;
 135         int i = 0;
 136         page_chain_for_each_safe(page, tmp) {
 137                 put_page(page);
 138                 ++i;
 139         }
 140         return i;
 141 }
 142
 143 static void page_chain_add(struct page **head,
 144                 struct page *chain_first, struct page *chain_last)
 145 {
 146 #if 1
 147         struct page *tmp;
 148         tmp = page_chain_tail(chain_first, NULL);
 149         BUG_ON(tmp != chain_last);
 150 #endif
 151
 152         /* add chain to head */
 153         set_page_private(chain_last, (unsigned long)*head);
 154         *head = chain_first;
 155 }
 156
 157 static struct page *__drbd_alloc_pages(struct drbd_device *device,
 158                                        unsigned int number)
 159 {
 160         struct page *page = NULL;
 161         struct page *tmp = NULL;
 162         unsigned int i = 0;
 163
 164         /* Yes, testing drbd_pp_vacant outside the lock is racy.
 165          * So what. It saves a spin_lock. */
 166         if (drbd_pp_vacant >= number) {
 167                 spin_lock(&drbd_pp_lock);
 168                 page = page_chain_del(&drbd_pp_pool, number);
 169                 if (page)
 170                         drbd_pp_vacant -= number;
 171                 spin_unlock(&drbd_pp_lock);
 172                 if (page)
 173                         return page;
 174         }
 175
 176         /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
 177          * "criss-cross" setup, that might cause write-out on some other DRBD,
 178          * which in turn might block on the other node at this very place.  */
 179         for (i = 0; i < number; i++) {
 180                 tmp = alloc_page(GFP_TRY);
 181                 if (!tmp)
 182                         break;
 183                 set_page_private(tmp, (unsigned long)page);
 184                 page = tmp;
 185         }
 186
 187         if (i == number)
 188                 return page;
 189
 190         /* Not enough pages immediately available this time.
 191          * No need to jump around here, drbd_alloc_pages will retry this
 192          * function "soon". */
 193         if (page) {
 194                 tmp = page_chain_tail(page, NULL);
 195                 spin_lock(&drbd_pp_lock);
 196                 page_chain_add(&drbd_pp_pool, page, tmp);
 197                 drbd_pp_vacant += i;
 198                 spin_unlock(&drbd_pp_lock);
 199         }
 200         return NULL;
 201 }
 202
 203 static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
 204                                            struct list_head *to_be_freed)
 205 {
 206         struct drbd_peer_request *peer_req, *tmp;
 207
 208         /* The EEs are always appended to the end of the list. Since
 209            they are sent in order over the wire, they have to finish
 210            in order. As soon as we see the first not finished we can
 211            stop to examine the list... */
 212
 213         list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
 214                 if (drbd_peer_req_has_active_page(peer_req))
 215                         break;
 216                 list_move(&peer_req->w.list, to_be_freed);
 217         }
 218 }
 219
 220 static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
 221 {
 222         LIST_HEAD(reclaimed);
 223         struct drbd_peer_request *peer_req, *t;
 224
 225         spin_lock_irq(&device->resource->req_lock);
 226         reclaim_finished_net_peer_reqs(device, &reclaimed);
 227         spin_unlock_irq(&device->resource->req_lock);
 228         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 229                 drbd_free_net_peer_req(device, peer_req);
 230 }
 231
 232 static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
 233 {
 234         struct drbd_peer_device *peer_device;
 235         int vnr;
 236
 237         rcu_read_lock();
 238         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 239                 struct drbd_device *device = peer_device->device;
 240                 if (!atomic_read(&device->pp_in_use_by_net))
 241                         continue;
 242
 243                 kref_get(&device->kref);
 244                 rcu_read_unlock();
 245                 drbd_reclaim_net_peer_reqs(device);
 246                 kref_put(&device->kref, drbd_destroy_device);
 247                 rcu_read_lock();
 248         }
 249         rcu_read_unlock();
 250 }
 251
 252 /**
 253  * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
 254  * @device:     DRBD device.
 255  * @number:     number of pages requested
 256  * @retry:      whether to retry, if not enough pages are available right now
 257  *
 258  * Tries to allocate number pages, first from our own page pool, then from
 259  * the kernel.
 260  * Possibly retry until DRBD frees sufficient pages somewhere else.
 261  *
 262  * If this allocation would exceed the max_buffers setting, we throttle
 263  * allocation (schedule_timeout) to give the system some room to breathe.
 264  *
 265  * We do not use max-buffers as hard limit, because it could lead to
 266  * congestion and further to a distributed deadlock during online-verify or
 267  * (checksum based) resync, if the max-buffers, socket buffer sizes and
 268  * resync-rate settings are mis-configured.
 269  *
 270  * Returns a page chain linked via page->private.
 271  */
 272 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
 273                               bool retry)
 274 {
 275         struct drbd_device *device = peer_device->device;
 276         struct page *page = NULL;
 277         struct net_conf *nc;
 278         DEFINE_WAIT(wait);
 279         unsigned int mxb;
 280
 281         rcu_read_lock();
 282         nc = rcu_dereference(peer_device->connection->net_conf);
 283         mxb = nc ? nc->max_buffers : 1000000;
 284         rcu_read_unlock();
 285
 286         if (atomic_read(&device->pp_in_use) < mxb)
 287                 page = __drbd_alloc_pages(device, number);
 288
 289         /* Try to keep the fast path fast, but occasionally we need
 290          * to reclaim the pages we lended to the network stack. */
 291         if (page && atomic_read(&device->pp_in_use_by_net) > 512)
 292                 drbd_reclaim_net_peer_reqs(device);
 293
 294         while (page == NULL) {
 295                 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 296
 297                 drbd_reclaim_net_peer_reqs(device);
 298
 299                 if (atomic_read(&device->pp_in_use) < mxb) {
 300                         page = __drbd_alloc_pages(device, number);
 301                         if (page)
 302                                 break;
 303                 }
 304
 305                 if (!retry)
 306                         break;
 307
 308                 if (signal_pending(current)) {
 309                         drbd_warn(device, "drbd_alloc_pages interrupted!\n");
 310                         break;
 311                 }
 312
 313                 if (schedule_timeout(HZ/10) == 0)
 314                         mxb = UINT_MAX;
 315         }
 316         finish_wait(&drbd_pp_wait, &wait);
 317
 318         if (page)
 319                 atomic_add(number, &device->pp_in_use);
 320         return page;
 321 }
 322
 323 /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
 324  * Is also used from inside an other spin_lock_irq(&resource->req_lock);
 325  * Either links the page chain back to the global pool,
 326  * or returns all pages to the system. */
 327 static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
 328 {
 329         atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
 330         int i;
 331
 332         if (page == NULL)
 333                 return;
 334
 335         if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
 336                 i = page_chain_free(page);
 337         else {
 338                 struct page *tmp;
 339                 tmp = page_chain_tail(page, &i);
 340                 spin_lock(&drbd_pp_lock);
 341                 page_chain_add(&drbd_pp_pool, page, tmp);
 342                 drbd_pp_vacant += i;
 343                 spin_unlock(&drbd_pp_lock);
 344         }
 345         i = atomic_sub_return(i, a);
 346         if (i < 0)
 347                 drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
 348                         is_net ? "pp_in_use_by_net" : "pp_in_use", i);
 349         wake_up(&drbd_pp_wait);
 350 }
 351
 352 /*
 353 You need to hold the req_lock:
 354  _drbd_wait_ee_list_empty()
 355
 356 You must not have the req_lock:
 357  drbd_free_peer_req()
 358  drbd_alloc_peer_req()
 359  drbd_free_peer_reqs()
 360  drbd_ee_fix_bhs()
 361  drbd_finish_peer_reqs()
 362  drbd_clear_done_ee()
 363  drbd_wait_ee_list_empty()
 364 */
 365
 366 /* normal: payload_size == request size (bi_size)
 367  * w_same: payload_size == logical_block_size
 368  * trim: payload_size == 0 */
 369 struct drbd_peer_request *
 370 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 371                     unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
 372 {
 373         struct drbd_device *device = peer_device->device;
 374         struct drbd_peer_request *peer_req;
 375         struct page *page = NULL;
 376         unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 377
 378         if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
 379                 return NULL;
 380
 381         peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 382         if (!peer_req) {
 383                 if (!(gfp_mask & __GFP_NOWARN))
 384                         drbd_err(device, "%s: allocation failed\n", __func__);
 385                 return NULL;
 386         }
 387
 388         if (nr_pages) {
 389                 page = drbd_alloc_pages(peer_device, nr_pages,
 390                                         gfpflags_allow_blocking(gfp_mask));
 391                 if (!page)
 392                         goto fail;
 393         }
 394
 395         memset(peer_req, 0, sizeof(*peer_req));
 396         INIT_LIST_HEAD(&peer_req->w.list);
 397         drbd_clear_interval(&peer_req->i);
 398         peer_req->i.size = request_size;
 399         peer_req->i.sector = sector;
 400         peer_req->submit_jif = jiffies;
 401         peer_req->peer_device = peer_device;
 402         peer_req->pages = page;
 403         /*
 404          * The block_id is opaque to the receiver.  It is not endianness
 405          * converted, and sent back to the sender unchanged.
 406          */
 407         peer_req->block_id = id;
 408
 409         return peer_req;
 410
 411  fail:
 412         mempool_free(peer_req, drbd_ee_mempool);
 413         return NULL;
 414 }
 415
 416 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
 417                        int is_net)
 418 {
 419         might_sleep();
 420         if (peer_req->flags & EE_HAS_DIGEST)
 421                 kfree(peer_req->digest);
 422         drbd_free_pages(device, peer_req->pages, is_net);
 423         D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
 424         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
 425         if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
 426                 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 427                 drbd_al_complete_io(device, &peer_req->i);
 428         }
 429         mempool_free(peer_req, drbd_ee_mempool);
 430 }
 431
 432 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
 433 {
 434         LIST_HEAD(work_list);
 435         struct drbd_peer_request *peer_req, *t;
 436         int count = 0;
 437         int is_net = list == &device->net_ee;
 438
 439         spin_lock_irq(&device->resource->req_lock);
 440         list_splice_init(list, &work_list);
 441         spin_unlock_irq(&device->resource->req_lock);
 442
 443         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 444                 __drbd_free_peer_req(device, peer_req, is_net);
 445                 count++;
 446         }
 447         return count;
 448 }
 449
 450 /*
 451  * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
 452  */
 453 static int drbd_finish_peer_reqs(struct drbd_device *device)
 454 {
 455         LIST_HEAD(work_list);
 456         LIST_HEAD(reclaimed);
 457         struct drbd_peer_request *peer_req, *t;
 458         int err = 0;
 459
 460         spin_lock_irq(&device->resource->req_lock);
 461         reclaim_finished_net_peer_reqs(device, &reclaimed);
 462         list_splice_init(&device->done_ee, &work_list);
 463         spin_unlock_irq(&device->resource->req_lock);
 464
 465         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 466                 drbd_free_net_peer_req(device, peer_req);
 467
 468         /* possible callbacks here:
 469          * e_end_block, and e_end_resync_block, e_send_superseded.
 470          * all ignore the last argument.
 471          */
 472         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 473                 int err2;
 474
 475                 /* list_del not necessary, next/prev members not touched */
 476                 err2 = peer_req->w.cb(&peer_req->w, !!err);
 477                 if (!err)
 478                         err = err2;
 479                 drbd_free_peer_req(device, peer_req);
 480         }
 481         wake_up(&device->ee_wait);
 482
 483         return err;
 484 }
 485
 486 static void _drbd_wait_ee_list_empty(struct drbd_device *device,
 487                                      struct list_head *head)
 488 {
 489         DEFINE_WAIT(wait);
 490
 491         /* avoids spin_lock/unlock
 492          * and calling prepare_to_wait in the fast path */
 493         while (!list_empty(head)) {
 494                 prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 495                 spin_unlock_irq(&device->resource->req_lock);
 496                 io_schedule();
 497                 finish_wait(&device->ee_wait, &wait);
 498                 spin_lock_irq(&device->resource->req_lock);
 499         }
 500 }
 501
 502 static void drbd_wait_ee_list_empty(struct drbd_device *device,
 503                                     struct list_head *head)
 504 {
 505         spin_lock_irq(&device->resource->req_lock);
 506         _drbd_wait_ee_list_empty(device, head);
 507         spin_unlock_irq(&device->resource->req_lock);
 508 }
 509
 510 static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
 511 {
 512         struct kvec iov = {
 513                 .iov_base = buf,
 514                 .iov_len = size,
 515         };
 516         struct msghdr msg = {
 517                 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
 518         };
 519         return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
 520 }
 521
 522 static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
 523 {
 524         int rv;
 525
 526         rv = drbd_recv_short(connection->data.socket, buf, size, 0);
 527
 528         if (rv < 0) {
 529                 if (rv == -ECONNRESET)
 530                         drbd_info(connection, "sock was reset by peer\n");
 531                 else if (rv != -ERESTARTSYS)
 532                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
 533         } else if (rv == 0) {
 534                 if (test_bit(DISCONNECT_SENT, &connection->flags)) {
 535                         long t;
 536                         rcu_read_lock();
 537                         t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
 538                         rcu_read_unlock();
 539
 540                         t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
 541
 542                         if (t)
 543                                 goto out;
 544                 }
 545                 drbd_info(connection, "sock was shut down by peer\n");
 546         }
 547
 548         if (rv != size)
 549                 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
 550
 551 out:
 552         return rv;
 553 }
 554
 555 static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
 556 {
 557         int err;
 558
 559         err = drbd_recv(connection, buf, size);
 560         if (err != size) {
 561                 if (err >= 0)
 562                         err = -EIO;
 563         } else
 564                 err = 0;
 565         return err;
 566 }
 567
 568 static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
 569 {
 570         int err;
 571
 572         err = drbd_recv_all(connection, buf, size);
 573         if (err && !signal_pending(current))
 574                 drbd_warn(connection, "short read (expected size %d)\n", (int)size);
 575         return err;
 576 }
 577
 578 /* quoting tcp(7):
 579  *   On individual connections, the socket buffer size must be set prior to the
 580  *   listen(2) or connect(2) calls in order to have it take effect.
 581  * This is our wrapper to do so.
 582  */
 583 static void drbd_setbufsize(struct socket *sock, unsigned int snd,
 584                 unsigned int rcv)
 585 {
 586         /* open coded SO_SNDBUF, SO_RCVBUF */
 587         if (snd) {
 588                 sock->sk->sk_sndbuf = snd;
 589                 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 590         }
 591         if (rcv) {
 592                 sock->sk->sk_rcvbuf = rcv;
 593                 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 594         }
 595 }
 596
 597 static struct socket *drbd_try_connect(struct drbd_connection *connection)
 598 {
 599         const char *what;
 600         struct socket *sock;
 601         struct sockaddr_in6 src_in6;
 602         struct sockaddr_in6 peer_in6;
 603         struct net_conf *nc;
 604         int err, peer_addr_len, my_addr_len;
 605         int sndbuf_size, rcvbuf_size, connect_int;
 606         int disconnect_on_error = 1;
 607
 608         rcu_read_lock();
 609         nc = rcu_dereference(connection->net_conf);
 610         if (!nc) {
 611                 rcu_read_unlock();
 612                 return NULL;
 613         }
 614         sndbuf_size = nc->sndbuf_size;
 615         rcvbuf_size = nc->rcvbuf_size;
 616         connect_int = nc->connect_int;
 617         rcu_read_unlock();
 618
 619         my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
 620         memcpy(&src_in6, &connection->my_addr, my_addr_len);
 621
 622         if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
 623                 src_in6.sin6_port = 0;
 624         else
 625                 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
 626
 627         peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
 628         memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
 629
 630         what = "sock_create_kern";
 631         err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
 632                                SOCK_STREAM, IPPROTO_TCP, &sock);
 633         if (err < 0) {
 634                 sock = NULL;
 635                 goto out;
 636         }
 637
 638         sock->sk->sk_rcvtimeo =
 639         sock->sk->sk_sndtimeo = connect_int * HZ;
 640         drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
 641
 642        /* explicitly bind to the configured IP as source IP
 643         *  for the outgoing connections.
 644         *  This is needed for multihomed hosts and to be
 645         *  able to use lo: interfaces for drbd.
 646         * Make sure to use 0 as port number, so linux selects
 647         *  a free one dynamically.
 648         */
 649         what = "bind before connect";
 650         err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
 651         if (err < 0)
 652                 goto out;
 653
 654         /* connect may fail, peer not yet available.
 655          * stay C_WF_CONNECTION, don't go Disconnecting! */
 656         disconnect_on_error = 0;
 657         what = "connect";
 658         err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
 659
 660 out:
 661         if (err < 0) {
 662                 if (sock) {
 663                         sock_release(sock);
 664                         sock = NULL;
 665                 }
 666                 switch (-err) {
 667                         /* timeout, busy, signal pending */
 668                 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
 669                 case EINTR: case ERESTARTSYS:
 670                         /* peer not (yet) available, network problem */
 671                 case ECONNREFUSED: case ENETUNREACH:
 672                 case EHOSTDOWN:    case EHOSTUNREACH:
 673                         disconnect_on_error = 0;
 674                         break;
 675                 default:
 676                         drbd_err(connection, "%s failed, err = %d\n", what, err);
 677                 }
 678                 if (disconnect_on_error)
 679                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 680         }
 681
 682         return sock;
 683 }
 684
 685 struct accept_wait_data {
 686         struct drbd_connection *connection;
 687         struct socket *s_listen;
 688         struct completion door_bell;
 689         void (*original_sk_state_change)(struct sock *sk);
 690
 691 };
 692
 693 static void drbd_incoming_connection(struct sock *sk)
 694 {
 695         struct accept_wait_data *ad = sk->sk_user_data;
 696         void (*state_change)(struct sock *sk);
 697
 698         state_change = ad->original_sk_state_change;
 699         if (sk->sk_state == TCP_ESTABLISHED)
 700                 complete(&ad->door_bell);
 701         state_change(sk);
 702 }
 703
 704 static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
 705 {
 706         int err, sndbuf_size, rcvbuf_size, my_addr_len;
 707         struct sockaddr_in6 my_addr;
 708         struct socket *s_listen;
 709         struct net_conf *nc;
 710         const char *what;
 711
 712         rcu_read_lock();
 713         nc = rcu_dereference(connection->net_conf);
 714         if (!nc) {
 715                 rcu_read_unlock();
 716                 return -EIO;
 717         }
 718         sndbuf_size = nc->sndbuf_size;
 719         rcvbuf_size = nc->rcvbuf_size;
 720         rcu_read_unlock();
 721
 722         my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
 723         memcpy(&my_addr, &connection->my_addr, my_addr_len);
 724
 725         what = "sock_create_kern";
 726         err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
 727                                SOCK_STREAM, IPPROTO_TCP, &s_listen);
 728         if (err) {
 729                 s_listen = NULL;
 730                 goto out;
 731         }
 732
 733         s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 734         drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
 735
 736         what = "bind before listen";
 737         err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
 738         if (err < 0)
 739                 goto out;
 740
 741         ad->s_listen = s_listen;
 742         write_lock_bh(&s_listen->sk->sk_callback_lock);
 743         ad->original_sk_state_change = s_listen->sk->sk_state_change;
 744         s_listen->sk->sk_state_change = drbd_incoming_connection;
 745         s_listen->sk->sk_user_data = ad;
 746         write_unlock_bh(&s_listen->sk->sk_callback_lock);
 747
 748         what = "listen";
 749         err = s_listen->ops->listen(s_listen, 5);
 750         if (err < 0)
 751                 goto out;
 752
 753         return 0;
 754 out:
 755         if (s_listen)
 756                 sock_release(s_listen);
 757         if (err < 0) {
 758                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 759                         drbd_err(connection, "%s failed, err = %d\n", what, err);
 760                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 761                 }
 762         }
 763
 764         return -EIO;
 765 }
 766
 767 static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
 768 {
 769         write_lock_bh(&sk->sk_callback_lock);
 770         sk->sk_state_change = ad->original_sk_state_change;
 771         sk->sk_user_data = NULL;
 772         write_unlock_bh(&sk->sk_callback_lock);
 773 }
 774
 775 static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
 776 {
 777         int timeo, connect_int, err = 0;
 778         struct socket *s_estab = NULL;
 779         struct net_conf *nc;
 780
 781         rcu_read_lock();
 782         nc = rcu_dereference(connection->net_conf);
 783         if (!nc) {
 784                 rcu_read_unlock();
 785                 return NULL;
 786         }
 787         connect_int = nc->connect_int;
 788         rcu_read_unlock();
 789
 790         timeo = connect_int * HZ;
 791         /* 28.5% random jitter */
 792         timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
 793
 794         err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
 795         if (err <= 0)
 796                 return NULL;
 797
 798         err = kernel_accept(ad->s_listen, &s_estab, 0);
 799         if (err < 0) {
 800                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 801                         drbd_err(connection, "accept failed, err = %d\n", err);
 802                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 803                 }
 804         }
 805
 806         if (s_estab)
 807                 unregister_state_change(s_estab->sk, ad);
 808
 809         return s_estab;
 810 }
 811
 812 static int decode_header(struct drbd_connection *, void *, struct packet_info *);
 813
 814 static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
 815                              enum drbd_packet cmd)
 816 {
 817         if (!conn_prepare_command(connection, sock))
 818                 return -EIO;
 819         return conn_send_command(connection, sock, cmd, 0, NULL, 0);
 820 }
 821
 822 static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
 823 {
 824         unsigned int header_size = drbd_header_size(connection);
 825         struct packet_info pi;
 826         struct net_conf *nc;
 827         int err;
 828
 829         rcu_read_lock();
 830         nc = rcu_dereference(connection->net_conf);
 831         if (!nc) {
 832                 rcu_read_unlock();
 833                 return -EIO;
 834         }
 835         sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
 836         rcu_read_unlock();
 837
 838         err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
 839         if (err != header_size) {
 840                 if (err >= 0)
 841                         err = -EIO;
 842                 return err;
 843         }
 844         err = decode_header(connection, connection->data.rbuf, &pi);
 845         if (err)
 846                 return err;
 847         return pi.cmd;
 848 }
 849
 850 /**
 851  * drbd_socket_okay() - Free the socket if its connection is not okay
 852  * @sock:       pointer to the pointer to the socket.
 853  */
 854 static bool drbd_socket_okay(struct socket **sock)
 855 {
 856         int rr;
 857         char tb[4];
 858
 859         if (!*sock)
 860                 return false;
 861
 862         rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
 863
 864         if (rr > 0 || rr == -EAGAIN) {
 865                 return true;
 866         } else {
 867                 sock_release(*sock);
 868                 *sock = NULL;
 869                 return false;
 870         }
 871 }
 872
 873 static bool connection_established(struct drbd_connection *connection,
 874                                    struct socket **sock1,
 875                                    struct socket **sock2)
 876 {
 877         struct net_conf *nc;
 878         int timeout;
 879         bool ok;
 880
 881         if (!*sock1 || !*sock2)
 882                 return false;
 883
 884         rcu_read_lock();
 885         nc = rcu_dereference(connection->net_conf);
 886         timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
 887         rcu_read_unlock();
 888         schedule_timeout_interruptible(timeout);
 889
 890         ok = drbd_socket_okay(sock1);
 891         ok = drbd_socket_okay(sock2) && ok;
 892
 893         return ok;
 894 }
 895
 896 /* Gets called if a connection is established, or if a new minor gets created
 897    in a connection */
 898 int drbd_connected(struct drbd_peer_device *peer_device)
 899 {
 900         struct drbd_device *device = peer_device->device;
 901         int err;
 902
 903         atomic_set(&device->packet_seq, 0);
 904         device->peer_seq = 0;
 905
 906         device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
 907                 &peer_device->connection->cstate_mutex :
 908                 &device->own_state_mutex;
 909
 910         err = drbd_send_sync_param(peer_device);
 911         if (!err)
 912                 err = drbd_send_sizes(peer_device, 0, 0);
 913         if (!err)
 914                 err = drbd_send_uuids(peer_device);
 915         if (!err)
 916                 err = drbd_send_current_state(peer_device);
 917         clear_bit(USE_DEGR_WFC_T, &device->flags);
 918         clear_bit(RESIZE_PENDING, &device->flags);
 919         atomic_set(&device->ap_in_flight, 0);
 920         mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
 921         return err;
 922 }
 923
 924 /*
 925  * return values:
 926  *   1 yes, we have a valid connection
 927  *   0 oops, did not work out, please try again
 928  *  -1 peer talks different language,
 929  *     no point in trying again, please go standalone.
 930  *  -2 We do not have a network config...
 931  */
 932 static int conn_connect(struct drbd_connection *connection)
 933 {
 934         struct drbd_socket sock, msock;
 935         struct drbd_peer_device *peer_device;
 936         struct net_conf *nc;
 937         int vnr, timeout, h;
 938         bool discard_my_data, ok;
 939         enum drbd_state_rv rv;
 940         struct accept_wait_data ad = {
 941                 .connection = connection,
 942                 .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
 943         };
 944
 945         clear_bit(DISCONNECT_SENT, &connection->flags);
 946         if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
 947                 return -2;
 948
 949         mutex_init(&sock.mutex);
 950         sock.sbuf = connection->data.sbuf;
 951         sock.rbuf = connection->data.rbuf;
 952         sock.socket = NULL;
 953         mutex_init(&msock.mutex);
 954         msock.sbuf = connection->meta.sbuf;
 955         msock.rbuf = connection->meta.rbuf;
 956         msock.socket = NULL;
 957
 958         /* Assume that the peer only understands protocol 80 until we know better.  */
 959         connection->agreed_pro_version = 80;
 960
 961         if (prepare_listen_socket(connection, &ad))
 962                 return 0;
 963
 964         do {
 965                 struct socket *s;
 966
 967                 s = drbd_try_connect(connection);
 968                 if (s) {
 969                         if (!sock.socket) {
 970                                 sock.socket = s;
 971                                 send_first_packet(connection, &sock, P_INITIAL_DATA);
 972                         } else if (!msock.socket) {
 973                                 clear_bit(RESOLVE_CONFLICTS, &connection->flags);
 974                                 msock.socket = s;
 975                                 send_first_packet(connection, &msock, P_INITIAL_META);
 976                         } else {
 977                                 drbd_err(connection, "Logic error in conn_connect()\n");
 978                                 goto out_release_sockets;
 979                         }
 980                 }
 981
 982                 if (connection_established(connection, &sock.socket, &msock.socket))
 983                         break;
 984
 985 retry:
 986                 s = drbd_wait_for_connect(connection, &ad);
 987                 if (s) {
 988                         int fp = receive_first_packet(connection, s);
 989                         drbd_socket_okay(&sock.socket);
 990                         drbd_socket_okay(&msock.socket);
 991                         switch (fp) {
 992                         case P_INITIAL_DATA:
 993                                 if (sock.socket) {
 994                                         drbd_warn(connection, "initial packet S crossed\n");
 995                                         sock_release(sock.socket);
 996                                         sock.socket = s;
 997                                         goto randomize;
 998                                 }
 999                                 sock.socket = s;
1000                                 break;
1001                         case P_INITIAL_META:
1002                                 set_bit(RESOLVE_CONFLICTS, &connection->flags);
1003                                 if (msock.socket) {
1004                                         drbd_warn(connection, "initial packet M crossed\n");
1005                                         sock_release(msock.socket);
1006                                         msock.socket = s;
1007                                         goto randomize;
1008                                 }
1009                                 msock.socket = s;
1010                                 break;
1011                         default:
1012                                 drbd_warn(connection, "Error receiving initial packet\n");
1013                                 sock_release(s);
1014 randomize:
1015                                 if (prandom_u32() & 1)
1016                                         goto retry;
1017                         }
1018                 }
1019
1020                 if (connection->cstate <= C_DISCONNECTING)
1021                         goto out_release_sockets;
1022                 if (signal_pending(current)) {
1023                         flush_signals(current);
1024                         smp_rmb();
1025                         if (get_t_state(&connection->receiver) == EXITING)
1026                                 goto out_release_sockets;
1027                 }
1028
1029                 ok = connection_established(connection, &sock.socket, &msock.socket);
1030         } while (!ok);
1031
1032         if (ad.s_listen)
1033                 sock_release(ad.s_listen);
1034
1035         sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
1036         msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
1037
1038         sock.socket->sk->sk_allocation = GFP_NOIO;
1039         msock.socket->sk->sk_allocation = GFP_NOIO;
1040
1041         sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
1042         msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
1043
1044         /* NOT YET ...
1045          * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
1046          * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1047          * first set it to the P_CONNECTION_FEATURES timeout,
1048          * which we set to 4x the configured ping_timeout. */
1049         rcu_read_lock();
1050         nc = rcu_dereference(connection->net_conf);
1051
1052         sock.socket->sk->sk_sndtimeo =
1053         sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
1054
1055         msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
1056         timeout = nc->timeout * HZ / 10;
1057         discard_my_data = nc->discard_my_data;
1058         rcu_read_unlock();
1059
1060         msock.socket->sk->sk_sndtimeo = timeout;
1061
1062         /* we don't want delays.
1063          * we use TCP_CORK where appropriate, though */
1064         drbd_tcp_nodelay(sock.socket);
1065         drbd_tcp_nodelay(msock.socket);
1066
1067         connection->data.socket = sock.socket;
1068         connection->meta.socket = msock.socket;
1069         connection->last_received = jiffies;
1070
1071         h = drbd_do_features(connection);
1072         if (h <= 0)
1073                 return h;
1074
1075         if (connection->cram_hmac_tfm) {
1076                 /* drbd_request_state(device, NS(conn, WFAuth)); */
1077                 switch (drbd_do_auth(connection)) {
1078                 case -1:
1079                         drbd_err(connection, "Authentication of peer failed\n");
1080                         return -1;
1081                 case 0:
1082                         drbd_err(connection, "Authentication of peer failed, trying again.\n");
1083                         return 0;
1084                 }
1085         }
1086
1087         connection->data.socket->sk->sk_sndtimeo = timeout;
1088         connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1089
1090         if (drbd_send_protocol(connection) == -EOPNOTSUPP)
1091                 return -1;
1092
1093         /* Prevent a race between resync-handshake and
1094          * being promoted to Primary.
1095          *
1096          * Grab and release the state mutex, so we know that any current
1097          * drbd_set_role() is finished, and any incoming drbd_set_role
1098          * will see the STATE_SENT flag, and wait for it to be cleared.
1099          */
1100         idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1101                 mutex_lock(peer_device->device->state_mutex);
1102
1103         set_bit(STATE_SENT, &connection->flags);
1104
1105         idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1106                 mutex_unlock(peer_device->device->state_mutex);
1107
1108         rcu_read_lock();
1109         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1110                 struct drbd_device *device = peer_device->device;
1111                 kref_get(&device->kref);
1112                 rcu_read_unlock();
1113
1114                 if (discard_my_data)
1115                         set_bit(DISCARD_MY_DATA, &device->flags);
1116                 else
1117                         clear_bit(DISCARD_MY_DATA, &device->flags);
1118
1119                 drbd_connected(peer_device);
1120                 kref_put(&device->kref, drbd_destroy_device);
1121                 rcu_read_lock();
1122         }
1123         rcu_read_unlock();
1124
1125         rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
1126         if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
1127                 clear_bit(STATE_SENT, &connection->flags);
1128                 return 0;
1129         }
1130
1131         drbd_thread_start(&connection->ack_receiver);
1132         /* opencoded create_singlethread_workqueue(),
1133          * to be able to use format string arguments */
1134         connection->ack_sender =
1135                 alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
1136         if (!connection->ack_sender) {
1137                 drbd_err(connection, "Failed to create workqueue ack_sender\n");
1138                 return 0;
1139         }
1140
1141         mutex_lock(&connection->resource->conf_update);
1142         /* The discard_my_data flag is a single-shot modifier to the next
1143          * connection attempt, the handshake of which is now well underway.
1144          * No need for rcu style copying of the whole struct
1145          * just to clear a single value. */
1146         connection->net_conf->discard_my_data = 0;
1147         mutex_unlock(&connection->resource->conf_update);
1148
1149         return h;
1150
1151 out_release_sockets:
1152         if (ad.s_listen)
1153                 sock_release(ad.s_listen);
1154         if (sock.socket)
1155                 sock_release(sock.socket);
1156         if (msock.socket)
1157                 sock_release(msock.socket);
1158         return -1;
1159 }
1160
1161 static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
1162 {
1163         unsigned int header_size = drbd_header_size(connection);
1164
1165         if (header_size == sizeof(struct p_header100) &&
1166             *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1167                 struct p_header100 *h = header;
1168                 if (h->pad != 0) {
1169                         drbd_err(connection, "Header padding is not zero\n");
1170                         return -EINVAL;
1171                 }
1172                 pi->vnr = be16_to_cpu(h->volume);
1173                 pi->cmd = be16_to_cpu(h->command);
1174                 pi->size = be32_to_cpu(h->length);
1175         } else if (header_size == sizeof(struct p_header95) &&
1176                    *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
1177                 struct p_header95 *h = header;
1178                 pi->cmd = be16_to_cpu(h->command);
1179                 pi->size = be32_to_cpu(h->length);
1180                 pi->vnr = 0;
1181         } else if (header_size == sizeof(struct p_header80) &&
1182                    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1183                 struct p_header80 *h = header;
1184                 pi->cmd = be16_to_cpu(h->command);
1185                 pi->size = be16_to_cpu(h->length);
1186                 pi->vnr = 0;
1187         } else {
1188                 drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
1189                          be32_to_cpu(*(__be32 *)header),
1190                          connection->agreed_pro_version);
1191                 return -EINVAL;
1192         }
1193         pi->data = header + header_size;
1194         return 0;
1195 }
1196
1197 static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
1198 {
1199         void *buffer = connection->data.rbuf;
1200         int err;
1201
1202         err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
1203         if (err)
1204                 return err;
1205
1206         err = decode_header(connection, buffer, pi);
1207         connection->last_received = jiffies;
1208
1209         return err;
1210 }
1211
/* This is blkdev_issue_flush, but asynchronous.
 * We want to submit to all component volumes in parallel,
 * then wait for all completions.
 */
/* State shared by all flushes submitted in one drbd_flush() call. */
struct issue_flush_context {
	/* Starts at 1 in drbd_flush(); incremented per submitted flush bio,
	 * decremented per completed bio and once more by drbd_flush(). */
	atomic_t pending;
	/* 0, or the error of a failed flush (bio->bi_error or -ENOMEM). */
	int error;
	/* Completed by one_flush_endio() when it drops pending to zero. */
	struct completion done;
};
/* Per-bio context: allocated in submit_one_flush(), passed via
 * bio->bi_private, freed in one_flush_endio(). */
struct one_flush_context {
	struct drbd_device *device;
	struct issue_flush_context *ctx;
};
1225
1226 void one_flush_endio(struct bio *bio)
1227 {
1228         struct one_flush_context *octx = bio->bi_private;
1229         struct drbd_device *device = octx->device;
1230         struct issue_flush_context *ctx = octx->ctx;
1231
1232         if (bio->bi_error) {
1233                 ctx->error = bio->bi_error;
1234                 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error);
1235         }
1236         kfree(octx);
1237         bio_put(bio);
1238
1239         clear_bit(FLUSH_PENDING, &device->flags);
1240         put_ldev(device);
1241         kref_put(&device->kref, drbd_destroy_device);
1242
1243         if (atomic_dec_and_test(&ctx->pending))
1244                 complete(&ctx->done);
1245 }
1246
1247 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
1248 {
1249         struct bio *bio = bio_alloc(GFP_NOIO, 0);
1250         struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
1251         if (!bio || !octx) {
1252                 drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
1253                 /* FIXME: what else can I do now?  disconnecting or detaching
1254                  * really does not help to improve the state of the world, either.
1255                  */
1256                 kfree(octx);
1257                 if (bio)
1258                         bio_put(bio);
1259
1260                 ctx->error = -ENOMEM;
1261                 put_ldev(device);
1262                 kref_put(&device->kref, drbd_destroy_device);
1263                 return;
1264         }
1265
1266         octx->device = device;
1267         octx->ctx = ctx;
1268         bio->bi_bdev = device->ldev->backing_bdev;
1269         bio->bi_private = octx;
1270         bio->bi_end_io = one_flush_endio;
1271         bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH;
1272
1273         device->flush_jif = jiffies;
1274         set_bit(FLUSH_PENDING, &device->flags);
1275         atomic_inc(&ctx->pending);
1276         submit_bio(bio);
1277 }
1278
1279 static void drbd_flush(struct drbd_connection *connection)
1280 {
1281         if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1282                 struct drbd_peer_device *peer_device;
1283                 struct issue_flush_context ctx;
1284                 int vnr;
1285
1286                 atomic_set(&ctx.pending, 1);
1287                 ctx.error = 0;
1288                 init_completion(&ctx.done);
1289
1290                 rcu_read_lock();
1291                 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1292                         struct drbd_device *device = peer_device->device;
1293
1294                         if (!get_ldev(device))
1295                                 continue;
1296                         kref_get(&device->kref);
1297                         rcu_read_unlock();
1298
1299                         submit_one_flush(device, &ctx);
1300
1301                         rcu_read_lock();
1302                 }
1303                 rcu_read_unlock();
1304
1305                 /* Do we want to add a timeout,
1306                  * if disk-timeout is set? */
1307                 if (!atomic_dec_and_test(&ctx.pending))
1308                         wait_for_completion(&ctx.done);
1309
1310                 if (ctx.error) {
1311                         /* would rather check on EOPNOTSUPP, but that is not reliable.
1312                          * don't try again for ANY return value != 0
1313                          * if (rv == -EOPNOTSUPP) */
1314                         /* Any error is already reported by bio_endio callback. */
1315                         drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1316                 }
1317         }
1318 }
1319
1320 /**
1321  * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1322  * @device:     DRBD device.
1323  * @epoch:      Epoch object.
1324  * @ev:         Epoch event.
1325  */
1326 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
1327                                                struct drbd_epoch *epoch,
1328                                                enum epoch_event ev)
1329 {
1330         int epoch_size;
1331         struct drbd_epoch *next_epoch;
1332         enum finish_epoch rv = FE_STILL_LIVE;
1333
1334         spin_lock(&connection->epoch_lock);
1335         do {
1336                 next_epoch = NULL;
1337
1338                 epoch_size = atomic_read(&epoch->epoch_size);
1339
1340                 switch (ev & ~EV_CLEANUP) {
1341                 case EV_PUT:
1342                         atomic_dec(&epoch->active);
1343                         break;
1344                 case EV_GOT_BARRIER_NR:
1345                         set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1346                         break;
1347                 case EV_BECAME_LAST:
1348                         /* nothing to do*/
1349                         break;
1350                 }
1351
1352                 if (epoch_size != 0 &&
1353                     atomic_read(&epoch->active) == 0 &&
1354                     (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
1355                         if (!(ev & EV_CLEANUP)) {
1356                                 spin_unlock(&connection->epoch_lock);
1357                                 drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
1358                                 spin_lock(&connection->epoch_lock);
1359                         }
1360 #if 0
1361                         /* FIXME: dec unacked on connection, once we have
1362                          * something to count pending connection packets in. */
1363                         if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
1364                                 dec_unacked(epoch->connection);
1365 #endif
1366
1367                         if (connection->current_epoch != epoch) {
1368                                 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1369                                 list_del(&epoch->list);
1370                                 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1371                                 connection->epochs--;
1372                                 kfree(epoch);
1373
1374                                 if (rv == FE_STILL_LIVE)
1375                                         rv = FE_DESTROYED;
1376                         } else {
1377                                 epoch->flags = 0;
1378                                 atomic_set(&epoch->epoch_size, 0);
1379                                 /* atomic_set(&epoch->active, 0); is already zero */
1380                                 if (rv == FE_STILL_LIVE)
1381                                         rv = FE_RECYCLED;
1382                         }
1383                 }
1384
1385                 if (!next_epoch)
1386                         break;
1387
1388                 epoch = next_epoch;
1389         } while (1);
1390
1391         spin_unlock(&connection->epoch_lock);
1392
1393         return rv;
1394 }
1395
1396 static enum write_ordering_e
1397 max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1398 {
1399         struct disk_conf *dc;
1400
1401         dc = rcu_dereference(bdev->disk_conf);
1402
1403         if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
1404                 wo = WO_DRAIN_IO;
1405         if (wo == WO_DRAIN_IO && !dc->disk_drain)
1406                 wo = WO_NONE;
1407
1408         return wo;
1409 }
1410
1411 /**
1412  * drbd_bump_write_ordering() - Fall back to an other write ordering method
1413  * @connection: DRBD connection.
1414  * @wo:         Write ordering method to try.
1415  */
1416 void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
1417                               enum write_ordering_e wo)
1418 {
1419         struct drbd_device *device;
1420         enum write_ordering_e pwo;
1421         int vnr;
1422         static char *write_ordering_str[] = {
1423                 [WO_NONE] = "none",
1424                 [WO_DRAIN_IO] = "drain",
1425                 [WO_BDEV_FLUSH] = "flush",
1426         };
1427
1428         pwo = resource->write_ordering;
1429         if (wo != WO_BDEV_FLUSH)
1430                 wo = min(pwo, wo);
1431         rcu_read_lock();
1432         idr_for_each_entry(&resource->devices, device, vnr) {
1433                 if (get_ldev(device)) {
1434                         wo = max_allowed_wo(device->ldev, wo);
1435                         if (device->ldev == bdev)
1436                                 bdev = NULL;
1437                         put_ldev(device);
1438                 }
1439         }
1440
1441         if (bdev)
1442                 wo = max_allowed_wo(bdev, wo);
1443
1444         rcu_read_unlock();
1445
1446         resource->write_ordering = wo;
1447         if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
1448                 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1449 }
1450
1451 /*
1452  * We *may* ignore the discard-zeroes-data setting, if so configured.
1453  *
1454  * Assumption is that it "discard_zeroes_data=0" is only because the backend
1455  * may ignore partial unaligned discards.
1456  *
1457  * LVM/DM thin as of at least
1458  *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
1459  *   Library version: 1.02.93-RHEL7 (2015-01-28)
1460  *   Driver version:  4.29.0
1461  * still behaves this way.
1462  *
1463  * For unaligned (wrt. alignment and granularity) or too small discards,
1464  * we zero-out the initial (and/or) trailing unaligned partial chunks,
1465  * but discard all the aligned full chunks.
1466  *
1467  * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
1468  */
1469 int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
1470 {
1471         struct block_device *bdev = device->ldev->backing_bdev;
1472         struct request_queue *q = bdev_get_queue(bdev);
1473         sector_t tmp, nr;
1474         unsigned int max_discard_sectors, granularity;
1475         int alignment;
1476         int err = 0;
1477
1478         if (!discard)
1479                 goto zero_out;
1480
1481         /* Zero-sector (unknown) and one-sector granularities are the same.  */
1482         granularity = max(q->limits.discard_granularity >> 9, 1U);
1483         alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
1484
1485         max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
1486         max_discard_sectors -= max_discard_sectors % granularity;
1487         if (unlikely(!max_discard_sectors))
1488                 goto zero_out;
1489
1490         if (nr_sectors < granularity)
1491                 goto zero_out;
1492
1493         tmp = start;
1494         if (sector_div(tmp, granularity) != alignment) {
1495                 if (nr_sectors < 2*granularity)
1496                         goto zero_out;
1497                 /* start + gran - (start + gran - align) % gran */
1498                 tmp = start + granularity - alignment;
1499                 tmp = start + granularity - sector_div(tmp, granularity);
1500
1501                 nr = tmp - start;
1502                 err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO,
1503                                 BLKDEV_ZERO_NOUNMAP);
1504                 nr_sectors -= nr;
1505                 start = tmp;
1506         }
1507         while (nr_sectors >= granularity) {
1508                 nr = min_t(sector_t, nr_sectors, max_discard_sectors);
1509                 err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO,
1510                                 BLKDEV_ZERO_NOUNMAP);
1511                 nr_sectors -= nr;
1512                 start += nr;
1513         }
1514  zero_out:
1515         if (nr_sectors) {
1516                 err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
1517                                 BLKDEV_ZERO_NOUNMAP);
1518         }
1519         return err != 0;
1520 }
1521
1522 static bool can_do_reliable_discards(struct drbd_device *device)
1523 {
1524         struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
1525         struct disk_conf *dc;
1526         bool can_do;
1527
1528         if (!blk_queue_discard(q))
1529                 return false;
1530
1531         if (q->limits.discard_zeroes_data)
1532                 return true;
1533
1534         rcu_read_lock();
1535         dc = rcu_dereference(device->ldev->disk_conf);
1536         can_do = dc->discard_zeroes_if_aligned;
1537         rcu_read_unlock();
1538         return can_do;
1539 }
1540
1541 static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
1542 {
1543         /* If the backend cannot discard, or does not guarantee
1544          * read-back zeroes in discarded ranges, we fall back to
1545          * zero-out.  Unless configuration specifically requested
1546          * otherwise. */
1547         if (!can_do_reliable_discards(device))
1548                 peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
1549
1550         if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
1551             peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
1552                 peer_req->flags |= EE_WAS_ERROR;
1553         drbd_endio_write_sec_final(peer_req);
1554 }
1555
1556 static void drbd_issue_peer_wsame(struct drbd_device *device,
1557                                   struct drbd_peer_request *peer_req)
1558 {
1559         struct block_device *bdev = device->ldev->backing_bdev;
1560         sector_t s = peer_req->i.sector;
1561         sector_t nr = peer_req->i.size >> 9;
1562         if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
1563                 peer_req->flags |= EE_WAS_ERROR;
1564         drbd_endio_write_sec_final(peer_req);
1565 }
1566
1567
1568 /**
1569  * drbd_submit_peer_request()
1570  * @device:     DRBD device.
1571  * @peer_req:   peer request
1572  * @rw:         flag field, see bio->bi_opf
1573  *
1574  * May spread the pages to multiple bios,
1575  * depending on bio_add_page restrictions.
1576  *
1577  * Returns 0 if all bios have been submitted,
1578  * -ENOMEM if we could not allocate enough bios,
1579  * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1580  *  single page to an empty bio (which should never happen and likely indicates
1581  *  that the lower level IO stack is in some way broken). This has been observed
1582  *  on certain Xen deployments.
1583  */
1584 /* TODO allocate from our own bio_set. */
1585 int drbd_submit_peer_request(struct drbd_device *device,
1586                              struct drbd_peer_request *peer_req,
1587                              const unsigned op, const unsigned op_flags,
1588                              const int fault_type)
1589 {
1590         struct bio *bios = NULL;
1591         struct bio *bio;
1592         struct page *page = peer_req->pages;
1593         sector_t sector = peer_req->i.sector;
1594         unsigned data_size = peer_req->i.size;
1595         unsigned n_bios = 0;
1596         unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
1597         int err = -ENOMEM;
1598
1599         /* TRIM/DISCARD: for now, always use the helper function
1600          * blkdev_issue_zeroout(..., discard=true).
1601          * It's synchronous, but it does the right thing wrt. bio splitting.
1602          * Correctness first, performance later.  Next step is to code an
1603          * asynchronous variant of the same.
1604          */
1605         if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
1606                 /* wait for all pending IO completions, before we start
1607                  * zeroing things out. */
1608                 conn_wait_active_ee_empty(peer_req->peer_device->connection);
1609                 /* add it to the active list now,
1610                  * so we can find it to present it in debugfs */
1611                 peer_req->submit_jif = jiffies;
1612                 peer_req->flags |= EE_SUBMITTED;
1613
1614                 /* If this was a resync request from receive_rs_deallocated(),
1615                  * it is already on the sync_ee list */
1616                 if (list_empty(&peer_req->w.list)) {
1617                         spin_lock_irq(&device->resource->req_lock);
1618                         list_add_tail(&peer_req->w.list, &device->active_ee);
1619                         spin_unlock_irq(&device->resource->req_lock);
1620                 }
1621
1622                 if (peer_req->flags & EE_IS_TRIM)
1623                         drbd_issue_peer_discard(device, peer_req);
1624                 else /* EE_WRITE_SAME */
1625                         drbd_issue_peer_wsame(device, peer_req);
1626                 return 0;
1627         }
1628
1629         /* In most cases, we will only need one bio.  But in case the lower
1630          * level restrictions happen to be different at this offset on this
1631          * side than those of the sending peer, we may need to submit the
1632          * request in more than one bio.
1633          *
1634          * Plain bio_alloc is good enough here, this is no DRBD internally
1635          * generated bio, but a bio allocated on behalf of the peer.
1636          */
1637 next_bio:
1638         bio = bio_alloc(GFP_NOIO, nr_pages);
1639         if (!bio) {
1640                 drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
1641                 goto fail;
1642         }
1643         /* > peer_req->i.sector, unless this is the first bio */
1644         bio->bi_iter.bi_sector = sector;
1645         bio->bi_bdev = device->ldev->backing_bdev;
1646         bio_set_op_attrs(bio, op, op_flags);
1647         bio->bi_private = peer_req;
1648         bio->bi_end_io = drbd_peer_request_endio;
1649
1650         bio->bi_next = bios;
1651         bios = bio;
1652         ++n_bios;
1653
1654         page_chain_for_each(page) {
1655                 unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
1656                 if (!bio_add_page(bio, page, len, 0))
1657                         goto next_bio;
1658                 data_size -= len;
1659                 sector += len >> 9;
1660                 --nr_pages;
1661         }
1662         D_ASSERT(device, data_size == 0);
1663         D_ASSERT(device, page == NULL);
1664
1665         atomic_set(&peer_req->pending_bios, n_bios);
1666         /* for debugfs: update timestamp, mark as submitted */
1667         peer_req->submit_jif = jiffies;
1668         peer_req->flags |= EE_SUBMITTED;
1669         do {
1670                 bio = bios;
1671                 bios = bios->bi_next;
1672                 bio->bi_next = NULL;
1673
1674                 drbd_generic_make_request(device, fault_type, bio);
1675         } while (bios);
1676         return 0;
1677
1678 fail:
1679         while (bios) {
1680                 bio = bios;
1681                 bios = bios->bi_next;
1682                 bio_put(bio);
1683         }
1684         return err;
1685 }
1686
1687 static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
1688                                              struct drbd_peer_request *peer_req)
1689 {
1690         struct drbd_interval *i = &peer_req->i;
1691
1692         drbd_remove_interval(&device->write_requests, i);
1693         drbd_clear_interval(i);
1694
1695         /* Wake up any processes waiting for this peer request to complete.  */
1696         if (i->waiting)
1697                 wake_up(&device->misc_wait);
1698 }
1699
1700 static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1701 {
1702         struct drbd_peer_device *peer_device;
1703         int vnr;
1704
1705         rcu_read_lock();
1706         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1707                 struct drbd_device *device = peer_device->device;
1708
1709                 kref_get(&device->kref);
1710                 rcu_read_unlock();
1711                 drbd_wait_ee_list_empty(device, &device->active_ee);
1712                 kref_put(&device->kref, drbd_destroy_device);
1713                 rcu_read_lock();
1714         }
1715         rcu_read_unlock();
1716 }
1717
1718 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1719 {
1720         int rv;
1721         struct p_barrier *p = pi->data;
1722         struct drbd_epoch *epoch;
1723
1724         /* FIXME these are unacked on connection,
1725          * not a specific (peer)device.
1726          */
1727         connection->current_epoch->barrier_nr = p->barrier;
1728         connection->current_epoch->connection = connection;
1729         rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
1730
1731         /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1732          * the activity log, which means it would not be resynced in case the
1733          * R_PRIMARY crashes now.
1734          * Therefore we must send the barrier_ack after the barrier request was
1735          * completed. */
1736         switch (connection->resource->write_ordering) {
1737         case WO_NONE:
1738                 if (rv == FE_RECYCLED)
1739                         return 0;
1740
1741                 /* receiver context, in the writeout path of the other node.
1742                  * avoid potential distributed deadlock */
1743                 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1744                 if (epoch)
1745                         break;
1746                 else
1747                         drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1748                         /* Fall through */
1749
1750         case WO_BDEV_FLUSH:
1751         case WO_DRAIN_IO:
1752                 conn_wait_active_ee_empty(connection);
1753                 drbd_flush(connection);
1754
1755                 if (atomic_read(&connection->current_epoch->epoch_size)) {
1756                         epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1757                         if (epoch)
1758                                 break;
1759                 }
1760
1761                 return 0;
1762         default:
1763                 drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
1764                          connection->resource->write_ordering);
1765                 return -EIO;
1766         }
1767
1768         epoch->flags = 0;
1769         atomic_set(&epoch->epoch_size, 0);
1770         atomic_set(&epoch->active, 0);
1771
1772         spin_lock(&connection->epoch_lock);
1773         if (atomic_read(&connection->current_epoch->epoch_size)) {
1774                 list_add(&epoch->list, &connection->current_epoch->list);
1775                 connection->current_epoch = epoch;
1776                 connection->epochs++;
1777         } else {
1778                 /* The current_epoch got recycled while we allocated this one... */
1779                 kfree(epoch);
1780         }
1781         spin_unlock(&connection->epoch_lock);
1782
1783         return 0;
1784 }
1785
1786 /* quick wrapper in case payload size != request_size (write same) */
1787 static void drbd_csum_ee_size(struct crypto_ahash *h,
1788                               struct drbd_peer_request *r, void *d,
1789                               unsigned int payload_size)
1790 {
1791         unsigned int tmp = r->i.size;
1792         r->i.size = payload_size;
1793         drbd_csum_ee(h, r, d);
1794         r->i.size = tmp;
1795 }
1796
1797 /* used from receive_RSDataReply (recv_resync_read)
1798  * and from receive_Data.
1799  * data_size: actual payload ("data in")
1800  *      for normal writes that is bi_size.
1801  *      for discards, that is zero.
1802  *      for write same, it is logical_block_size.
1803  * both trim and write same have the bi_size ("data len to be affected")
1804  * as extra argument in the packet header.
1805  */
1806 static struct drbd_peer_request *
1807 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1808               struct packet_info *pi) __must_hold(local)
1809 {
1810         struct drbd_device *device = peer_device->device;
1811         const sector_t capacity = drbd_get_capacity(device->this_bdev);
1812         struct drbd_peer_request *peer_req;
1813         struct page *page;
1814         int digest_size, err;
1815         unsigned int data_size = pi->size, ds;
1816         void *dig_in = peer_device->connection->int_dig_in;
1817         void *dig_vv = peer_device->connection->int_dig_vv;
1818         unsigned long *data;
1819         struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1820         struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
1821
1822         digest_size = 0;
1823         if (!trim && peer_device->connection->peer_integrity_tfm) {
1824                 digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm);
1825                 /*
1826                  * FIXME: Receive the incoming digest into the receive buffer
1827                  *        here, together with its struct p_data?
1828                  */
1829                 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1830                 if (err)
1831                         return NULL;
1832                 data_size -= digest_size;
1833         }
1834
1835         /* assume request_size == data_size, but special case trim and wsame. */
1836         ds = data_size;
1837         if (trim) {
1838                 if (!expect(data_size == 0))
1839                         return NULL;
1840                 ds = be32_to_cpu(trim->size);
1841         } else if (wsame) {
1842                 if (data_size != queue_logical_block_size(device->rq_queue)) {
1843                         drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
1844                                 data_size, queue_logical_block_size(device->rq_queue));
1845                         return NULL;
1846                 }
1847                 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
1848                         drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
1849                                 data_size, bdev_logical_block_size(device->ldev->backing_bdev));
1850                         return NULL;
1851                 }
1852                 ds = be32_to_cpu(wsame->size);
1853         }
1854
1855         if (!expect(IS_ALIGNED(ds, 512)))
1856                 return NULL;
1857         if (trim || wsame) {
1858                 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
1859                         return NULL;
1860         } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
1861                 return NULL;
1862
1863         /* even though we trust out peer,
1864          * we sometimes have to double check. */
1865         if (sector + (ds>>9) > capacity) {
1866                 drbd_err(device, "request from peer beyond end of local disk: "
1867                         "capacity: %llus < sector: %llus + size: %u\n",
1868                         (unsigned long long)capacity,
1869                         (unsigned long long)sector, ds);
1870                 return NULL;
1871         }
1872
1873         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1874          * "criss-cross" setup, that might cause write-out on some other DRBD,
1875          * which in turn might block on the other node at this very place.  */
1876         peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
1877         if (!peer_req)
1878                 return NULL;
1879
1880         peer_req->flags |= EE_WRITE;
1881         if (trim) {
1882                 peer_req->flags |= EE_IS_TRIM;
1883                 return peer_req;
1884         }
1885         if (wsame)
1886                 peer_req->flags |= EE_WRITE_SAME;
1887
1888         /* receive payload size bytes into page chain */
1889         ds = data_size;
1890         page = peer_req->pages;
1891         page_chain_for_each(page) {
1892                 unsigned len = min_t(int, ds, PAGE_SIZE);
1893                 data = kmap(page);
1894                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1895                 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
1896                         drbd_err(device, "Fault injection: Corrupting data on receive\n");
1897                         data[0] = data[0] ^ (unsigned long)-1;
1898                 }
1899                 kunmap(page);
1900                 if (err) {
1901                         drbd_free_peer_req(device, peer_req);
1902                         return NULL;
1903                 }
1904                 ds -= len;
1905         }
1906
1907         if (digest_size) {
1908                 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
1909                 if (memcmp(dig_in, dig_vv, digest_size)) {
1910                         drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1911                                 (unsigned long long)sector, data_size);
1912                         drbd_free_peer_req(device, peer_req);
1913                         return NULL;
1914                 }
1915         }
1916         device->recv_cnt += data_size >> 9;
1917         return peer_req;
1918 }
1919
1920 /* drbd_drain_block() just takes a data block
1921  * out of the socket input buffer, and discards it.
1922  */
1923 static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
1924 {
1925         struct page *page;
1926         int err = 0;
1927         void *data;
1928
1929         if (!data_size)
1930                 return 0;
1931
1932         page = drbd_alloc_pages(peer_device, 1, 1);
1933
1934         data = kmap(page);
1935         while (data_size) {
1936                 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1937
1938                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1939                 if (err)
1940                         break;
1941                 data_size -= len;
1942         }
1943         kunmap(page);
1944         drbd_free_pages(peer_device->device, page, 0);
1945         return err;
1946 }
1947
1948 static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
1949                            sector_t sector, int data_size)
1950 {
1951         struct bio_vec bvec;
1952         struct bvec_iter iter;
1953         struct bio *bio;
1954         int digest_size, err, expect;
1955         void *dig_in = peer_device->connection->int_dig_in;
1956         void *dig_vv = peer_device->connection->int_dig_vv;
1957
1958         digest_size = 0;
1959         if (peer_device->connection->peer_integrity_tfm) {
1960                 digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm);
1961                 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1962                 if (err)
1963                         return err;
1964                 data_size -= digest_size;
1965         }
1966
1967         /* optimistically update recv_cnt.  if receiving fails below,
1968          * we disconnect anyways, and counters will be reset. */
1969         peer_device->device->recv_cnt += data_size>>9;
1970
1971         bio = req->master_bio;
1972         D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
1973
1974         bio_for_each_segment(bvec, bio, iter) {
1975                 void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
1976                 expect = min_t(int, data_size, bvec.bv_len);
1977                 err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
1978                 kunmap(bvec.bv_page);
1979                 if (err)
1980                         return err;
1981                 data_size -= expect;
1982         }
1983
1984         if (digest_size) {
1985                 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
1986                 if (memcmp(dig_in, dig_vv, digest_size)) {
1987                         drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
1988                         return -EINVAL;
1989                 }
1990         }
1991
1992         D_ASSERT(peer_device->device, data_size == 0);
1993         return 0;
1994 }
1995
1996 /*
1997  * e_end_resync_block() is called in ack_sender context via
1998  * drbd_finish_peer_reqs().
1999  */
2000 static int e_end_resync_block(struct drbd_work *w, int unused)
2001 {
2002         struct drbd_peer_request *peer_req =
2003                 container_of(w, struct drbd_peer_request, w);
2004         struct drbd_peer_device *peer_device = peer_req->peer_device;
2005         struct drbd_device *device = peer_device->device;
2006         sector_t sector = peer_req->i.sector;
2007         int err;
2008
2009         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
2010
2011         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
2012                 drbd_set_in_sync(device, sector, peer_req->i.size);
2013                 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
2014         } else {
2015                 /* Record failure to sync */
2016                 drbd_rs_failed_io(device, sector, peer_req->i.size);
2017
2018                 err  = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
2019         }
2020         dec_unacked(device);
2021
2022         return err;
2023 }
2024
2025 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
2026                             struct packet_info *pi) __releases(local)
2027 {
2028         struct drbd_device *device = peer_device->device;
2029         struct drbd_peer_request *peer_req;
2030
2031         peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
2032         if (!peer_req)
2033                 goto fail;
2034
2035         dec_rs_pending(device);
2036
2037         inc_unacked(device);
2038         /* corresponding dec_unacked() in e_end_resync_block()
2039          * respective _drbd_clear_done_ee */
2040
2041         peer_req->w.cb = e_end_resync_block;
2042         peer_req->submit_jif = jiffies;
2043
2044         spin_lock_irq(&device->resource->req_lock);
2045         list_add_tail(&peer_req->w.list, &device->sync_ee);
2046         spin_unlock_irq(&device->resource->req_lock);
2047
2048         atomic_add(pi->size >> 9, &device->rs_sect_ev);
2049         if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0,
2050                                      DRBD_FAULT_RS_WR) == 0)
2051                 return 0;
2052
2053         /* don't care for the reason here */
2054         drbd_err(device, "submit failed, triggering re-connect\n");
2055         spin_lock_irq(&device->resource->req_lock);
2056         list_del(&peer_req->w.list);
2057         spin_unlock_irq(&device->resource->req_lock);
2058
2059         drbd_free_peer_req(device, peer_req);
2060 fail:
2061         put_ldev(device);
2062         return -EIO;
2063 }
2064
2065 static struct drbd_request *
2066 find_request(struct drbd_device *device, struct rb_root *root, u64 id,
2067              sector_t sector, bool missing_ok, const char *func)
2068 {
2069         struct drbd_request *req;
2070
2071         /* Request object according to our peer */
2072         req = (struct drbd_request *)(unsigned long)id;
2073         if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
2074                 return req;
2075         if (!missing_ok) {
2076                 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
2077                         (unsigned long)id, (unsigned long long)sector);
2078         }
2079         return NULL;
2080 }
2081
2082 static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
2083 {
2084         struct drbd_peer_device *peer_device;
2085         struct drbd_device *device;
2086         struct drbd_request *req;
2087         sector_t sector;
2088         int err;
2089         struct p_data *p = pi->data;
2090
2091         peer_device = conn_peer_device(connection, pi->vnr);
2092         if (!peer_device)
2093                 return -EIO;
2094         device = peer_device->device;
2095
2096         sector = be64_to_cpu(p->sector);
2097
2098         spin_lock_irq(&device->resource->req_lock);
2099         req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
2100         spin_unlock_irq(&device->resource->req_lock);
2101         if (unlikely(!req))
2102                 return -EIO;
2103
2104         /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
2105          * special casing it there for the various failure cases.
2106          * still no race with drbd_fail_pending_reads */
2107         err = recv_dless_read(peer_device, req, sector, pi->size);
2108         if (!err)
2109                 req_mod(req, DATA_RECEIVED);
2110         /* else: nothing. handled from drbd_disconnect...
2111          * I don't think we may complete this just yet
2112          * in case we are "on-disconnect: freeze" */
2113
2114         return err;
2115 }
2116
2117 static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
2118 {
2119         struct drbd_peer_device *peer_device;
2120         struct drbd_device *device;
2121         sector_t sector;
2122         int err;
2123         struct p_data *p = pi->data;
2124
2125         peer_device = conn_peer_device(connection, pi->vnr);
2126         if (!peer_device)
2127                 return -EIO;
2128         device = peer_device->device;
2129
2130         sector = be64_to_cpu(p->sector);
2131         D_ASSERT(device, p->block_id == ID_SYNCER);
2132
2133         if (get_ldev(device)) {
2134                 /* data is submitted to disk within recv_resync_read.
2135                  * corresponding put_ldev done below on error,
2136                  * or in drbd_peer_request_endio. */
2137                 err = recv_resync_read(peer_device, sector, pi);
2138         } else {
2139                 if (__ratelimit(&drbd_ratelimit_state))
2140                         drbd_err(device, "Can not write resync data to local disk.\n");
2141
2142                 err = drbd_drain_block(peer_device, pi->size);
2143
2144                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2145         }
2146
2147         atomic_add(pi->size >> 9, &device->rs_sect_in);
2148
2149         return err;
2150 }
2151
2152 static void restart_conflicting_writes(struct drbd_device *device,
2153                                        sector_t sector, int size)
2154 {
2155         struct drbd_interval *i;
2156         struct drbd_request *req;
2157
2158         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2159                 if (!i->local)
2160                         continue;
2161                 req = container_of(i, struct drbd_request, i);
2162                 if (req->rq_state & RQ_LOCAL_PENDING ||
2163                     !(req->rq_state & RQ_POSTPONED))
2164                         continue;
2165                 /* as it is RQ_POSTPONED, this will cause it to
2166                  * be queued on the retry workqueue. */
2167                 __req_mod(req, CONFLICT_RESOLVED, NULL);
2168         }
2169 }
2170
2171 /*
2172  * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
2173  */
2174 static int e_end_block(struct drbd_work *w, int cancel)
2175 {
2176         struct drbd_peer_request *peer_req =
2177                 container_of(w, struct drbd_peer_request, w);
2178         struct drbd_peer_device *peer_device = peer_req->peer_device;
2179         struct drbd_device *device = peer_device->device;
2180         sector_t sector = peer_req->i.sector;
2181         int err = 0, pcmd;
2182
2183         if (peer_req->flags & EE_SEND_WRITE_ACK) {
2184                 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
2185                         pcmd = (device->state.conn >= C_SYNC_SOURCE &&
2186                                 device->state.conn <= C_PAUSED_SYNC_T &&
2187                                 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
2188                                 P_RS_WRITE_ACK : P_WRITE_ACK;
2189                         err = drbd_send_ack(peer_device, pcmd, peer_req);
2190                         if (pcmd == P_RS_WRITE_ACK)
2191                                 drbd_set_in_sync(device, sector, peer_req->i.size);
2192                 } else {
2193                         err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
2194                         /* we expect it to be marked out of sync anyways...
2195                          * maybe assert this?  */
2196                 }
2197                 dec_unacked(device);
2198         }
2199
2200         /* we delete from the conflict detection hash _after_ we sent out the
2201          * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
2202         if (peer_req->flags & EE_IN_INTERVAL_TREE) {
2203                 spin_lock_irq(&device->resource->req_lock);
2204                 D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
2205                 drbd_remove_epoch_entry_interval(device, peer_req);
2206                 if (peer_req->flags & EE_RESTART_REQUESTS)
2207                         restart_conflicting_writes(device, sector, peer_req->i.size);
2208                 spin_unlock_irq(&device->resource->req_lock);
2209         } else
2210                 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
2211
2212         drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
2213
2214         return err;
2215 }
2216
2217 static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
2218 {
2219         struct drbd_peer_request *peer_req =
2220                 container_of(w, struct drbd_peer_request, w);
2221         struct drbd_peer_device *peer_device = peer_req->peer_device;
2222         int err;
2223
2224         err = drbd_send_ack(peer_device, ack, peer_req);
2225         dec_unacked(peer_device->device);
2226
2227         return err;
2228 }
2229
2230 static int e_send_superseded(struct drbd_work *w, int unused)
2231 {
2232         return e_send_ack(w, P_SUPERSEDED);
2233 }
2234
2235 static int e_send_retry_write(struct drbd_work *w, int unused)
2236 {
2237         struct drbd_peer_request *peer_req =
2238                 container_of(w, struct drbd_peer_request, w);
2239         struct drbd_connection *connection = peer_req->peer_device->connection;
2240
2241         return e_send_ack(w, connection->agreed_pro_version >= 100 ?
2242                              P_RETRY_WRITE : P_SUPERSEDED);
2243 }
2244
2245 static bool seq_greater(u32 a, u32 b)
2246 {
2247         /*
2248          * We assume 32-bit wrap-around here.
2249          * For 24-bit wrap-around, we would have to shift:
2250          *  a <<= 8; b <<= 8;
2251          */
2252         return (s32)a - (s32)b > 0;
2253 }
2254
2255 static u32 seq_max(u32 a, u32 b)
2256 {
2257         return seq_greater(a, b) ? a : b;
2258 }
2259
2260 static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
2261 {
2262         struct drbd_device *device = peer_device->device;
2263         unsigned int newest_peer_seq;
2264
2265         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
2266                 spin_lock(&device->peer_seq_lock);
2267                 newest_peer_seq = seq_max(device->peer_seq, peer_seq);
2268                 device->peer_seq = newest_peer_seq;
2269                 spin_unlock(&device->peer_seq_lock);
2270                 /* wake up only if we actually changed device->peer_seq */
2271                 if (peer_seq == newest_peer_seq)
2272                         wake_up(&device->seq_wait);
2273         }
2274 }
2275
2276 static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
2277 {
2278         return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
2279 }
2280
2281 /* maybe change sync_ee into interval trees as well? */
2282 static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
2283 {
2284         struct drbd_peer_request *rs_req;
2285         bool rv = false;
2286
2287         spin_lock_irq(&device->resource->req_lock);
2288         list_for_each_entry(rs_req, &device->sync_ee, w.list) {
2289                 if (overlaps(peer_req->i.sector, peer_req->i.size,
2290                              rs_req->i.sector, rs_req->i.size)) {
2291                         rv = true;
2292                         break;
2293                 }
2294         }
2295         spin_unlock_irq(&device->resource->req_lock);
2296
2297         return rv;
2298 }
2299
2300 /* Called from receive_Data.
2301  * Synchronize packets on sock with packets on msock.
2302  *
2303  * This is here so even when a P_DATA packet traveling via sock overtook an Ack
2304  * packet traveling on msock, they are still processed in the order they have
2305  * been sent.
2306  *
2307  * Note: we don't care for Ack packets overtaking P_DATA packets.
2308  *
2309  * In case packet_seq is larger than device->peer_seq number, there are
2310  * outstanding packets on the msock. We wait for them to arrive.
2311  * In case we are the logically next packet, we update device->peer_seq
2312  * ourselves. Correctly handles 32bit wrap around.
2313  *
2314  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
2315  * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
2316  * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
2317  * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
2318  *
2319  * returns 0 if we may process the packet,
2320  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
2321 static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
2322 {
2323         struct drbd_device *device = peer_device->device;
2324         DEFINE_WAIT(wait);
2325         long timeout;
2326         int ret = 0, tp;
2327
2328         if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
2329                 return 0;
2330
2331         spin_lock(&device->peer_seq_lock);
2332         for (;;) {
2333                 if (!seq_greater(peer_seq - 1, device->peer_seq)) {
2334                         device->peer_seq = seq_max(device->peer_seq, peer_seq);
2335                         break;
2336                 }
2337
2338                 if (signal_pending(current)) {
2339                         ret = -ERESTARTSYS;
2340                         break;
2341                 }
2342
2343                 rcu_read_lock();
2344                 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
2345                 rcu_read_unlock();
2346
2347                 if (!tp)
2348                         break;
2349
2350                 /* Only need to wait if two_primaries is enabled */
2351                 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
2352                 spin_unlock(&device->peer_seq_lock);
2353                 rcu_read_lock();
2354                 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
2355                 rcu_read_unlock();
2356                 timeout = schedule_timeout(timeout);
2357                 spin_lock(&device->peer_seq_lock);
2358                 if (!timeout) {
2359                         ret = -ETIMEDOUT;
2360                         drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
2361                         break;
2362                 }
2363         }
2364         spin_unlock(&device->peer_seq_lock);
2365         finish_wait(&device->seq_wait, &wait);
2366         return ret;
2367 }
2368
2369 /* see also bio_flags_to_wire()
2370  * DRBD_REQ_*, because we need to semantically map the flags to data packet
2371  * flags and back. We may replicate to other kernel versions. */
2372 static unsigned long wire_flags_to_bio_flags(u32 dpf)
2373 {
2374         return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2375                 (dpf & DP_FUA ? REQ_FUA : 0) |
2376                 (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
2377 }
2378
2379 static unsigned long wire_flags_to_bio_op(u32 dpf)
2380 {
2381         if (dpf & DP_DISCARD)
2382                 return REQ_OP_DISCARD;
2383         else
2384                 return REQ_OP_WRITE;
2385 }
2386
2387 static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
2388                                     unsigned int size)
2389 {
2390         struct drbd_interval *i;
2391
2392     repeat:
2393         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2394                 struct drbd_request *req;
2395                 struct bio_and_error m;
2396
2397                 if (!i->local)
2398                         continue;
2399                 req = container_of(i, struct drbd_request, i);
2400                 if (!(req->rq_state & RQ_POSTPONED))
2401                         continue;
2402                 req->rq_state &= ~RQ_POSTPONED;
2403                 __req_mod(req, NEG_ACKED, &m);
2404                 spin_unlock_irq(&device->resource->req_lock);
2405                 if (m.bio)
2406                         complete_master_bio(device, &m);
2407                 spin_lock_irq(&device->resource->req_lock);
2408                 goto repeat;
2409         }
2410 }
2411
2412 static int handle_write_conflicts(struct drbd_device *device,
2413                                   struct drbd_peer_request *peer_req)
2414 {
2415         struct drbd_connection *connection = peer_req->peer_device->connection;
2416         bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
2417         sector_t sector = peer_req->i.sector;
2418         const unsigned int size = peer_req->i.size;
2419         struct drbd_interval *i;
2420         bool equal;
2421         int err;
2422
2423         /*
2424          * Inserting the peer request into the write_requests tree will prevent
2425          * new conflicting local requests from being added.
2426          */
2427         drbd_insert_interval(&device->write_requests, &peer_req->i);
2428
2429     repeat:
2430         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2431                 if (i == &peer_req->i)
2432                         continue;
2433                 if (i->completed)
2434                         continue;
2435
2436                 if (!i->local) {
2437                         /*
2438                          * Our peer has sent a conflicting remote request; this
2439                          * should not happen in a two-node setup.  Wait for the
2440                          * earlier peer request to complete.
2441                          */
2442                         err = drbd_wait_misc(device, i);
2443                         if (err)
2444                                 goto out;
2445                         goto repeat;
2446                 }
2447
2448                 equal = i->sector == sector && i->size == size;
2449                 if (resolve_conflicts) {
2450                         /*
2451                          * If the peer request is fully contained within the
2452                          * overlapping request, it can be considered overwritten
2453                          * and thus superseded; otherwise, it will be retried
2454                          * once all overlapping requests have completed.
2455                          */
2456                         bool superseded = i->sector <= sector && i->sector +
2457                                        (i->size >> 9) >= sector + (size >> 9);
2458
2459                         if (!equal)
2460                                 drbd_alert(device, "Concurrent writes detected: "
2461                                                "local=%llus +%u, remote=%llus +%u, "
2462                                                "assuming %s came first\n",
2463                                           (unsigned long long)i->sector, i->size,
2464                                           (unsigned long long)sector, size,
2465                                           superseded ? "local" : "remote");
2466
2467                         peer_req->w.cb = superseded ? e_send_superseded :
2468                                                    e_send_retry_write;
2469                         list_add_tail(&peer_req->w.list, &device->done_ee);
2470                         queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
2471
2472                         err = -ENOENT;
2473                         goto out;
2474                 } else {
2475                         struct drbd_request *req =
2476                                 container_of(i, struct drbd_request, i);
2477
2478                         if (!equal)
2479                                 drbd_alert(device, "Concurrent writes detected: "
2480                                                "local=%llus +%u, remote=%llus +%u\n",
2481                                           (unsigned long long)i->sector, i->size,
2482                                           (unsigned long long)sector, size);
2483
2484                         if (req->rq_state & RQ_LOCAL_PENDING ||
2485                             !(req->rq_state & RQ_POSTPONED)) {
2486                                 /*
2487                                  * Wait for the node with the discard flag to
2488                                  * decide if this request has been superseded
2489                                  * or needs to be retried.
2490                                  * Requests that have been superseded will
2491                                  * disappear from the write_requests tree.
2492                                  *
2493                                  * In addition, wait for the conflicting
2494                                  * request to finish locally before submitting
2495                                  * the conflicting peer request.
2496                                  */
2497                                 err = drbd_wait_misc(device, &req->i);
2498                                 if (err) {
2499                                         _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
2500                                         fail_postponed_requests(device, sector, size);
2501                                         goto out;
2502                                 }
2503                                 goto repeat;
2504                         }
2505                         /*
2506                          * Remember to restart the conflicting requests after
2507                          * the new peer request has completed.
2508                          */
2509                         peer_req->flags |= EE_RESTART_REQUESTS;
2510                 }
2511         }
2512         err = 0;
2513
2514     out:
2515         if (err)
2516                 drbd_remove_epoch_entry_interval(device, peer_req);
2517         return err;
2518 }
2519
2520 /* mirrored write */
2521 static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
2522 {
2523         struct drbd_peer_device *peer_device;
2524         struct drbd_device *device;
2525         struct net_conf *nc;
2526         sector_t sector;
2527         struct drbd_peer_request *peer_req;
2528         struct p_data *p = pi->data;
2529         u32 peer_seq = be32_to_cpu(p->seq_num);
2530         int op, op_flags;
2531         u32 dp_flags;
2532         int err, tp;
2533
2534         peer_device = conn_peer_device(connection, pi->vnr);
2535         if (!peer_device)
2536                 return -EIO;
2537         device = peer_device->device;
2538
2539         if (!get_ldev(device)) {
2540                 int err2;
2541
2542                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2543                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2544                 atomic_inc(&connection->current_epoch->epoch_size);
2545                 err2 = drbd_drain_block(peer_device, pi->size);
2546                 if (!err)
2547                         err = err2;
2548                 return err;
2549         }
2550
2551         /*
2552          * Corresponding put_ldev done either below (on various errors), or in
2553          * drbd_peer_request_endio, if we successfully submit the data at the
2554          * end of this function.
2555          */
2556
2557         sector = be64_to_cpu(p->sector);
2558         peer_req = read_in_block(peer_device, p->block_id, sector, pi);
2559         if (!peer_req) {
2560                 put_ldev(device);
2561                 return -EIO;
2562         }
2563
2564         peer_req->w.cb = e_end_block;
2565         peer_req->submit_jif = jiffies;
2566         peer_req->flags |= EE_APPLICATION;
2567
2568         dp_flags = be32_to_cpu(p->dp_flags);
2569         op = wire_flags_to_bio_op(dp_flags);
2570         op_flags = wire_flags_to_bio_flags(dp_flags);
2571         if (pi->cmd == P_TRIM) {
2572                 D_ASSERT(peer_device, peer_req->i.size > 0);
2573                 D_ASSERT(peer_device, op == REQ_OP_DISCARD);
2574                 D_ASSERT(peer_device, peer_req->pages == NULL);
2575         } else if (peer_req->pages == NULL) {
2576                 D_ASSERT(device, peer_req->i.size == 0);
2577                 D_ASSERT(device, dp_flags & DP_FLUSH);
2578         }
2579
2580         if (dp_flags & DP_MAY_SET_IN_SYNC)
2581                 peer_req->flags |= EE_MAY_SET_IN_SYNC;
2582
2583         spin_lock(&connection->epoch_lock);
2584         peer_req->epoch = connection->current_epoch;
2585         atomic_inc(&peer_req->epoch->epoch_size);
2586         atomic_inc(&peer_req->epoch->active);
2587         spin_unlock(&connection->epoch_lock);
2588
2589         rcu_read_lock();
2590         nc = rcu_dereference(peer_device->connection->net_conf);
2591         tp = nc->two_primaries;
2592         if (peer_device->connection->agreed_pro_version < 100) {
2593                 switch (nc->wire_protocol) {
2594                 case DRBD_PROT_C:
2595                         dp_flags |= DP_SEND_WRITE_ACK;
2596                         break;
2597                 case DRBD_PROT_B:
2598                         dp_flags |= DP_SEND_RECEIVE_ACK;
2599                         break;
2600                 }
2601         }
2602         rcu_read_unlock();
2603
2604         if (dp_flags & DP_SEND_WRITE_ACK) {
2605                 peer_req->flags |= EE_SEND_WRITE_ACK;
2606                 inc_unacked(device);
2607                 /* corresponding dec_unacked() in e_end_block()
2608                  * respective _drbd_clear_done_ee */
2609         }
2610
2611         if (dp_flags & DP_SEND_RECEIVE_ACK) {
2612                 /* I really don't like it that the receiver thread
2613                  * sends on the msock, but anyways */
2614                 drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
2615         }
2616
2617         if (tp) {
2618                 /* two primaries implies protocol C */
2619                 D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
2620                 peer_req->flags |= EE_IN_INTERVAL_TREE;
2621                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2622                 if (err)
2623                         goto out_interrupted;
2624                 spin_lock_irq(&device->resource->req_lock);
2625                 err = handle_write_conflicts(device, peer_req);
2626                 if (err) {
2627                         spin_unlock_irq(&device->resource->req_lock);
2628                         if (err == -ENOENT) {
2629                                 put_ldev(device);
2630                                 return 0;
2631                         }
2632                         goto out_interrupted;
2633                 }
2634         } else {
2635                 update_peer_seq(peer_device, peer_seq);
2636                 spin_lock_irq(&device->resource->req_lock);
2637         }
2638         /* TRIM and WRITE_SAME are processed synchronously,
2639          * we wait for all pending requests, respectively wait for
2640          * active_ee to become empty in drbd_submit_peer_request();
2641          * better not add ourselves here. */
2642         if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
2643                 list_add_tail(&peer_req->w.list, &device->active_ee);
2644         spin_unlock_irq(&device->resource->req_lock);
2645
2646         if (device->state.conn == C_SYNC_TARGET)
2647                 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
2648
2649         if (device->state.pdsk < D_INCONSISTENT) {
2650                 /* In case we have the only disk of the cluster, */
2651                 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
2652                 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2653                 drbd_al_begin_io(device, &peer_req->i);
2654                 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2655         }
2656
2657         err = drbd_submit_peer_request(device, peer_req, op, op_flags,
2658                                        DRBD_FAULT_DT_WR);
2659         if (!err)
2660                 return 0;
2661
2662         /* don't care for the reason here */
2663         drbd_err(device, "submit failed, triggering re-connect\n");
2664         spin_lock_irq(&device->resource->req_lock);
2665         list_del(&peer_req->w.list);
2666         drbd_remove_epoch_entry_interval(device, peer_req);
2667         spin_unlock_irq(&device->resource->req_lock);
2668         if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
2669                 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
2670                 drbd_al_complete_io(device, &peer_req->i);
2671         }
2672
2673 out_interrupted:
2674         drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
2675         put_ldev(device);
2676         drbd_free_peer_req(device, peer_req);
2677         return err;
2678 }
2679
2680 /* We may throttle resync, if the lower device seems to be busy,
2681  * and current sync rate is above c_min_rate.
2682  *
2683  * To decide whether or not the lower device is busy, we use a scheme similar
2684  * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
2685  * (more than 64 sectors) of activity we cannot account for with our own resync
2686  * activity, it obviously is "busy".
2687  *
2688  * The current sync rate used here uses only the most recent two step marks,
2689  * to have a short time average so we can react faster.
2690  */
2691 bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2692                 bool throttle_if_app_is_waiting)
2693 {
2694         struct lc_element *tmp;
2695         bool throttle = drbd_rs_c_min_rate_throttle(device);
2696
2697         if (!throttle || throttle_if_app_is_waiting)
2698                 return throttle;
2699
2700         spin_lock_irq(&device->al_lock);
2701         tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
2702         if (tmp) {
2703                 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2704                 if (test_bit(BME_PRIORITY, &bm_ext->flags))
2705                         throttle = false;
2706                 /* Do not slow down if app IO is already waiting for this extent,
2707                  * and our progress is necessary for application IO to complete. */
2708         }
2709         spin_unlock_irq(&device->al_lock);
2710
2711         return throttle;
2712 }
2713
2714 bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2715 {
2716         struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
2717         unsigned long db, dt, dbdt;
2718         unsigned int c_min_rate;
2719         int curr_events;
2720
2721         rcu_read_lock();
2722         c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2723         rcu_read_unlock();
2724
2725         /* feature disabled? */
2726         if (c_min_rate == 0)
2727                 return false;
2728
2729         curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2730                       (int)part_stat_read(&disk->part0, sectors[1]) -
2731                         atomic_read(&device->rs_sect_ev);
2732
2733         if (atomic_read(&device->ap_actlog_cnt)
2734             || curr_events - device->rs_last_events > 64) {
2735                 unsigned long rs_left;
2736                 int i;
2737
2738                 device->rs_last_events = curr_events;
2739
2740                 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2741                  * approx. */
2742                 i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2743
2744                 if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2745                         rs_left = device->ov_left;
2746                 else
2747                         rs_left = drbd_bm_total_weight(device) - device->rs_failed;
2748
2749                 dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
2750                 if (!dt)
2751                         dt++;
2752                 db = device->rs_mark_left[i] - rs_left;
2753                 dbdt = Bit2KB(db/dt);
2754
2755                 if (dbdt > c_min_rate)
2756                         return true;
2757         }
2758         return false;
2759 }
2760
2761 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
2762 {
2763         struct drbd_peer_device *peer_device;
2764         struct drbd_device *device;
2765         sector_t sector;
2766         sector_t capacity;
2767         struct drbd_peer_request *peer_req;
2768         struct digest_info *di = NULL;
2769         int size, verb;
2770         unsigned int fault_type;
2771         struct p_block_req *p = pi->data;
2772
2773         peer_device = conn_peer_device(connection, pi->vnr);
2774         if (!peer_device)
2775                 return -EIO;
2776         device = peer_device->device;
2777         capacity = drbd_get_capacity(device->this_bdev);
2778
2779         sector = be64_to_cpu(p->sector);
2780         size   = be32_to_cpu(p->blksize);
2781
2782         if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
2783                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2784                                 (unsigned long long)sector, size);
2785                 return -EINVAL;
2786         }
2787         if (sector + (size>>9) > capacity) {
2788                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2789                                 (unsigned long long)sector, size);
2790                 return -EINVAL;
2791         }
2792
2793         if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
2794                 verb = 1;
2795                 switch (pi->cmd) {
2796                 case P_DATA_REQUEST:
2797                         drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2798                         break;
2799                 case P_RS_THIN_REQ:
2800                 case P_RS_DATA_REQUEST:
2801                 case P_CSUM_RS_REQUEST:
2802                 case P_OV_REQUEST:
2803                         drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
2804                         break;
2805                 case P_OV_REPLY:
2806                         verb = 0;
2807                         dec_rs_pending(device);
2808                         drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
2809                         break;
2810                 default:
2811                         BUG();
2812                 }
2813                 if (verb && __ratelimit(&drbd_ratelimit_state))
2814                         drbd_err(device, "Can not satisfy peer's read request, "
2815                             "no local data.\n");
2816
2817                 /* drain possibly payload */
2818                 return drbd_drain_block(peer_device, pi->size);
2819         }
2820
2821         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2822          * "criss-cross" setup, that might cause write-out on some other DRBD,
2823          * which in turn might block on the other node at this very place.  */
2824         peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2825                         size, GFP_NOIO);
2826         if (!peer_req) {
2827                 put_ldev(device);
2828                 return -ENOMEM;
2829         }
2830
2831         switch (pi->cmd) {
2832         case P_DATA_REQUEST:
2833                 peer_req->w.cb = w_e_end_data_req;
2834                 fault_type = DRBD_FAULT_DT_RD;
2835                 /* application IO, don't drbd_rs_begin_io */
2836                 peer_req->flags |= EE_APPLICATION;
2837                 goto submit;
2838
2839         case P_RS_THIN_REQ:
2840                 /* If at some point in the future we have a smart way to
2841                    find out if this data block is completely deallocated,
2842                    then we would do something smarter here than reading
2843                    the block... */
2844                 peer_req->flags |= EE_RS_THIN_REQ;
2845         case P_RS_DATA_REQUEST:
2846                 peer_req->w.cb = w_e_end_rsdata_req;
2847                 fault_type = DRBD_FAULT_RS_RD;
2848                 /* used in the sector offset progress display */
2849                 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2850                 break;
2851
2852         case P_OV_REPLY:
2853         case P_CSUM_RS_REQUEST:
2854                 fault_type = DRBD_FAULT_RS_RD;
2855                 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
2856                 if (!di)
2857                         goto out_free_e;
2858
2859                 di->digest_size = pi->size;
2860                 di->digest = (((char *)di)+sizeof(struct digest_info));
2861
2862                 peer_req->digest = di;
2863                 peer_req->flags |= EE_HAS_DIGEST;
2864
2865                 if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
2866                         goto out_free_e;
2867
2868                 if (pi->cmd == P_CSUM_RS_REQUEST) {
2869                         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
2870                         peer_req->w.cb = w_e_end_csum_rs_req;
2871                         /* used in the sector offset progress display */
2872                         device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2873                         /* remember to report stats in drbd_resync_finished */
2874                         device->use_csums = true;
2875                 } else if (pi->cmd == P_OV_REPLY) {
2876                         /* track progress, we may need to throttle */
2877                         atomic_add(size >> 9, &device->rs_sect_in);
2878                         peer_req->w.cb = w_e_end_ov_reply;
2879                         dec_rs_pending(device);
2880                         /* drbd_rs_begin_io done when we sent this request,
2881                          * but accounting still needs to be done. */
2882                         goto submit_for_resync;
2883                 }
2884                 break;
2885
2886         case P_OV_REQUEST:
2887                 if (device->ov_start_sector == ~(sector_t)0 &&
2888                     peer_device->connection->agreed_pro_version >= 90) {
2889                         unsigned long now = jiffies;
2890                         int i;
2891                         device->ov_start_sector = sector;
2892                         device->ov_position = sector;
2893                         device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2894                         device->rs_total = device->ov_left;
2895                         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2896                                 device->rs_mark_left[i] = device->ov_left;
2897                                 device->rs_mark_time[i] = now;
2898                         }
2899                         drbd_info(device, "Online Verify start sector: %llu\n",
2900                                         (unsigned long long)sector);
2901                 }
2902                 peer_req->w.cb = w_e_end_ov_req;
2903                 fault_type = DRBD_FAULT_RS_RD;
2904                 break;
2905
2906         default:
2907                 BUG();
2908         }
2909
2910         /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2911          * wrt the receiver, but it is not as straightforward as it may seem.
2912          * Various places in the resync start and stop logic assume resync
2913          * requests are processed in order, requeuing this on the worker thread
2914          * introduces a bunch of new code for synchronization between threads.
2915          *
2916          * Unlimited throttling before drbd_rs_begin_io may stall the resync
2917          * "forever", throttling after drbd_rs_begin_io will lock that extent
2918          * for application writes for the same time.  For now, just throttle
2919          * here, where the rest of the code expects the receiver to sleep for
2920          * a while, anyways.
2921          */
2922
2923         /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2924          * this defers syncer requests for some time, before letting at least
2925          * on request through.  The resync controller on the receiving side
2926          * will adapt to the incoming rate accordingly.
2927          *
2928          * We cannot throttle here if remote is Primary/SyncTarget:
2929          * we would also throttle its application reads.
2930          * In that case, throttling is done on the SyncTarget only.
2931          */
2932
2933         /* Even though this may be a resync request, we do add to "read_ee";
2934          * "sync_ee" is only used for resync WRITEs.
2935          * Add to list early, so debugfs can find this request
2936          * even if we have to sleep below. */
2937         spin_lock_irq(&device->resource->req_lock);
2938         list_add_tail(&peer_req->w.list, &device->read_ee);
2939         spin_unlock_irq(&device->resource->req_lock);
2940
2941         update_receiver_timing_details(connection, drbd_rs_should_slow_down);
2942         if (device->state.peer != R_PRIMARY
2943         && drbd_rs_should_slow_down(device, sector, false))
2944                 schedule_timeout_uninterruptible(HZ/10);
2945         update_receiver_timing_details(connection, drbd_rs_begin_io);
2946         if (drbd_rs_begin_io(device, sector))
2947                 goto out_free_e;
2948
2949 submit_for_resync:
2950         atomic_add(size >> 9, &device->rs_sect_ev);
2951
2952 submit:
2953         update_receiver_timing_details(connection, drbd_submit_peer_request);
2954         inc_unacked(device);
2955         if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
2956                                      fault_type) == 0)
2957                 return 0;
2958
2959         /* don't care for the reason here */
2960         drbd_err(device, "submit failed, triggering re-connect\n");
2961
2962 out_free_e:
2963         spin_lock_irq(&device->resource->req_lock);
2964         list_del(&peer_req->w.list);
2965         spin_unlock_irq(&device->resource->req_lock);
2966         /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2967
2968         put_ldev(device);
2969         drbd_free_peer_req(device, peer_req);
2970         return -EIO;
2971 }
2972
2973 /**
2974  * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
2975  */
2976 static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
2977 {
2978         struct drbd_device *device = peer_device->device;
2979         int self, peer, rv = -100;
2980         unsigned long ch_self, ch_peer;
2981         enum drbd_after_sb_p after_sb_0p;
2982
2983         self = device->ldev->md.uuid[UI_BITMAP] & 1;
2984         peer = device->p_uuid[UI_BITMAP] & 1;
2985
2986         ch_peer = device->p_uuid[UI_SIZE];
2987         ch_self = device->comm_bm_set;
2988
2989         rcu_read_lock();
2990         after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
2991         rcu_read_unlock();
2992         switch (after_sb_0p) {
2993         case ASB_CONSENSUS:
2994         case ASB_DISCARD_SECONDARY:
2995         case ASB_CALL_HELPER:
2996         case ASB_VIOLENTLY:
2997                 drbd_err(device, "Configuration error.\n");
2998                 break;
2999         case ASB_DISCONNECT:
3000                 break;
3001         case ASB_DISCARD_YOUNGER_PRI:
3002                 if (self == 0 && peer == 1) {
3003                         rv = -1;
3004                         break;
3005                 }
3006                 if (self == 1 && peer == 0) {
3007                         rv =  1;
3008                         break;
3009                 }
3010                 /* Else fall through to one of the other strategies... */
3011         case ASB_DISCARD_OLDER_PRI:
3012                 if (self == 0 && peer == 1) {
3013                         rv = 1;
3014                         break;
3015                 }
3016                 if (self == 1 && peer == 0) {
3017                         rv = -1;
3018                         break;
3019                 }
3020                 /* Else fall through to one of the other strategies... */
3021                 drbd_warn(device, "Discard younger/older primary did not find a decision\n"
3022                      "Using discard-least-changes instead\n");
3023         case ASB_DISCARD_ZERO_CHG:
3024                 if (ch_peer == 0 && ch_self == 0) {
3025                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3026                                 ? -1 : 1;
3027                         break;
3028                 } else {
3029                         if (ch_peer == 0) { rv =  1; break; }
3030                         if (ch_self == 0) { rv = -1; break; }
3031                 }
3032                 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
3033                         break;
3034         case ASB_DISCARD_LEAST_CHG:
3035                 if      (ch_self < ch_peer)
3036                         rv = -1;
3037                 else if (ch_self > ch_peer)
3038                         rv =  1;
3039                 else /* ( ch_self == ch_peer ) */
3040                      /* Well, then use something else. */
3041                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3042                                 ? -1 : 1;
3043                 break;
3044         case ASB_DISCARD_LOCAL:
3045                 rv = -1;
3046                 break;
3047         case ASB_DISCARD_REMOTE:
3048                 rv =  1;
3049         }
3050
3051         return rv;
3052 }
3053
3054 /**
3055  * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
3056  */
3057 static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
3058 {
3059         struct drbd_device *device = peer_device->device;
3060         int hg, rv = -100;
3061         enum drbd_after_sb_p after_sb_1p;
3062
3063         rcu_read_lock();
3064         after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
3065         rcu_read_unlock();
3066         switch (after_sb_1p) {
3067         case ASB_DISCARD_YOUNGER_PRI:
3068         case ASB_DISCARD_OLDER_PRI:
3069         case ASB_DISCARD_LEAST_CHG:
3070         case ASB_DISCARD_LOCAL:
3071         case ASB_DISCARD_REMOTE:
3072         case ASB_DISCARD_ZERO_CHG:
3073                 drbd_err(device, "Configuration error.\n");
3074                 break;
3075         case ASB_DISCONNECT:
3076                 break;
3077         case ASB_CONSENSUS:
3078                 hg = drbd_asb_recover_0p(peer_device);
3079                 if (hg == -1 && device->state.role == R_SECONDARY)
3080                         rv = hg;
3081                 if (hg == 1  && device->state.role == R_PRIMARY)
3082                         rv = hg;
3083                 break;
3084         case ASB_VIOLENTLY:
3085                 rv = drbd_asb_recover_0p(peer_device);
3086                 break;
3087         case ASB_DISCARD_SECONDARY:
3088                 return device->state.role == R_PRIMARY ? 1 : -1;
3089         case ASB_CALL_HELPER:
3090                 hg = drbd_asb_recover_0p(peer_device);
3091                 if (hg == -1 && device->state.role == R_PRIMARY) {
3092                         enum drbd_state_rv rv2;
3093
3094                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3095                           * we might be here in C_WF_REPORT_PARAMS which is transient.
3096                           * we do not need to wait for the after state change work either. */
3097                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3098                         if (rv2 != SS_SUCCESS) {
3099                                 drbd_khelper(device, "pri-lost-after-sb");
3100                         } else {
3101                                 drbd_warn(device, "Successfully gave up primary role.\n");
3102                                 rv = hg;
3103                         }
3104                 } else
3105                         rv = hg;
3106         }
3107
3108         return rv;
3109 }
3110
3111 /**
3112  * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
3113  */
3114 static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
3115 {
3116         struct drbd_device *device = peer_device->device;
3117         int hg, rv = -100;
3118         enum drbd_after_sb_p after_sb_2p;
3119
3120         rcu_read_lock();
3121         after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
3122         rcu_read_unlock();
3123         switch (after_sb_2p) {
3124         case ASB_DISCARD_YOUNGER_PRI:
3125         case ASB_DISCARD_OLDER_PRI:
3126         case ASB_DISCARD_LEAST_CHG:
3127         case ASB_DISCARD_LOCAL:
3128         case ASB_DISCARD_REMOTE:
3129         case ASB_CONSENSUS:
3130         case ASB_DISCARD_SECONDARY:
3131         case ASB_DISCARD_ZERO_CHG:
3132                 drbd_err(device, "Configuration error.\n");
3133                 break;
3134         case ASB_VIOLENTLY:
3135                 rv = drbd_asb_recover_0p(peer_device);
3136                 break;
3137         case ASB_DISCONNECT:
3138                 break;
3139         case ASB_CALL_HELPER:
3140                 hg = drbd_asb_recover_0p(peer_device);
3141                 if (hg == -1) {
3142                         enum drbd_state_rv rv2;
3143
3144                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3145                           * we might be here in C_WF_REPORT_PARAMS which is transient.
3146                           * we do not need to wait for the after state change work either. */
3147                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3148                         if (rv2 != SS_SUCCESS) {
3149                                 drbd_khelper(device, "pri-lost-after-sb");
3150                         } else {
3151                                 drbd_warn(device, "Successfully gave up primary role.\n");
3152                                 rv = hg;
3153                         }
3154                 } else
3155                         rv = hg;
3156         }
3157
3158         return rv;
3159 }
3160
3161 static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
3162                            u64 bits, u64 flags)
3163 {
3164         if (!uuid) {
3165                 drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
3166                 return;
3167         }
3168         drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
3169              text,
3170              (unsigned long long)uuid[UI_CURRENT],
3171              (unsigned long long)uuid[UI_BITMAP],
3172              (unsigned long long)uuid[UI_HISTORY_START],
3173              (unsigned long long)uuid[UI_HISTORY_END],
3174              (unsigned long long)bits,
3175              (unsigned long long)flags);
3176 }
3177
/*
 * drbd_uuid_compare() - compare our UUID set with the peer's and derive
 * the resync decision.
 *
 * @device:    device whose ldev->md.uuid[] (ours) and p_uuid[] (peer's)
 *             are compared
 * @peer_role: current role of the peer
 * @rule_nr:   out parameter; set to the number of the rule that was being
 *             evaluated when the function returned
 *
 * Return values:
 *
 *   100   after split brain try auto recover
 *     2   C_SYNC_SOURCE set BitMap
 *     1   C_SYNC_SOURCE use BitMap
 *     0   no Sync
 *    -1   C_SYNC_TARGET use BitMap
 *    -2   C_SYNC_TARGET set BitMap
 *  -100   after split brain, disconnect
 * -1000   unrelated data
 * -1091   requires proto 91
 * -1096   requires proto 96
 *
 * In addition, rule 41 may return -(0x10000 | PRO_VERSION_MAX |
 * (DRBD_FF_WSAME << 8)): a value below -0x10000 that carries a required
 * protocol version in the low byte and required feature flags in the
 * next byte of its negation.
 *
 * Besides comparing, rules 34, 35, 51 and 71 modify the local or the
 * peer UUID arrays to undo/complete a resync start or finish that one
 * side missed.
 */

static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
{
        struct drbd_peer_device *const peer_device = first_peer_device(device);
        /* NOTE(review): connection is NULL when there is no first peer device,
         * yet it is dereferenced without a check further down. Presumably
         * callers guarantee a peer device exists -- confirm. */
        struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        u64 self, peer;
        int i, j;

        /* All UUID comparisons are done with the lowest bit masked off. */
        self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);

        /* Rule 10: both sides are freshly created, nothing to sync. */
        *rule_nr = 10;
        if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
                return 0;

        /* Rule 20: we are freshly created (or have a zero current UUID),
         * the peer is not. */
        *rule_nr = 20;
        if ((self == UUID_JUST_CREATED || self == (u64)0) &&
             peer != UUID_JUST_CREATED)
                return -2;

        /* Rule 30: the mirror image of rule 20. */
        *rule_nr = 30;
        if (self != UUID_JUST_CREATED &&
            (peer == UUID_JUST_CREATED || peer == (u64)0))
                return 2;

        /* Current UUIDs are equal (rules 34..41). */
        if (self == peer) {
                int rct, dc; /* roles at crash time */

                /* We still have a bitmap UUID, the peer has none. */
                if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        /* Our bitmap/first-history UUIDs match the peer's first two
                         * history UUIDs: rotate our own UUIDs the same way. */
                        if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
                            (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
                                drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
                                drbd_uuid_move_history(device);
                                device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
                                device->ldev->md.uuid[UI_BITMAP] = 0;

                                drbd_uuid_dump(device, "self", device->ldev->md.uuid,
                                               device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
                                *rule_nr = 34;
                        } else {
                                drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
                                *rule_nr = 36;
                        }

                        return 1;
                }

                /* The peer still has a bitmap UUID, we have none. */
                if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        /* The peer's bitmap/first-history UUIDs match our first two
                         * history UUIDs: rotate our copy of the peer's UUIDs. */
                        if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
                            (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
                                drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");

                                device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
                                device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
                                device->p_uuid[UI_BITMAP] = 0UL;

                                drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
                                *rule_nr = 35;
                        } else {
                                drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
                                *rule_nr = 37;
                        }

                        return -1;
                }

                /* Common power [off|failure] */
                rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
                        (device->p_uuid[UI_FLAGS] & 2);
                /* lowest bit is set when we were primary,
                 * next bit (weight 2) is set when peer was primary */
                *rule_nr = 40;

                /* Neither has the "crashed primary" flag set,
                 * only a replication link hiccup. */
                if (rct == 0)
                        return 0;

                /* Current UUID equal and no bitmap uuid; does not necessarily
                 * mean this was a "simultaneous hard crash", maybe IO was
                 * frozen, so no UUID-bump happened.
                 * This is a protocol change, overload DRBD_FF_WSAME as flag
                 * for "new-enough" peer DRBD version. */
                if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
                        *rule_nr = 41;
                        if (!(connection->agreed_features & DRBD_FF_WSAME)) {
                                drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
                                /* Encodes required protocol version and feature
                                 * flags; see the return value list above. */
                                return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
                        }
                        if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
                                /* At least one has the "crashed primary" bit set,
                                 * both are primary now, but neither has rotated its UUIDs?
                                 * "Can not happen." */
                                drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
                                return -100;
                        }
                        /* Exactly one side is primary now: it becomes sync source. */
                        if (device->state.role == R_PRIMARY)
                                return 1;
                        return -1;
                }

                /* Both are secondary.
                 * Really looks like recovery from simultaneous hard crash.
                 * Check which had been primary before, and arbitrate. */
                /* rct can only be 0..3 (0/1 plus 0/2), so every path returns. */
                switch (rct) {
                case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
                case 1: /*  self_pri && !peer_pri */ return 1;
                case 2: /* !self_pri &&  peer_pri */ return -1;
                case 3: /*  self_pri &&  peer_pri */
                        /* Tie: the RESOLVE_CONFLICTS connection flag decides. */
                        dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
                        return dc ? -1 : 1;
                }
        }

        /* Rule 50: our current UUID equals the peer's bitmap UUID. */
        *rule_nr = 50;
        peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
        if (self == peer)
                return -1;

        /* Rule 51: our current UUID equals the peer's first history UUID. */
        *rule_nr = 51;
        peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
                /* Which additional condition applies depends on the agreed
                 * protocol version (before 96 vs. 96 and later). */
                if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
                    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the last start of
                           resync as sync source modifications of the peer's UUIDs. */

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
                        device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];

                        drbd_info(device, "Lost last syncUUID packet, corrected:\n");
                        drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);

                        return -1;
                }
        }

        /* Rule 60: our current UUID appears anywhere in the peer's history. */
        *rule_nr = 60;
        self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
                peer = device->p_uuid[i] & ~((u64)1);
                if (self == peer)
                        return -2;
        }

        /* Rule 70: our bitmap UUID equals the peer's current UUID
         * (mirror image of rule 50). */
        *rule_nr = 70;
        self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
        if (self == peer)
                return 1;

        /* Rule 71: our first history UUID equals the peer's current UUID
         * (mirror image of rule 51). */
        *rule_nr = 71;
        self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
                if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
                    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the last start of
                           resync as sync source modifications of our UUIDs. */

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
                        __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);

                        drbd_info(device, "Last syncUUID did not get through, corrected:\n");
                        drbd_uuid_dump(device, "self", device->ldev->md.uuid,
                                       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);

                        return 1;
                }
        }


        /* Rule 80: the peer's current UUID appears anywhere in our history
         * (mirror image of rule 60). */
        *rule_nr = 80;
        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
                self = device->ldev->md.uuid[i] & ~((u64)1);
                if (self == peer)
                        return 2;
        }

        /* Rule 90: both sides have the same, non-zero bitmap UUID. */
        *rule_nr = 90;
        self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
        peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
        if (self == peer && self != ((u64)0))
                return 100;

        /* Rule 100: any of our history UUIDs equals any of the peer's. */
        *rule_nr = 100;
        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
                self = device->ldev->md.uuid[i] & ~((u64)1);
                for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
                        peer = device->p_uuid[j] & ~((u64)1);
                        if (self == peer)
                                return -100;
                }
        }

        /* No rule matched: nothing in common at all. */
        return -1000;
}
3405
3406 /* drbd_sync_handshake() returns the new conn state on success, or
3407    CONN_MASK (-1) on failure.
3408  */
3409 static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3410                                            enum drbd_role peer_role,
3411                                            enum drbd_disk_state peer_disk) __must_hold(local)
3412 {
3413         struct drbd_device *device = peer_device->device;
3414         enum drbd_conns rv = C_MASK;
3415         enum drbd_disk_state mydisk;
3416         struct net_conf *nc;
3417         int hg, rule_nr, rr_conflict, tentative;
3418
3419         mydisk = device->state.disk;
3420         if (mydisk == D_NEGOTIATING)
3421                 mydisk = device->new_state_tmp.disk;
3422
3423         drbd_info(device, "drbd_sync_handshake:\n");
3424
3425         spin_lock_irq(&device->ldev->md.uuid_lock);
3426         drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3427         drbd_uuid_dump(device, "peer", device->p_uuid,
3428                        device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3429
3430         hg = drbd_uuid_compare(device, peer_role, &rule_nr);
3431         spin_unlock_irq(&device->ldev->md.uuid_lock);
3432
3433         drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
3434
3435         if (hg == -1000) {
3436                 drbd_alert(device, "Unrelated data, aborting!\n");
3437                 return C_MASK;
3438         }
3439         if (hg < -0x10000) {
3440                 int proto, fflags;
3441                 hg = -hg;
3442                 proto = hg & 0xff;
3443                 fflags = (hg >> 8) & 0xff;
3444                 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3445                                         proto, fflags);
3446                 return C_MASK;
3447         }
3448         if (hg < -1000) {
3449                 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3450                 return C_MASK;
3451         }
3452
3453         if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3454             (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
3455                 int f = (hg == -100) || abs(hg) == 2;
3456                 hg = mydisk > D_INCONSISTENT ? 1 : -1;
3457                 if (f)
3458                         hg = hg*2;
3459                 drbd_info(device, "Becoming sync %s due to disk states.\n",
3460                      hg > 0 ? "source" : "target");
3461         }
3462
3463         if (abs(hg) == 100)
3464                 drbd_khelper(device, "initial-split-brain");
3465
3466         rcu_read_lock();
3467         nc = rcu_dereference(peer_device->connection->net_conf);
3468
3469         if (hg == 100 || (hg == -100 && nc->always_asbp)) {
3470                 int pcount = (device->state.role == R_PRIMARY)
3471                            + (peer_role == R_PRIMARY);
3472                 int forced = (hg == -100);
3473
3474                 switch (pcount) {
3475                 case 0:
3476                         hg = drbd_asb_recover_0p(peer_device);
3477                         break;
3478                 case 1:
3479                         hg = drbd_asb_recover_1p(peer_device);
3480                         break;
3481                 case 2:
3482                         hg = drbd_asb_recover_2p(peer_device);
3483                         break;
3484                 }
3485                 if (abs(hg) < 100) {
3486                         drbd_warn(device, "Split-Brain detected, %d primaries, "
3487                              "automatically solved. Sync from %s node\n",
3488                              pcount, (hg < 0) ? "peer" : "this");
3489                         if (forced) {
3490                                 drbd_warn(device, "Doing a full sync, since"
3491                                      " UUIDs where ambiguous.\n");
3492                                 hg = hg*2;
3493                         }
3494                 }
3495         }
3496
3497         if (hg == -100) {
3498                 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
3499                         hg = -1;
3500                 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
3501                         hg = 1;
3502
3503                 if (abs(hg) < 100)
3504                         drbd_warn(device, "Split-Brain detected, manually solved. "
3505                              "Sync from %s node\n",
3506                              (hg < 0) ? "peer" : "this");
3507         }
3508         rr_conflict = nc->rr_conflict;
3509         tentative = nc->tentative;
3510         rcu_read_unlock();
3511
3512         if (hg == -100) {
3513                 /* FIXME this log message is not correct if we end up here
3514                  * after an attempted attach on a diskless node.
3515                  * We just refuse to attach -- well, we drop the "connection"
3516                  * to that disk, in a way... */
3517                 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
3518                 drbd_khelper(device, "split-brain");
3519                 return C_MASK;
3520         }
3521
3522         if (hg > 0 && mydisk <= D_INCONSISTENT) {
3523                 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
3524                 return C_MASK;
3525         }
3526
3527         if (hg < 0 && /* by intention we do not use mydisk here. */
3528             device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
3529                 switch (rr_conflict) {
3530                 case ASB_CALL_HELPER:
3531                         drbd_khelper(device, "pri-lost");
3532                         /* fall through */
3533                 case ASB_DISCONNECT:
3534                         drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
3535                         return C_MASK;
3536                 case ASB_VIOLENTLY:
3537                         drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
3538                              "assumption\n");
3539                 }
3540         }
3541
3542         if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
3543                 if (hg == 0)
3544                         drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
3545                 else
3546                         drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
3547                                  drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3548                                  abs(hg) >= 2 ? "full" : "bit-map based");
3549                 return C_MASK;
3550         }
3551
3552         if (abs(hg) >= 2) {
3553                 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
3554                 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3555                                         BM_LOCKED_SET_ALLOWED))
3556                         return C_MASK;
3557         }
3558
3559         if (hg > 0) { /* become sync source. */
3560                 rv = C_WF_BITMAP_S;
3561         } else if (hg < 0) { /* become sync target */
3562                 rv = C_WF_BITMAP_T;
3563         } else {
3564                 rv = C_CONNECTED;
3565                 if (drbd_bm_total_weight(device)) {
3566                         drbd_info(device, "No resync, but %lu bits in bitmap!\n",
3567                              drbd_bm_total_weight(device));
3568                 }
3569         }
3570
3571         return rv;
3572 }
3573
3574 static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
3575 {
3576         /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
3577         if (peer == ASB_DISCARD_REMOTE)
3578                 return ASB_DISCARD_LOCAL;
3579
3580         /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
3581         if (peer == ASB_DISCARD_LOCAL)
3582                 return ASB_DISCARD_REMOTE;
3583
3584         /* everything else is valid if they are equal on both sides. */
3585         return peer;
3586 }
3587
/*
 * receive_protocol() - handle a protocol packet received from the peer
 * @connection: connection the packet arrived on
 * @pi:         packet info; pi->data holds the struct p_protocol header,
 *              pi->size the length of the data-integrity algorithm name
 *              that follows it
 *
 * Unless the packet is a P_PROTOCOL_UPDATE, the peer's settings are first
 * checked against our own net_conf.  Then the peer's data-integrity hash
 * and its digest buffers are allocated, and a copy of the net_conf with
 * the peer's (converted) settings is published via RCU.
 *
 * Returns 0 on success.  Returns -EIO after requesting C_DISCONNECTING on
 * incompatible settings or allocation failure, -EIO without a state
 * request if the algorithm name is too long, or the error reported by
 * drbd_recv_all().
 */
static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
{
        struct p_protocol *p = pi->data;
        enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
        int p_proto, p_discard_my_data, p_two_primaries, cf;
        struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
        char integrity_alg[SHARED_SECRET_MAX] = "";
        struct crypto_ahash *peer_integrity_tfm = NULL;
        void *int_dig_in = NULL, *int_dig_vv = NULL;

        /* Decode the big-endian fields of the packet header. */
        p_proto         = be32_to_cpu(p->protocol);
        p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
        p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
        p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
        p_two_primaries = be32_to_cpu(p->two_primaries);
        cf              = be32_to_cpu(p->conn_flags);
        p_discard_my_data = cf & CF_DISCARD_MY_DATA;

        /* From protocol version 87 on, the name of the data-integrity
         * algorithm follows the header. */
        if (connection->agreed_pro_version >= 87) {
                int err;

                if (pi->size > sizeof(integrity_alg))
                        return -EIO;
                err = drbd_recv_all(connection, integrity_alg, pi->size);
                if (err)
                        return err;
                /* Never trust the peer to have NUL-terminated the name. */
                integrity_alg[SHARED_SECRET_MAX - 1] = 0;
        }

        if (pi->cmd != P_PROTOCOL_UPDATE) {
                /* The dry-run flag always follows what the peer just sent. */
                clear_bit(CONN_DRY_RUN, &connection->flags);

                if (cf & CF_DRY_RUN)
                        set_bit(CONN_DRY_RUN, &connection->flags);

                /* Compare the peer's settings with ours; the first
                 * mismatch is logged and ends in a disconnect. */
                rcu_read_lock();
                nc = rcu_dereference(connection->net_conf);

                if (p_proto != nc->wire_protocol) {
                        drbd_err(connection, "incompatible %s settings\n", "protocol");
                        goto disconnect_rcu_unlock;
                }

                if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
                        drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
                        goto disconnect_rcu_unlock;
                }

                if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
                        drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
                        goto disconnect_rcu_unlock;
                }

                if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
                        drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
                        goto disconnect_rcu_unlock;
                }

                /* Only one side may ask to have its data discarded. */
                if (p_discard_my_data && nc->discard_my_data) {
                        drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
                        goto disconnect_rcu_unlock;
                }

                if (p_two_primaries != nc->two_primaries) {
                        drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
                        goto disconnect_rcu_unlock;
                }

                if (strcmp(integrity_alg, nc->integrity_alg)) {
                        drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
                        goto disconnect_rcu_unlock;
                }

                rcu_read_unlock();
        }

        if (integrity_alg[0]) {
                int hash_size;

                /*
                 * We can only change the peer data integrity algorithm
                 * here.  Changing our own data integrity algorithm
                 * requires that we send a P_PROTOCOL_UPDATE packet at
                 * the same time; otherwise, the peer has no way to
                 * tell between which packets the algorithm should
                 * change.
                 */

                peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
                if (IS_ERR(peer_integrity_tfm)) {
                        /* Reset to NULL so the cleanup path does not try to
                         * free an error pointer. */
                        peer_integrity_tfm = NULL;
                        drbd_err(connection, "peer data-integrity-alg %s not supported\n",
                                 integrity_alg);
                        goto disconnect;
                }

                /* Two buffers, each sized for one digest of this hash. */
                hash_size = crypto_ahash_digestsize(peer_integrity_tfm);
                int_dig_in = kmalloc(hash_size, GFP_KERNEL);
                int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
                if (!(int_dig_in && int_dig_vv)) {
                        drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
                        goto disconnect;
                }
        }

        new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
        if (!new_net_conf) {
                drbd_err(connection, "Allocation of new net_conf failed\n");
                goto disconnect;
        }

        /* Build the new net_conf as a copy of the current one with the
         * peer's settings applied, and publish it, while holding both
         * the data mutex and conf_update. */
        mutex_lock(&connection->data.mutex);
        mutex_lock(&connection->resource->conf_update);
        old_net_conf = connection->net_conf;
        *new_net_conf = *old_net_conf;

        new_net_conf->wire_protocol = p_proto;
        new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
        new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
        new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
        new_net_conf->two_primaries = p_two_primaries;

        rcu_assign_pointer(connection->net_conf, new_net_conf);
        mutex_unlock(&connection->resource->conf_update);
        mutex_unlock(&connection->data.mutex);

        /* Replace the previous peer integrity hash and digest buffers
         * with the ones allocated above (all NULL if no algorithm). */
        crypto_free_ahash(connection->peer_integrity_tfm);
        kfree(connection->int_dig_in);
        kfree(connection->int_dig_vv);
        connection->peer_integrity_tfm = peer_integrity_tfm;
        connection->int_dig_in = int_dig_in;
        connection->int_dig_vv = int_dig_vv;

        if (strcmp(old_net_conf->integrity_alg, integrity_alg))
                drbd_info(connection, "peer data-integrity-alg: %s\n",
                          integrity_alg[0] ? integrity_alg : "(none)");

        /* Wait for readers that may still use the old net_conf before
         * freeing it. */
        synchronize_rcu();
        kfree(old_net_conf);
        return 0;

disconnect_rcu_unlock:
        rcu_read_unlock();
disconnect:
        /* Release whatever was allocated so far and force a disconnect. */
        crypto_free_ahash(peer_integrity_tfm);
        kfree(int_dig_in);
        kfree(int_dig_vv);
        conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
        return -EIO;
}
3738
3739 /* helper function
3740  * input: alg name, feature name
3741  * return: NULL (alg name was "")
3742  *         ERR_PTR(error) if something goes wrong
3743  *         or the crypto hash ptr, if it worked out ok. */
3744 static struct crypto_ahash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
3745                 const char *alg, const char *name)
3746 {
3747         struct crypto_ahash *tfm;
3748
3749         if (!alg[0])
3750                 return NULL;
3751
3752         tfm = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC);
3753         if (IS_ERR(tfm)) {
3754                 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3755                         alg, name, PTR_ERR(tfm));
3756                 return tfm;
3757         }
3758         return tfm;
3759 }
3760
3761 static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
3762 {
3763         void *buffer = connection->data.rbuf;
3764         int size = pi->size;
3765
3766         while (size) {
3767                 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3768                 s = drbd_recv(connection, buffer, s);
3769                 if (s <= 0) {
3770                         if (s < 0)
3771                                 return s;
3772                         break;
3773                 }
3774                 size -= s;
3775         }
3776         if (size)
3777                 return -EIO;
3778         return 0;
3779 }
3780
3781 /*
3782  * config_unknown_volume  -  device configuration command for unknown volume
3783  *
3784  * When a device is added to an existing connection, the node on which the
3785  * device is added first will send configuration commands to its peer but the
3786  * peer will not know about the device yet.  It will warn and ignore these
3787  * commands.  Once the device is added on the second node, the second node will
3788  * send the same device configuration commands, but in the other direction.
3789  *
3790  * (We can also end up here if drbd is misconfigured.)
3791  */
3792 static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
3793 {
3794         drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
3795                   cmdname(pi->cmd), pi->vnr);
3796         return ignore_remaining_packet(connection, pi);
3797 }
3798
3799 static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
3800 {
3801         struct drbd_peer_device *peer_device;
3802         struct drbd_device *device;
3803         struct p_rs_param_95 *p;
3804         unsigned int header_size, data_size, exp_max_sz;
3805         struct crypto_ahash *verify_tfm = NULL;
3806         struct crypto_ahash *csums_tfm = NULL;
3807         struct net_conf *old_net_conf, *new_net_conf = NULL;
3808         struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
3809         const int apv = connection->agreed_pro_version;
3810         struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
3811         int fifo_size = 0;
3812         int err;
3813
3814         peer_device = conn_peer_device(connection, pi->vnr);
3815         if (!peer_device)
3816                 return config_unknown_volume(connection, pi);
3817         device = peer_device->device;
3818
3819         exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
3820                     : apv == 88 ? sizeof(struct p_rs_param)
3821                                         + SHARED_SECRET_MAX
3822                     : apv <= 94 ? sizeof(struct p_rs_param_89)
3823                     : /* apv >= 95 */ sizeof(struct p_rs_param_95);
3824
3825         if (pi->size > exp_max_sz) {
3826                 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
3827                     pi->size, exp_max_sz);
3828                 return -EIO;
3829         }
3830
3831         if (apv <= 88) {
3832                 header_size = sizeof(struct p_rs_param);
3833                 data_size = pi->size - header_size;
3834         } else if (apv <= 94) {
3835                 header_size = sizeof(struct p_rs_param_89);
3836                 data_size = pi->size - header_size;
3837                 D_ASSERT(device, data_size == 0);
3838         } else {
3839                 header_size = sizeof(struct p_rs_param_95);
3840                 data_size = pi->size - header_size;
3841                 D_ASSERT(device, data_size == 0);
3842         }
3843
3844         /* initialize verify_alg and csums_alg */
3845         p = pi->data;
3846         memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3847
3848         err = drbd_recv_all(peer_device->connection, p, header_size);
3849         if (err)
3850                 return err;
3851
3852         mutex_lock(&connection->resource->conf_update);
3853         old_net_conf = peer_device->connection->net_conf;
3854         if (get_ldev(device)) {
3855                 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3856                 if (!new_disk_conf) {
3857                         put_ldev(device);
3858                         mutex_unlock(&connection->resource->conf_update);
3859                         drbd_err(device, "Allocation of new disk_conf failed\n");
3860                         return -ENOMEM;
3861                 }
3862
3863                 old_disk_conf = device->ldev->disk_conf;
3864                 *new_disk_conf = *old_disk_conf;
3865
3866                 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
3867         }
3868
3869         if (apv >= 88) {
3870                 if (apv == 88) {
3871                         if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3872                                 drbd_err(device, "verify-alg of wrong size, "
3873                                         "peer wants %u, accepting only up to %u byte\n",
3874                                         data_size, SHARED_SECRET_MAX);
3875                                 err = -EIO;
3876                                 goto reconnect;
3877                         }
3878
3879                         err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
3880                         if (err)
3881                                 goto reconnect;
3882                         /* we expect NUL terminated string */
3883                         /* but just in case someone tries to be evil */
3884                         D_ASSERT(device, p->verify_alg[data_size-1] == 0);
3885                         p->verify_alg[data_size-1] = 0;
3886
3887                 } else /* apv >= 89 */ {
3888                         /* we still expect NUL terminated strings */
3889                         /* but just in case someone tries to be evil */
3890                         D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3891                         D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3892                         p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3893                         p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3894                 }
3895
3896                 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
3897                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3898                                 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
3899                                     old_net_conf->verify_alg, p->verify_alg);
3900                                 goto disconnect;
3901                         }
3902                         verify_tfm = drbd_crypto_alloc_digest_safe(device,
3903                                         p->verify_alg, "verify-alg");
3904                         if (IS_ERR(verify_tfm)) {
3905                                 verify_tfm = NULL;
3906                                 goto disconnect;
3907                         }
3908                 }
3909
3910                 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
3911                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3912                                 drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
3913                                     old_net_conf->csums_alg, p->csums_alg);
3914                                 goto disconnect;
3915                         }
3916                         csums_tfm = drbd_crypto_alloc_digest_safe(device,
3917                                         p->csums_alg, "csums-alg");
3918                         if (IS_ERR(csums_tfm)) {
3919                                 csums_tfm = NULL;
3920                                 goto disconnect;
3921                         }
3922                 }
3923
3924                 if (apv > 94 && new_disk_conf) {
3925                         new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3926                         new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3927                         new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3928                         new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
3929
3930                         fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
3931                         if (fifo_size != device->rs_plan_s->size) {
3932                                 new_plan = fifo_alloc(fifo_size);
3933                                 if (!new_plan) {
3934                                         drbd_err(device, "kmalloc of fifo_buffer failed");
3935                                         put_ldev(device);
3936                                         goto disconnect;
3937                                 }
3938                         }
3939                 }
3940
3941                 if (verify_tfm || csums_tfm) {
3942                         new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3943                         if (!new_net_conf) {
3944                                 drbd_err(device, "Allocation of new net_conf failed\n");
3945                                 goto disconnect;
3946                         }
3947
3948                         *new_net_conf = *old_net_conf;
3949
3950                         if (verify_tfm) {
3951                                 strcpy(new_net_conf->verify_alg, p->verify_alg);
3952                                 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3953                                 crypto_free_ahash(peer_device->connection->verify_tfm);
3954                                 peer_device->connection->verify_tfm = verify_tfm;
3955                                 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
3956                         }
3957                         if (csums_tfm) {
3958                                 strcpy(new_net_conf->csums_alg, p->csums_alg);
3959                                 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3960                                 crypto_free_ahash(peer_device->connection->csums_tfm);
3961                                 peer_device->connection->csums_tfm = csums_tfm;
3962                                 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
3963                         }
3964                         rcu_assign_pointer(connection->net_conf, new_net_conf);
3965                 }
3966         }
3967
3968         if (new_disk_conf) {
3969                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
3970                 put_ldev(device);
3971         }
3972
3973         if (new_plan) {
3974                 old_plan = device->rs_plan_s;
3975                 rcu_assign_pointer(device->rs_plan_s, new_plan);
3976         }
3977
3978         mutex_unlock(&connection->resource->conf_update);
3979         synchronize_rcu();
3980         if (new_net_conf)
3981                 kfree(old_net_conf);
3982         kfree(old_disk_conf);
3983         kfree(old_plan);
3984
3985         return 0;
3986
3987 reconnect:
3988         if (new_disk_conf) {
3989                 put_ldev(device);
3990                 kfree(new_disk_conf);
3991         }
3992         mutex_unlock(&connection->resource->conf_update);
3993         return -EIO;
3994
3995 disconnect:
3996         kfree(new_plan);
3997         if (new_disk_conf) {
3998                 put_ldev(device);
3999                 kfree(new_disk_conf);
4000         }
4001         mutex_unlock(&connection->resource->conf_update);
4002         /* just for completeness: actually not needed,
4003          * as this is not reached if csums_tfm was ok. */
4004         crypto_free_ahash(csums_tfm);
4005         /* but free the verify_tfm again, if csums_tfm did not work out */
4006         crypto_free_ahash(verify_tfm);
4007         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4008         return -EIO;
4009 }
4010
4011 /* warn if the arguments differ by more than 12.5% */
4012 static void warn_if_differ_considerably(struct drbd_device *device,
4013         const char *s, sector_t a, sector_t b)
4014 {
4015         sector_t d;
4016         if (a == 0 || b == 0)
4017                 return;
4018         d = (a > b) ? (a - b) : (b - a);
4019         if (d > (a>>3) || d > (b>>3))
4020                 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
4021                      (unsigned long long)a, (unsigned long long)b);
4022 }
4023
4024 static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
4025 {
4026         struct drbd_peer_device *peer_device;
4027         struct drbd_device *device;
4028         struct p_sizes *p = pi->data;
4029         struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
4030         enum determine_dev_size dd = DS_UNCHANGED;
4031         sector_t p_size, p_usize, p_csize, my_usize;
4032         int ldsc = 0; /* local disk size changed */
4033         enum dds_flags ddsf;
4034
4035         peer_device = conn_peer_device(connection, pi->vnr);
4036         if (!peer_device)
4037                 return config_unknown_volume(connection, pi);
4038         device = peer_device->device;
4039
4040         p_size = be64_to_cpu(p->d_size);
4041         p_usize = be64_to_cpu(p->u_size);
4042         p_csize = be64_to_cpu(p->c_size);
4043
4044         /* just store the peer's disk size for now.
4045          * we still need to figure out whether we accept that. */
4046         device->p_size = p_size;
4047
4048         if (get_ldev(device)) {
4049                 sector_t new_size, cur_size;
4050                 rcu_read_lock();
4051                 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
4052                 rcu_read_unlock();
4053
4054                 warn_if_differ_considerably(device, "lower level device sizes",
4055                            p_size, drbd_get_max_capacity(device->ldev));
4056                 warn_if_differ_considerably(device, "user requested size",
4057                                             p_usize, my_usize);
4058
4059                 /* if this is the first connect, or an otherwise expected
4060                  * param exchange, choose the minimum */
4061                 if (device->state.conn == C_WF_REPORT_PARAMS)
4062                         p_usize = min_not_zero(my_usize, p_usize);
4063
4064                 /* Never shrink a device with usable data during connect.
4065                    But allow online shrinking if we are connected. */
4066                 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
4067                 cur_size = drbd_get_capacity(device->this_bdev);
4068                 if (new_size < cur_size &&
4069                     device->state.disk >= D_OUTDATED &&
4070                     device->state.conn < C_CONNECTED) {
4071                         drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
4072                                         (unsigned long long)new_size, (unsigned long long)cur_size);
4073                         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4074                         put_ldev(device);
4075                         return -EIO;
4076                 }
4077
4078                 if (my_usize != p_usize) {
4079                         struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
4080
4081                         new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
4082                         if (!new_disk_conf) {
4083                                 drbd_err(device, "Allocation of new disk_conf failed\n");
4084                                 put_ldev(device);
4085                                 return -ENOMEM;
4086                         }
4087
4088                         mutex_lock(&connection->resource->conf_update);
4089                         old_disk_conf = device->ldev->disk_conf;
4090                         *new_disk_conf = *old_disk_conf;
4091                         new_disk_conf->disk_size = p_usize;
4092
4093                         rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4094                         mutex_unlock(&connection->resource->conf_update);
4095                         synchronize_rcu();
4096                         kfree(old_disk_conf);
4097
4098                         drbd_info(device, "Peer sets u_size to %lu sectors\n",
4099                                  (unsigned long)my_usize);
4100                 }
4101
4102                 put_ldev(device);
4103         }
4104
4105         device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
4106         /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
4107            In case we cleared the QUEUE_FLAG_DISCARD from our queue in
4108            drbd_reconsider_queue_parameters(), we can be sure that after
4109            drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
4110
4111         ddsf = be16_to_cpu(p->dds_flags);
4112         if (get_ldev(device)) {
4113                 drbd_reconsider_queue_parameters(device, device->ldev, o);
4114                 dd = drbd_determine_dev_size(device, ddsf, NULL);
4115                 put_ldev(device);
4116                 if (dd == DS_ERROR)
4117                         return -EIO;
4118                 drbd_md_sync(device);
4119         } else {
4120                 /*
4121                  * I am diskless, need to accept the peer's *current* size.
4122                  * I must NOT accept the peers backing disk size,
4123                  * it may have been larger than mine all along...
4124                  *
4125                  * At this point, the peer knows more about my disk, or at
4126                  * least about what we last agreed upon, than myself.
4127                  * So if his c_size is less than his d_size, the most likely
4128                  * reason is that *my* d_size was smaller last time we checked.
4129                  *
4130                  * However, if he sends a zero current size,
4131                  * take his (user-capped or) backing disk size anyways.
4132                  */
4133                 drbd_reconsider_queue_parameters(device, NULL, o);
4134                 drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
4135         }
4136
4137         if (get_ldev(device)) {
4138                 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
4139                         device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
4140                         ldsc = 1;
4141                 }
4142
4143                 put_ldev(device);
4144         }
4145
4146         if (device->state.conn > C_WF_REPORT_PARAMS) {
4147                 if (be64_to_cpu(p->c_size) !=
4148                     drbd_get_capacity(device->this_bdev) || ldsc) {
4149                         /* we have different sizes, probably peer
4150                          * needs to know my new size... */
4151                         drbd_send_sizes(peer_device, 0, ddsf);
4152                 }
4153                 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
4154                     (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
4155                         if (device->state.pdsk >= D_INCONSISTENT &&
4156                             device->state.disk >= D_INCONSISTENT) {
4157                                 if (ddsf & DDSF_NO_RESYNC)
4158                                         drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
4159                                 else
4160                                         resync_after_online_grow(device);
4161                         } else
4162                                 set_bit(RESYNC_AFTER_NEG, &device->flags);
4163                 }
4164         }
4165
4166         return 0;
4167 }
4168
4169 static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
4170 {
4171         struct drbd_peer_device *peer_device;
4172         struct drbd_device *device;
4173         struct p_uuids *p = pi->data;
4174         u64 *p_uuid;
4175         int i, updated_uuids = 0;
4176
4177         peer_device = conn_peer_device(connection, pi->vnr);
4178         if (!peer_device)
4179                 return config_unknown_volume(connection, pi);
4180         device = peer_device->device;
4181
4182         p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
4183         if (!p_uuid) {
4184                 drbd_err(device, "kmalloc of p_uuid failed\n");
4185                 return false;
4186         }
4187
4188         for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
4189                 p_uuid[i] = be64_to_cpu(p->uuid[i]);
4190
4191         kfree(device->p_uuid);
4192         device->p_uuid = p_uuid;
4193
4194         if (device->state.conn < C_CONNECTED &&
4195             device->state.disk < D_INCONSISTENT &&
4196             device->state.role == R_PRIMARY &&
4197             (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
4198                 drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
4199                     (unsigned long long)device->ed_uuid);
4200                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4201                 return -EIO;
4202         }
4203
4204         if (get_ldev(device)) {
4205                 int skip_initial_sync =
4206                         device->state.conn == C_CONNECTED &&
4207                         peer_device->connection->agreed_pro_version >= 90 &&
4208                         device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
4209                         (p_uuid[UI_FLAGS] & 8);
4210                 if (skip_initial_sync) {
4211                         drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
4212                         drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
4213                                         "clear_n_write from receive_uuids",
4214                                         BM_LOCKED_TEST_ALLOWED);
4215                         _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
4216                         _drbd_uuid_set(device, UI_BITMAP, 0);
4217                         _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
4218                                         CS_VERBOSE, NULL);
4219                         drbd_md_sync(device);
4220                         updated_uuids = 1;
4221                 }
4222                 put_ldev(device);
4223         } else if (device->state.disk < D_INCONSISTENT &&
4224                    device->state.role == R_PRIMARY) {
4225                 /* I am a diskless primary, the peer just created a new current UUID
4226                    for me. */
4227                 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4228         }
4229
4230         /* Before we test for the disk state, we should wait until an eventually
4231            ongoing cluster wide state change is finished. That is important if
4232            we are primary and are detaching from our disk. We need to see the
4233            new disk state... */
4234         mutex_lock(device->state_mutex);
4235         mutex_unlock(device->state_mutex);
4236         if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
4237                 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4238
4239         if (updated_uuids)
4240                 drbd_print_uuids(device, "receiver updated UUIDs to");
4241
4242         return 0;
4243 }
4244
4245 /**
4246  * convert_state() - Converts the peer's view of the cluster state to our point of view
4247  * @ps:         The state as seen by the peer.
4248  */
4249 static union drbd_state convert_state(union drbd_state ps)
4250 {
4251         union drbd_state ms;
4252
4253         static enum drbd_conns c_tab[] = {
4254                 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
4255                 [C_CONNECTED] = C_CONNECTED,
4256
4257                 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
4258                 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
4259                 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
4260                 [C_VERIFY_S]       = C_VERIFY_T,
4261                 [C_MASK]   = C_MASK,
4262         };
4263
4264         ms.i = ps.i;
4265
4266         ms.conn = c_tab[ps.conn];
4267         ms.peer = ps.role;
4268         ms.role = ps.peer;
4269         ms.pdsk = ps.disk;
4270         ms.disk = ps.pdsk;
4271         ms.peer_isp = (ps.aftr_isp | ps.user_isp);
4272
4273         return ms;
4274 }
4275
4276 static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
4277 {
4278         struct drbd_peer_device *peer_device;
4279         struct drbd_device *device;
4280         struct p_req_state *p = pi->data;
4281         union drbd_state mask, val;
4282         enum drbd_state_rv rv;
4283
4284         peer_device = conn_peer_device(connection, pi->vnr);
4285         if (!peer_device)
4286                 return -EIO;
4287         device = peer_device->device;
4288
4289         mask.i = be32_to_cpu(p->mask);
4290         val.i = be32_to_cpu(p->val);
4291
4292         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
4293             mutex_is_locked(device->state_mutex)) {
4294                 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
4295                 return 0;
4296         }
4297
4298         mask = convert_state(mask);
4299         val = convert_state(val);
4300
4301         rv = drbd_change_state(device, CS_VERBOSE, mask, val);
4302         drbd_send_sr_reply(peer_device, rv);
4303
4304         drbd_md_sync(device);
4305
4306         return 0;
4307 }
4308
4309 static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
4310 {
4311         struct p_req_state *p = pi->data;
4312         union drbd_state mask, val;
4313         enum drbd_state_rv rv;
4314
4315         mask.i = be32_to_cpu(p->mask);
4316         val.i = be32_to_cpu(p->val);
4317
4318         if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4319             mutex_is_locked(&connection->cstate_mutex)) {
4320                 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
4321                 return 0;
4322         }
4323
4324         mask = convert_state(mask);
4325         val = convert_state(val);
4326
4327         rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4328         conn_send_sr_reply(connection, rv);
4329
4330         return 0;
4331 }
4332
4333 static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
4334 {
4335         struct drbd_peer_device *peer_device;
4336         struct drbd_device *device;
4337         struct p_state *p = pi->data;
4338         union drbd_state os, ns, peer_state;
4339         enum drbd_disk_state real_peer_disk;
4340         enum chg_state_flags cs_flags;
4341         int rv;
4342
4343         peer_device = conn_peer_device(connection, pi->vnr);
4344         if (!peer_device)
4345                 return config_unknown_volume(connection, pi);
4346         device = peer_device->device;
4347
4348         peer_state.i = be32_to_cpu(p->state);
4349
4350         real_peer_disk = peer_state.disk;
4351         if (peer_state.disk == D_NEGOTIATING) {
4352                 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
4353                 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
4354         }
4355
4356         spin_lock_irq(&device->resource->req_lock);
4357  retry:
4358         os = ns = drbd_read_state(device);
4359         spin_unlock_irq(&device->resource->req_lock);
4360
4361         /* If some other part of the code (ack_receiver thread, timeout)
4362          * already decided to close the connection again,
4363          * we must not "re-establish" it here. */
4364         if (os.conn <= C_TEAR_DOWN)
4365                 return -ECONNRESET;
4366
4367         /* If this is the "end of sync" confirmation, usually the peer disk
4368          * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
4369          * set) resync started in PausedSyncT, or if the timing of pause-/
4370          * unpause-sync events has been "just right", the peer disk may
4371          * transition from D_CONSISTENT to D_UP_TO_DATE as well.
4372          */
4373         if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
4374             real_peer_disk == D_UP_TO_DATE &&
4375             os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
4376                 /* If we are (becoming) SyncSource, but peer is still in sync
4377                  * preparation, ignore its uptodate-ness to avoid flapping, it
4378                  * will change to inconsistent once the peer reaches active
4379                  * syncing states.
4380                  * It may have changed syncer-paused flags, however, so we
4381                  * cannot ignore this completely. */
4382                 if (peer_state.conn > C_CONNECTED &&
4383                     peer_state.conn < C_SYNC_SOURCE)
4384                         real_peer_disk = D_INCONSISTENT;
4385
4386                 /* if peer_state changes to connected at the same time,
4387                  * it explicitly notifies us that it finished resync.
4388                  * Maybe we should finish it up, too? */
4389                 else if (os.conn >= C_SYNC_SOURCE &&
4390                          peer_state.conn == C_CONNECTED) {
4391                         if (drbd_bm_total_weight(device) <= device->rs_failed)
4392                                 drbd_resync_finished(device);
4393                         return 0;
4394                 }
4395         }
4396
4397         /* explicit verify finished notification, stop sector reached. */
4398         if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
4399             peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
4400                 ov_out_of_sync_print(device);
4401                 drbd_resync_finished(device);
4402                 return 0;
4403         }
4404
4405         /* peer says his disk is inconsistent, while we think it is uptodate,
4406          * and this happens while the peer still thinks we have a sync going on,
4407          * but we think we are already done with the sync.
4408          * We ignore this to avoid flapping pdsk.
4409          * This should not happen, if the peer is a recent version of drbd. */
4410         if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
4411             os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
4412                 real_peer_disk = D_UP_TO_DATE;
4413
4414         if (ns.conn == C_WF_REPORT_PARAMS)
4415                 ns.conn = C_CONNECTED;
4416
4417         if (peer_state.conn == C_AHEAD)
4418                 ns.conn = C_BEHIND;
4419
4420         if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
4421             get_ldev_if_state(device, D_NEGOTIATING)) {
4422                 int cr; /* consider resync */
4423
4424                 /* if we established a new connection */
4425                 cr  = (os.conn < C_CONNECTED);
4426                 /* if we had an established connection
4427                  * and one of the nodes newly attaches a disk */
4428                 cr |= (os.conn == C_CONNECTED &&
4429                        (peer_state.disk == D_NEGOTIATING ||
4430                         os.disk == D_NEGOTIATING));
4431                 /* if we have both been inconsistent, and the peer has been
4432                  * forced to be UpToDate with --overwrite-data */
4433                 cr |= test_bit(CONSIDER_RESYNC, &device->flags);
4434                 /* if we had been plain connected, and the admin requested to
4435                  * start a sync by "invalidate" or "invalidate-remote" */
4436                 cr |= (os.conn == C_CONNECTED &&
4437                                 (peer_state.conn >= C_STARTING_SYNC_S &&
4438                                  peer_state.conn <= C_WF_BITMAP_T));
4439
4440                 if (cr)
4441                         ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
4442
4443                 put_ldev(device);
4444                 if (ns.conn == C_MASK) {
4445                         ns.conn = C_CONNECTED;
4446                         if (device->state.disk == D_NEGOTIATING) {
4447                                 drbd_force_state(device, NS(disk, D_FAILED));
4448                         } else if (peer_state.disk == D_NEGOTIATING) {
4449                                 drbd_err(device, "Disk attach process on the peer node was aborted.\n");
4450                                 peer_state.disk = D_DISKLESS;
4451                                 real_peer_disk = D_DISKLESS;
4452                         } else {
4453                                 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
4454                                         return -EIO;
4455                                 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
4456                                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4457                                 return -EIO;
4458                         }
4459                 }
4460         }
4461
4462         spin_lock_irq(&device->resource->req_lock);
4463         if (os.i != drbd_read_state(device).i)
4464                 goto retry;
4465         clear_bit(CONSIDER_RESYNC, &device->flags);
4466         ns.peer = peer_state.role;
4467         ns.pdsk = real_peer_disk;
4468         ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4469         if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
4470                 ns.disk = device->new_state_tmp.disk;
4471         cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
4472         if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4473             test_bit(NEW_CUR_UUID, &device->flags)) {
4474                 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
4475                    for temporal network outages! */
4476                 spin_unlock_irq(&device->resource->req_lock);
4477                 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
4478                 tl_clear(peer_device->connection);
4479                 drbd_uuid_new_current(device);
4480                 clear_bit(NEW_CUR_UUID, &device->flags);
4481                 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
4482                 return -EIO;
4483         }
4484         rv = _drbd_set_state(device, ns, cs_flags, NULL);
4485         ns = drbd_read_state(device);
4486         spin_unlock_irq(&device->resource->req_lock);
4487
4488         if (rv < SS_SUCCESS) {
4489                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4490                 return -EIO;
4491         }
4492
4493         if (os.conn > C_WF_REPORT_PARAMS) {
4494                 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
4495                     peer_state.disk != D_NEGOTIATING ) {
4496                         /* we want resync, peer has not yet decided to sync... */
4497                         /* Nowadays only used when forcing a node into primary role and
4498                            setting its disk to UpToDate with that */
4499                         drbd_send_uuids(peer_device);
4500                         drbd_send_current_state(peer_device);
4501                 }
4502         }
4503
4504         clear_bit(DISCARD_MY_DATA, &device->flags);
4505
4506         drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
4507
4508         return 0;
4509 }
4510
4511 static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
4512 {
4513         struct drbd_peer_device *peer_device;
4514         struct drbd_device *device;
4515         struct p_rs_uuid *p = pi->data;
4516
4517         peer_device = conn_peer_device(connection, pi->vnr);
4518         if (!peer_device)
4519                 return -EIO;
4520         device = peer_device->device;
4521
4522         wait_event(device->misc_wait,
4523                    device->state.conn == C_WF_SYNC_UUID ||
4524                    device->state.conn == C_BEHIND ||
4525                    device->state.conn < C_CONNECTED ||
4526                    device->state.disk < D_NEGOTIATING);
4527
4528         /* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
4529
4530         /* Here the _drbd_uuid_ functions are right, current should
4531            _not_ be rotated into the history */
4532         if (get_ldev_if_state(device, D_NEGOTIATING)) {
4533                 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4534                 _drbd_uuid_set(device, UI_BITMAP, 0UL);
4535
4536                 drbd_print_uuids(device, "updated sync uuid");
4537                 drbd_start_resync(device, C_SYNC_TARGET);
4538
4539                 put_ldev(device);
4540         } else
4541                 drbd_err(device, "Ignoring SyncUUID packet!\n");
4542
4543         return 0;
4544 }
4545
4546 /**
4547  * receive_bitmap_plain
4548  *
4549  * Return 0 when done, 1 when another iteration is needed, and a negative error
4550  * code upon failure.
4551  */
4552 static int
4553 receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
4554                      unsigned long *p, struct bm_xfer_ctx *c)
4555 {
4556         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4557                                  drbd_header_size(peer_device->connection);
4558         unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
4559                                        c->bm_words - c->word_offset);
4560         unsigned int want = num_words * sizeof(*p);
4561         int err;
4562
4563         if (want != size) {
4564                 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
4565                 return -EIO;
4566         }
4567         if (want == 0)
4568                 return 0;
4569         err = drbd_recv_all(peer_device->connection, p, want);
4570         if (err)
4571                 return err;
4572
4573         drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
4574
4575         c->word_offset += num_words;
4576         c->bit_offset = c->word_offset * BITS_PER_LONG;
4577         if (c->bit_offset > c->bm_bits)
4578                 c->bit_offset = c->bm_bits;
4579
4580         return 1;
4581 }
4582
4583 static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4584 {
4585         return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4586 }
4587
4588 static int dcbp_get_start(struct p_compressed_bm *p)
4589 {
4590         return (p->encoding & 0x80) != 0;
4591 }
4592
4593 static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4594 {
4595         return (p->encoding >> 4) & 0x7;
4596 }
4597
/**
 * recv_bm_rle_bits - decode one run-length encoded bitmap packet
 * @peer_device: peer device the bitmap belongs to
 * @p: received packet; p->code holds the encoded bit stream
 * @c: transfer context; decoding starts at c->bit_offset and the offsets
 *     are updated on success
 * @len: length of the code area handed to bitstream_init()
 *
 * The stream is a sequence of variable-length encoded run lengths. Runs
 * alternate between "leave bits alone" and "set bits"; dcbp_get_start()
 * selects which kind comes first.
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
 * code upon failure.
 */
static int
recv_bm_rle_bits(struct drbd_peer_device *peer_device,
                struct p_compressed_bm *p,
                 struct bm_xfer_ctx *c,
                 unsigned int len)
{
        struct bitstream bs;
        u64 look_ahead;         /* not yet decoded bits fetched from the stream */
        u64 rl;                 /* run length decoded in the current iteration */
        u64 tmp;
        unsigned long s = c->bit_offset;        /* first bit of the current run */
        unsigned long e;                        /* last bit of the current run */
        int toggle = dcbp_get_start(p);         /* non-zero: current run sets bits */
        int have;               /* number of valid bits in look_ahead */
        int bits;

        bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));

        /* prime the look-ahead window with up to 64 bits */
        bits = bitstream_get_bits(&bs, &look_ahead, 64);
        if (bits < 0)
                return -EIO;

        /* one run per iteration; the run kind flips after every run */
        for (have = bits; have > 0; s += rl, toggle = !toggle) {
                bits = vli_decode_bits(&rl, look_ahead);
                if (bits <= 0)
                        return -EIO;

                if (toggle) {
                        /* NOTE(review): rl is a peer-supplied u64 while s and e
                         * are unsigned long; wrap-around or truncation of
                         * s + rl - 1 is not checked here -- confirm that
                         * vli_decode_bits() bounds rl sufficiently. */
                        e = s + rl -1;
                        if (e >= c->bm_bits) {
                                drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
                                return -EIO;
                        }
                        _drbd_bm_set_bits(peer_device->device, s, e);
                }

                /* the code word claims more bits than the window holds */
                if (have < bits) {
                        drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
                                have, bits, look_ahead,
                                (unsigned int)(bs.cur.b - p->code),
                                (unsigned int)bs.buf_len);
                        return -EIO;
                }
                /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
                if (likely(bits < 64))
                        look_ahead >>= bits;
                else
                        look_ahead = 0;
                have -= bits;

                /* refill the window back up to 64 bits */
                bits = bitstream_get_bits(&bs, &tmp, 64 - have);
                if (bits < 0)
                        return -EIO;
                look_ahead |= tmp << have;
                have += bits;
        }

        c->bit_offset = s;
        bm_xfer_ctx_bit_to_word_offset(c);

        /* more packets are needed unless the end of the bitmap was reached */
        return (s != c->bm_bits);
}
4666
4667 /**
4668  * decode_bitmap_c
4669  *
4670  * Return 0 when done, 1 when another iteration is needed, and a negative error
4671  * code upon failure.
4672  */
4673 static int
4674 decode_bitmap_c(struct drbd_peer_device *peer_device,
4675                 struct p_compressed_bm *p,
4676                 struct bm_xfer_ctx *c,
4677                 unsigned int len)
4678 {
4679         if (dcbp_get_code(p) == RLE_VLI_Bits)
4680                 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
4681
4682         /* other variants had been implemented for evaluation,
4683          * but have been dropped as this one turned out to be "best"
4684          * during all our tests. */
4685
4686         drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4687         conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4688         return -EIO;
4689 }
4690
4691 void INFO_bm_xfer_stats(struct drbd_device *device,
4692                 const char *direction, struct bm_xfer_ctx *c)
4693 {
4694         /* what would it take to transfer it "plaintext" */
4695         unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
4696         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4697         unsigned int plain =
4698                 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4699                 c->bm_words * sizeof(unsigned long);
4700         unsigned int total = c->bytes[0] + c->bytes[1];
4701         unsigned int r;
4702
4703         /* total can not be zero. but just in case: */
4704         if (total == 0)
4705                 return;
4706
4707         /* don't report if not compressed */
4708         if (total >= plain)
4709                 return;
4710
4711         /* total < plain. check for overflow, still */
4712         r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4713                                     : (1000 * total / plain);
4714
4715         if (r > 1000)
4716                 r = 1000;
4717
4718         r = 1000 - r;
4719         drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4720              "total %u; compression: %u.%u%%\n",
4721                         direction,
4722                         c->bytes[1], c->packets[1],
4723                         c->bytes[0], c->packets[0],
4724                         total, r/10, r % 10);
4725 }
4726
4727 /* Since we are processing the bitfield from lower addresses to higher,
4728    it does not matter if the process it in 32 bit chunks or 64 bit
4729    chunks as long as it is little endian. (Understand it as byte stream,
4730    beginning with the lowest byte...) If we would use big endian
4731    we would need to process it from the highest address to the lowest,
4732    in order to be agnostic to the 32 vs 64 bits issue.
4733
4734    returns 0 on failure, 1 if we successfully received it. */
4735 static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
4736 {
4737         struct drbd_peer_device *peer_device;
4738         struct drbd_device *device;
4739         struct bm_xfer_ctx c;
4740         int err;
4741
4742         peer_device = conn_peer_device(connection, pi->vnr);
4743         if (!peer_device)
4744                 return -EIO;
4745         device = peer_device->device;
4746
4747         drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4748         /* you are supposed to send additional out-of-sync information
4749          * if you actually set bits during this phase */
4750
4751         c = (struct bm_xfer_ctx) {
4752                 .bm_bits = drbd_bm_bits(device),
4753                 .bm_words = drbd_bm_words(device),
4754         };
4755
4756         for(;;) {
4757                 if (pi->cmd == P_BITMAP)
4758                         err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
4759                 else if (pi->cmd == P_COMPRESSED_BITMAP) {
4760                         /* MAYBE: sanity check that we speak proto >= 90,
4761                          * and the feature is enabled! */
4762                         struct p_compressed_bm *p = pi->data;
4763
4764                         if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
4765                                 drbd_err(device, "ReportCBitmap packet too large\n");
4766                                 err = -EIO;
4767                                 goto out;
4768                         }
4769                         if (pi->size <= sizeof(*p)) {
4770                                 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
4771                                 err = -EIO;
4772                                 goto out;
4773                         }
4774                         err = drbd_recv_all(peer_device->connection, p, pi->size);
4775                         if (err)
4776                                goto out;
4777                         err = decode_bitmap_c(peer_device, p, &c, pi->size);
4778                 } else {
4779                         drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
4780                         err = -EIO;
4781                         goto out;
4782                 }
4783
4784                 c.packets[pi->cmd == P_BITMAP]++;
4785                 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
4786
4787                 if (err <= 0) {
4788                         if (err < 0)
4789                                 goto out;
4790                         break;
4791                 }
4792                 err = drbd_recv_header(peer_device->connection, pi);
4793                 if (err)
4794                         goto out;
4795         }
4796
4797         INFO_bm_xfer_stats(device, "receive", &c);
4798
4799         if (device->state.conn == C_WF_BITMAP_T) {
4800                 enum drbd_state_rv rv;
4801
4802                 err = drbd_send_bitmap(device);
4803                 if (err)
4804                         goto out;
4805                 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
4806                 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4807                 D_ASSERT(device, rv == SS_SUCCESS);
4808         } else if (device->state.conn != C_WF_BITMAP_S) {
4809                 /* admin may have requested C_DISCONNECTING,
4810                  * other threads may have noticed network errors */
4811                 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
4812                     drbd_conn_str(device->state.conn));
4813         }
4814         err = 0;
4815
4816  out:
4817         drbd_bm_unlock(device);
4818         if (!err && device->state.conn == C_WF_BITMAP_S)
4819                 drbd_start_resync(device, C_SYNC_SOURCE);
4820         return err;
4821 }
4822
4823 static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
4824 {
4825         drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
4826                  pi->cmd, pi->size);
4827
4828         return ignore_remaining_packet(connection, pi);
4829 }
4830
/*
 * Handler for P_UNPLUG_REMOTE. The packet content is not looked at (pi is
 * unused); the only action is on the data socket. Always returns 0.
 */
static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
{
        /* Make sure we've acked all the TCP data associated
         * with the data requests being unplugged */
        drbd_tcp_quickack(connection->data.socket);

        return 0;
}
4839
4840 static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
4841 {
4842         struct drbd_peer_device *peer_device;
4843         struct drbd_device *device;
4844         struct p_block_desc *p = pi->data;
4845
4846         peer_device = conn_peer_device(connection, pi->vnr);
4847         if (!peer_device)
4848                 return -EIO;
4849         device = peer_device->device;
4850
4851         switch (device->state.conn) {
4852         case C_WF_SYNC_UUID:
4853         case C_WF_BITMAP_T:
4854         case C_BEHIND:
4855                         break;
4856         default:
4857                 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4858                                 drbd_conn_str(device->state.conn));
4859         }
4860
4861         drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4862
4863         return 0;
4864 }
4865
4866 static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
4867 {
4868         struct drbd_peer_device *peer_device;
4869         struct p_block_desc *p = pi->data;
4870         struct drbd_device *device;
4871         sector_t sector;
4872         int size, err = 0;
4873
4874         peer_device = conn_peer_device(connection, pi->vnr);
4875         if (!peer_device)
4876                 return -EIO;
4877         device = peer_device->device;
4878
4879         sector = be64_to_cpu(p->sector);
4880         size = be32_to_cpu(p->blksize);
4881
4882         dec_rs_pending(device);
4883
4884         if (get_ldev(device)) {
4885                 struct drbd_peer_request *peer_req;
4886                 const int op = REQ_OP_DISCARD;
4887
4888                 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
4889                                                size, 0, GFP_NOIO);
4890                 if (!peer_req) {
4891                         put_ldev(device);
4892                         return -ENOMEM;
4893                 }
4894
4895                 peer_req->w.cb = e_end_resync_block;
4896                 peer_req->submit_jif = jiffies;
4897                 peer_req->flags |= EE_IS_TRIM;
4898
4899                 spin_lock_irq(&device->resource->req_lock);
4900                 list_add_tail(&peer_req->w.list, &device->sync_ee);
4901                 spin_unlock_irq(&device->resource->req_lock);
4902
4903                 atomic_add(pi->size >> 9, &device->rs_sect_ev);
4904                 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
4905
4906                 if (err) {
4907                         spin_lock_irq(&device->resource->req_lock);
4908                         list_del(&peer_req->w.list);
4909                         spin_unlock_irq(&device->resource->req_lock);
4910
4911                         drbd_free_peer_req(device, peer_req);
4912                         put_ldev(device);
4913                         err = 0;
4914                         goto fail;
4915                 }
4916
4917                 inc_unacked(device);
4918
4919                 /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
4920                    as well as drbd_rs_complete_io() */
4921         } else {
4922         fail:
4923                 drbd_rs_complete_io(device, sector);
4924                 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
4925         }
4926
4927         atomic_add(size >> 9, &device->rs_sect_in);
4928
4929         return err;
4930 }
4931
/* Describes how drbdd() treats one type of received data packet. */
struct data_cmd {
        /* non-zero: the packet may be larger than its sub header;
         * zero: drbdd() rejects a packet larger than that */
        int expect_payload;
        /* size of the sub header drbdd() receives into pi->data before
         * calling fn; a smaller packet is rejected */
        unsigned int pkt_size;
        /* handler; a non-zero return value makes drbdd() bail out */
        int (*fn)(struct drbd_connection *, struct packet_info *);
};
4937
/*
 * Dispatch table used by drbdd(), indexed by packet command (P_*).
 * Each entry is { expect_payload, pkt_size, fn }, see struct data_cmd.
 * Commands without an entry here have fn == NULL and are rejected by
 * drbdd() as unexpected.
 */
static struct data_cmd drbd_cmd_handler[] = {
        [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
        [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
        [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
        [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
        [P_BITMAP]          = { 1, 0, receive_bitmap } ,
        [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
        [P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
        [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
        [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
        [P_SYNC_PARAM]      = { 1, 0, receive_SyncParam },
        [P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
        [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
        [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
        [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
        [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
        [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
        [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
        [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
        [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
        [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
        [P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_DataRequest },
        [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
        [P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
        [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
        [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
        [P_TRIM]            = { 0, sizeof(struct p_trim), receive_Data },
        [P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
        [P_WSAME]           = { 1, sizeof(struct p_wsame), receive_Data },
};
4968
4969 static void drbdd(struct drbd_connection *connection)
4970 {
4971         struct packet_info pi;
4972         size_t shs; /* sub header size */
4973         int err;
4974
4975         while (get_t_state(&connection->receiver) == RUNNING) {
4976                 struct data_cmd const *cmd;
4977
4978                 drbd_thread_current_set_cpu(&connection->receiver);
4979                 update_receiver_timing_details(connection, drbd_recv_header);
4980                 if (drbd_recv_header(connection, &pi))
4981                         goto err_out;
4982
4983                 cmd = &drbd_cmd_handler[pi.cmd];
4984                 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
4985                         drbd_err(connection, "Unexpected data packet %s (0x%04x)",
4986                                  cmdname(pi.cmd), pi.cmd);
4987                         goto err_out;
4988                 }
4989
4990                 shs = cmd->pkt_size;
4991                 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
4992                         shs += sizeof(struct o_qlim);
4993                 if (pi.size > shs && !cmd->expect_payload) {
4994                         drbd_err(connection, "No payload expected %s l:%d\n",
4995                                  cmdname(pi.cmd), pi.size);
4996                         goto err_out;
4997                 }
4998                 if (pi.size < shs) {
4999                         drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
5000                                  cmdname(pi.cmd), (int)shs, pi.size);
5001                         goto err_out;
5002                 }
5003
5004                 if (shs) {
5005                         update_receiver_timing_details(connection, drbd_recv_all_warn);
5006                         err = drbd_recv_all_warn(connection, pi.data, shs);
5007                         if (err)
5008                                 goto err_out;
5009                         pi.size -= shs;
5010                 }
5011
5012                 update_receiver_timing_details(connection, cmd->fn);
5013                 err = cmd->fn(connection, &pi);
5014                 if (err) {
5015                         drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
5016                                  cmdname(pi.cmd), err, pi.size);
5017                         goto err_out;
5018                 }
5019         }
5020         return;
5021
5022     err_out:
5023         conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
5024 }
5025
/*
 * Clean up after the connection to the peer is gone.
 *
 * Does nothing if the connection is already C_STANDALONE. Otherwise forces
 * C_NETWORK_FAILURE, stops the ack receiver thread, destroys the ack sender
 * workqueue, frees the sockets and runs drbd_disconnected() for every peer
 * device. Finally the connection goes to C_UNCONNECTED, and on to
 * C_STANDALONE if it had been C_DISCONNECTING.
 */
static void conn_disconnect(struct drbd_connection *connection)
{
        struct drbd_peer_device *peer_device;
        enum drbd_conns oc;
        int vnr;

        if (connection->cstate == C_STANDALONE)
                return;

        /* We are about to start the cleanup after connection loss.
         * Make sure drbd_make_request knows about that.
         * Usually we should be in some network failure state already,
         * but just in case we are not, we fix it up here.
         */
        conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);

        /* ack_receiver does not clean up anything. it must not interfere, either */
        drbd_thread_stop(&connection->ack_receiver);
        if (connection->ack_sender) {
                destroy_workqueue(connection->ack_sender);
                connection->ack_sender = NULL;
        }
        drbd_free_sock(connection);

        /* The RCU read lock is dropped around drbd_disconnected(); the
         * extra kref keeps the device alive in the meantime. */
        rcu_read_lock();
        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
                struct drbd_device *device = peer_device->device;
                kref_get(&device->kref);
                rcu_read_unlock();
                drbd_disconnected(peer_device);
                kref_put(&device->kref, drbd_destroy_device);
                rcu_read_lock();
        }
        rcu_read_unlock();

        if (!list_empty(&connection->current_epoch->list))
                drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
        /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
        atomic_set(&connection->current_epoch->epoch_size, 0);
        connection->send.seen_any_write_yet = false;

        drbd_info(connection, "Connection closed\n");

        if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
                conn_try_outdate_peer_async(connection);

        /* sample the connection state under req_lock; oc is what decides
         * about the final transition below */
        spin_lock_irq(&connection->resource->req_lock);
        oc = connection->cstate;
        if (oc >= C_UNCONNECTED)
                _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);

        spin_unlock_irq(&connection->resource->req_lock);

        /* a disconnect that was in progress ends in C_STANDALONE */
        if (oc == C_DISCONNECTING)
                conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
}
5082
5083 static int drbd_disconnected(struct drbd_peer_device *peer_device)
5084 {
5085         struct drbd_device *device = peer_device->device;
5086         unsigned int i;
5087
5088         /* wait for current activity to cease. */
5089         spin_lock_irq(&device->resource->req_lock);
5090         _drbd_wait_ee_list_empty(device, &device->active_ee);
5091         _drbd_wait_ee_list_empty(device, &device->sync_ee);
5092         _drbd_wait_ee_list_empty(device, &device->read_ee);
5093         spin_unlock_irq(&device->resource->req_lock);
5094
5095         /* We do not have data structures that would allow us to
5096          * get the rs_pending_cnt down to 0 again.
5097          *  * On C_SYNC_TARGET we do not have any data structures describing
5098          *    the pending RSDataRequest's we have sent.
5099          *  * On C_SYNC_SOURCE there is no data structure that tracks
5100          *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
5101          *  And no, it is not the sum of the reference counts in the
5102          *  resync_LRU. The resync_LRU tracks the whole operation including
5103          *  the disk-IO, while the rs_pending_cnt only tracks the blocks
5104          *  on the fly. */
5105         drbd_rs_cancel_all(device);
5106         device->rs_total = 0;
5107         device->rs_failed = 0;
5108         atomic_set(&device->rs_pending_cnt, 0);
5109         wake_up(&device->misc_wait);
5110
5111         del_timer_sync(&device->resync_timer);
5112         resync_timer_fn((unsigned long)device);
5113
5114         /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
5115          * w_make_resync_request etc. which may still be on the worker queue
5116          * to be "canceled" */
5117         drbd_flush_workqueue(&peer_device->connection->sender_work);
5118
5119         drbd_finish_peer_reqs(device);
5120
5121         /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
5122            might have issued a work again. The one before drbd_finish_peer_reqs() is
5123            necessary to reclain net_ee in drbd_finish_peer_reqs(). */
5124         drbd_flush_workqueue(&peer_device->connection->sender_work);
5125
5126         /* need to do it again, drbd_finish_peer_reqs() may have populated it
5127          * again via drbd_try_clear_on_disk_bm(). */
5128         drbd_rs_cancel_all(device);
5129
5130         kfree(device->p_uuid);
5131         device->p_uuid = NULL;
5132
5133         if (!drbd_suspended(device))
5134                 tl_clear(peer_device->connection);
5135
5136         drbd_md_sync(device);
5137
5138         if (get_ldev(device)) {
5139                 drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
5140                                 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5141                 put_ldev(device);
5142         }
5143
5144         /* tcp_close and release of sendpage pages can be deferred.  I don't
5145          * want to use SO_LINGER, because apparently it can be deferred for
5146          * more than 20 seconds (longest time I checked).
5147          *
5148          * Actually we don't care for exactly when the network stack does its
5149          * put_page(), but release our reference on these pages right here.
5150          */
5151         i = drbd_free_peer_reqs(device, &device->net_ee);
5152         if (i)
5153                 drbd_info(device, "net_ee not empty, killed %u entries\n", i);
5154         i = atomic_read(&device->pp_in_use_by_net);
5155         if (i)
5156                 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
5157         i = atomic_read(&device->pp_in_use);
5158         if (i)
5159                 drbd_info(device, "pp_in_use = %d, expected 0\n", i);
5160
5161         D_ASSERT(device, list_empty(&device->read_ee));
5162         D_ASSERT(device, list_empty(&device->active_ee));
5163         D_ASSERT(device, list_empty(&device->sync_ee));
5164         D_ASSERT(device, list_empty(&device->done_ee));
5165
5166         return 0;
5167 }
5168
5169 /*
5170  * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
5171  * we can agree on is stored in agreed_pro_version.
5172  *
5173  * feature flags and the reserved array should be enough room for future
5174  * enhancements of the handshake protocol, and possible plugins...
5175  *
5176  * for now, they are expected to be zero, but ignored.
5177  */
5178 static int drbd_send_features(struct drbd_connection *connection)
5179 {
5180         struct drbd_socket *sock;
5181         struct p_connection_features *p;
5182
5183         sock = &connection->data;
5184         p = conn_prepare_command(connection, sock);
5185         if (!p)
5186                 return -EIO;
5187         memset(p, 0, sizeof(*p));
5188         p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
5189         p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
5190         p->feature_flags = cpu_to_be32(PRO_FEATURES);
5191         return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
5192 }
5193
5194 /*
5195  * return values:
5196  *   1 yes, we have a valid connection
5197  *   0 oops, did not work out, please try again
5198  *  -1 peer talks different language,
5199  *     no point in trying again, please go standalone.
5200  */
5201 static int drbd_do_features(struct drbd_connection *connection)
5202 {
5203         /* ASSERT current == connection->receiver ... */
5204         struct p_connection_features *p;
5205         const int expect = sizeof(struct p_connection_features);
5206         struct packet_info pi;
5207         int err;
5208
5209         err = drbd_send_features(connection);
5210         if (err)
5211                 return 0;
5212
5213         err = drbd_recv_header(connection, &pi);
5214         if (err)
5215                 return 0;
5216
5217         if (pi.cmd != P_CONNECTION_FEATURES) {
5218                 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
5219                          cmdname(pi.cmd), pi.cmd);
5220                 return -1;
5221         }
5222
5223         if (pi.size != expect) {
5224                 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
5225                      expect, pi.size);
5226                 return -1;
5227         }
5228
5229         p = pi.data;
5230         err = drbd_recv_all_warn(connection, p, expect);
5231         if (err)
5232                 return 0;
5233
5234         p->protocol_min = be32_to_cpu(p->protocol_min);
5235         p->protocol_max = be32_to_cpu(p->protocol_max);
5236         if (p->protocol_max == 0)
5237                 p->protocol_max = p->protocol_min;
5238
5239         if (PRO_VERSION_MAX < p->protocol_min ||
5240             PRO_VERSION_MIN > p->protocol_max)
5241                 goto incompat;
5242
5243         connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
5244         connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
5245
5246         drbd_info(connection, "Handshake successful: "
5247              "Agreed network protocol version %d\n", connection->agreed_pro_version);
5248
5249         drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
5250                   connection->agreed_features,
5251                   connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5252                   connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
5253                   connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
5254                   connection->agreed_features ? "" : " none");
5255
5256         return 1;
5257
5258  incompat:
5259         drbd_err(connection, "incompatible DRBD dialects: "
5260             "I support %d-%d, peer supports %d-%d\n",
5261             PRO_VERSION_MIN, PRO_VERSION_MAX,
5262             p->protocol_min, p->protocol_max);
5263         return -1;
5264 }
5265
5266 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
/*
 * Variant compiled when neither CONFIG_CRYPTO_HMAC nor
 * CONFIG_CRYPTO_HMAC_MODULE is defined: no authentication is attempted,
 * the problem is logged and -1 is returned.
 */
static int drbd_do_auth(struct drbd_connection *connection)
{
        drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
        drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
        return -1;
}
5273 #else
5274 #define CHALLENGE_LEN 64
5275
5276 /* Return value:
5277         1 - auth succeeded,
5278         0 - failed, try again (network error),
5279         -1 - auth failed, don't try again.
5280 */
5281
5282 static int drbd_do_auth(struct drbd_connection *connection)
5283 {
5284         struct drbd_socket *sock;
5285         char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
5286         char *response = NULL;
5287         char *right_response = NULL;
5288         char *peers_ch = NULL;
5289         unsigned int key_len;
5290         char secret[SHARED_SECRET_MAX]; /* 64 byte */
5291         unsigned int resp_size;
5292         SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm);
5293         struct packet_info pi;
5294         struct net_conf *nc;
5295         int err, rv;
5296
5297         /* FIXME: Put the challenge/response into the preallocated socket buffer.  */
5298
5299         rcu_read_lock();
5300         nc = rcu_dereference(connection->net_conf);
5301         key_len = strlen(nc->shared_secret);
5302         memcpy(secret, nc->shared_secret, key_len);
5303         rcu_read_unlock();
5304
5305         desc->tfm = connection->cram_hmac_tfm;
5306         desc->flags = 0;
5307
5308         rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
5309         if (rv) {
5310                 drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv);
5311                 rv = -1;
5312                 goto fail;
5313         }
5314
5315         get_random_bytes(my_challenge, CHALLENGE_LEN);
5316
5317         sock = &connection->data;
5318         if (!conn_prepare_command(connection, sock)) {
5319                 rv = 0;
5320                 goto fail;
5321         }
5322         rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
5323                                 my_challenge, CHALLENGE_LEN);
5324         if (!rv)
5325                 goto fail;
5326
5327         err = drbd_recv_header(connection, &pi);
5328         if (err) {
5329                 rv = 0;
5330                 goto fail;
5331         }
5332
5333         if (pi.cmd != P_AUTH_CHALLENGE) {
5334                 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
5335                          cmdname(pi.cmd), pi.cmd);
5336                 rv = 0;
5337                 goto fail;
5338         }
5339
5340         if (pi.size > CHALLENGE_LEN * 2) {
5341                 drbd_err(connection, "expected AuthChallenge payload too big.\n");
5342                 rv = -1;
5343                 goto fail;
5344         }
5345
5346         if (pi.size < CHALLENGE_LEN) {
5347                 drbd_err(connection, "AuthChallenge payload too small.\n");
5348                 rv = -1;
5349                 goto fail;
5350         }
5351
5352         peers_ch = kmalloc(pi.size, GFP_NOIO);
5353         if (peers_ch == NULL) {
5354                 drbd_err(connection, "kmalloc of peers_ch failed\n");
5355                 rv = -1;
5356                 goto fail;
5357         }
5358
5359         err = drbd_recv_all_warn(connection, peers_ch, pi.size);
5360         if (err) {
5361                 rv = 0;
5362                 goto fail;
5363         }
5364
5365         if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
5366                 drbd_err(connection, "Peer presented the same challenge!\n");
5367                 rv = -1;
5368                 goto fail;
5369         }
5370
5371         resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
5372         response = kmalloc(resp_size, GFP_NOIO);
5373         if (response == NULL) {
5374                 drbd_err(connection, "kmalloc of response failed\n");
5375                 rv = -1;
5376                 goto fail;
5377         }
5378
5379         rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
5380         if (rv) {
5381                 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
5382                 rv = -1;
5383                 goto fail;
5384         }
5385
5386         if (!conn_prepare_command(connection, sock)) {
5387                 rv = 0;
5388                 goto fail;
5389         }
5390         rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
5391                                 response, resp_size);
5392         if (!rv)
5393                 goto fail;
5394
5395         err = drbd_recv_header(connection, &pi);
5396         if (err) {
5397                 rv = 0;
5398                 goto fail;
5399         }
5400
5401         if (pi.cmd != P_AUTH_RESPONSE) {
5402                 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
5403                          cmdname(pi.cmd), pi.cmd);
5404                 rv = 0;
5405                 goto fail;
5406         }
5407
5408         if (pi.size != resp_size) {
5409                 drbd_err(connection, "expected AuthResponse payload of wrong size\n");
5410                 rv = 0;
5411                 goto fail;
5412         }
5413
5414         err = drbd_recv_all_warn(connection, response , resp_size);
5415         if (err) {
5416                 rv = 0;
5417                 goto fail;
5418         }
5419
5420         right_response = kmalloc(resp_size, GFP_NOIO);
5421         if (right_response == NULL) {
5422                 drbd_err(connection, "kmalloc of right_response failed\n");
5423                 rv = -1;
5424                 goto fail;
5425         }
5426
5427         rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
5428                                  right_response);
5429         if (rv) {
5430                 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
5431                 rv = -1;
5432                 goto fail;
5433         }
5434
5435         rv = !memcmp(response, right_response, resp_size);
5436
5437         if (rv)
5438                 drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
5439                      resp_size);
5440         else
5441                 rv = -1;
5442
5443  fail:
5444         kfree(peers_ch);
5445         kfree(response);
5446         kfree(right_response);
5447         shash_desc_zero(desc);
5448
5449         return rv;
5450 }
5451 #endif
5452
5453 int drbd_receiver(struct drbd_thread *thi)
5454 {
5455         struct drbd_connection *connection = thi->connection;
5456         int h;
5457
5458         drbd_info(connection, "receiver (re)started\n");
5459
5460         do {
5461                 h = conn_connect(connection);
5462                 if (h == 0) {
5463                         conn_disconnect(connection);
5464                         schedule_timeout_interruptible(HZ);
5465                 }
5466                 if (h == -1) {
5467                         drbd_warn(connection, "Discarding network configuration.\n");
5468                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5469                 }
5470         } while (h == 0);
5471
5472         if (h > 0)
5473                 drbdd(connection);
5474
5475         conn_disconnect(connection);
5476
5477         drbd_info(connection, "receiver terminated\n");
5478         return 0;
5479 }
5480
5481 /* ********* acknowledge sender ******** */
5482
5483 static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5484 {
5485         struct p_req_state_reply *p = pi->data;
5486         int retcode = be32_to_cpu(p->retcode);
5487
5488         if (retcode >= SS_SUCCESS) {
5489                 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
5490         } else {
5491                 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
5492                 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
5493                          drbd_set_st_err_str(retcode), retcode);
5494         }
5495         wake_up(&connection->ping_wait);
5496
5497         return 0;
5498 }
5499
5500 static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5501 {
5502         struct drbd_peer_device *peer_device;
5503         struct drbd_device *device;
5504         struct p_req_state_reply *p = pi->data;
5505         int retcode = be32_to_cpu(p->retcode);
5506
5507         peer_device = conn_peer_device(connection, pi->vnr);
5508         if (!peer_device)
5509                 return -EIO;
5510         device = peer_device->device;
5511
5512         if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
5513                 D_ASSERT(device, connection->agreed_pro_version < 100);
5514                 return got_conn_RqSReply(connection, pi);
5515         }
5516
5517         if (retcode >= SS_SUCCESS) {
5518                 set_bit(CL_ST_CHG_SUCCESS, &device->flags);
5519         } else {
5520                 set_bit(CL_ST_CHG_FAIL, &device->flags);
5521                 drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
5522                         drbd_set_st_err_str(retcode), retcode);
5523         }
5524         wake_up(&device->state_wait);
5525
5526         return 0;
5527 }
5528
/* Answer a ping from the peer; returns the result of drbd_send_ping_ack(). */
static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
{
        return drbd_send_ping_ack(connection);

}
5534
5535 static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
5536 {
5537         /* restore idle timeout */
5538         connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5539         if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5540                 wake_up(&connection->ping_wait);
5541
5542         return 0;
5543 }
5544
5545 static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
5546 {
5547         struct drbd_peer_device *peer_device;
5548         struct drbd_device *device;
5549         struct p_block_ack *p = pi->data;
5550         sector_t sector = be64_to_cpu(p->sector);
5551         int blksize = be32_to_cpu(p->blksize);
5552
5553         peer_device = conn_peer_device(connection, pi->vnr);
5554         if (!peer_device)
5555                 return -EIO;
5556         device = peer_device->device;
5557
5558         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
5559
5560         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5561
5562         if (get_ldev(device)) {
5563                 drbd_rs_complete_io(device, sector);
5564                 drbd_set_in_sync(device, sector, blksize);
5565                 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
5566                 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
5567                 put_ldev(device);
5568         }
5569         dec_rs_pending(device);
5570         atomic_add(blksize >> 9, &device->rs_sect_in);
5571
5572         return 0;
5573 }
5574
5575 static int
5576 validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
5577                               struct rb_root *root, const char *func,
5578                               enum drbd_req_event what, bool missing_ok)
5579 {
5580         struct drbd_request *req;
5581         struct bio_and_error m;
5582
5583         spin_lock_irq(&device->resource->req_lock);
5584         req = find_request(device, root, id, sector, missing_ok, func);
5585         if (unlikely(!req)) {
5586                 spin_unlock_irq(&device->resource->req_lock);
5587                 return -EIO;
5588         }
5589         __req_mod(req, what, &m);
5590         spin_unlock_irq(&device->resource->req_lock);
5591
5592         if (m.bio)
5593                 complete_master_bio(device, &m);
5594         return 0;
5595 }
5596
5597 static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
5598 {
5599         struct drbd_peer_device *peer_device;
5600         struct drbd_device *device;
5601         struct p_block_ack *p = pi->data;
5602         sector_t sector = be64_to_cpu(p->sector);
5603         int blksize = be32_to_cpu(p->blksize);
5604         enum drbd_req_event what;
5605
5606         peer_device = conn_peer_device(connection, pi->vnr);
5607         if (!peer_device)
5608                 return -EIO;
5609         device = peer_device->device;
5610
5611         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5612
5613         if (p->block_id == ID_SYNCER) {
5614                 drbd_set_in_sync(device, sector, blksize);
5615                 dec_rs_pending(device);
5616                 return 0;
5617         }
5618         switch (pi->cmd) {
5619         case P_RS_WRITE_ACK:
5620                 what = WRITE_ACKED_BY_PEER_AND_SIS;
5621                 break;
5622         case P_WRITE_ACK:
5623                 what = WRITE_ACKED_BY_PEER;
5624                 break;
5625         case P_RECV_ACK:
5626                 what = RECV_ACKED_BY_PEER;
5627                 break;
5628         case P_SUPERSEDED:
5629                 what = CONFLICT_RESOLVED;
5630                 break;
5631         case P_RETRY_WRITE:
5632                 what = POSTPONE_WRITE;
5633                 break;
5634         default:
5635                 BUG();
5636         }
5637
5638         return validate_req_change_req_state(device, p->block_id, sector,
5639                                              &device->write_requests, __func__,
5640                                              what, false);
5641 }
5642
5643 static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
5644 {
5645         struct drbd_peer_device *peer_device;
5646         struct drbd_device *device;
5647         struct p_block_ack *p = pi->data;
5648         sector_t sector = be64_to_cpu(p->sector);
5649         int size = be32_to_cpu(p->blksize);
5650         int err;
5651
5652         peer_device = conn_peer_device(connection, pi->vnr);
5653         if (!peer_device)
5654                 return -EIO;
5655         device = peer_device->device;
5656
5657         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5658
5659         if (p->block_id == ID_SYNCER) {
5660                 dec_rs_pending(device);
5661                 drbd_rs_failed_io(device, sector, size);
5662                 return 0;
5663         }
5664
5665         err = validate_req_change_req_state(device, p->block_id, sector,
5666                                             &device->write_requests, __func__,
5667                                             NEG_ACKED, true);
5668         if (err) {
5669                 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5670                    The master bio might already be completed, therefore the
5671                    request is no longer in the collision hash. */
5672                 /* In Protocol B we might already have got a P_RECV_ACK
5673                    but then get a P_NEG_ACK afterwards. */
5674                 drbd_set_out_of_sync(device, sector, size);
5675         }
5676         return 0;
5677 }
5678
5679 static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
5680 {
5681         struct drbd_peer_device *peer_device;
5682         struct drbd_device *device;
5683         struct p_block_ack *p = pi->data;
5684         sector_t sector = be64_to_cpu(p->sector);
5685
5686         peer_device = conn_peer_device(connection, pi->vnr);
5687         if (!peer_device)
5688                 return -EIO;
5689         device = peer_device->device;
5690
5691         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5692
5693         drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
5694             (unsigned long long)sector, be32_to_cpu(p->blksize));
5695
5696         return validate_req_change_req_state(device, p->block_id, sector,
5697                                              &device->read_requests, __func__,
5698                                              NEG_ACKED, false);
5699 }
5700
5701 static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
5702 {
5703         struct drbd_peer_device *peer_device;
5704         struct drbd_device *device;
5705         sector_t sector;
5706         int size;
5707         struct p_block_ack *p = pi->data;
5708
5709         peer_device = conn_peer_device(connection, pi->vnr);
5710         if (!peer_device)
5711                 return -EIO;
5712         device = peer_device->device;
5713
5714         sector = be64_to_cpu(p->sector);
5715         size = be32_to_cpu(p->blksize);
5716
5717         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5718
5719         dec_rs_pending(device);
5720
5721         if (get_ldev_if_state(device, D_FAILED)) {
5722                 drbd_rs_complete_io(device, sector);
5723                 switch (pi->cmd) {
5724                 case P_NEG_RS_DREPLY:
5725                         drbd_rs_failed_io(device, sector, size);
5726                 case P_RS_CANCEL:
5727                         break;
5728                 default:
5729                         BUG();
5730                 }
5731                 put_ldev(device);
5732         }
5733
5734         return 0;
5735 }
5736
5737 static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
5738 {
5739         struct p_barrier_ack *p = pi->data;
5740         struct drbd_peer_device *peer_device;
5741         int vnr;
5742
5743         tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
5744
5745         rcu_read_lock();
5746         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5747                 struct drbd_device *device = peer_device->device;
5748
5749                 if (device->state.conn == C_AHEAD &&
5750                     atomic_read(&device->ap_in_flight) == 0 &&
5751                     !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5752                         device->start_resync_timer.expires = jiffies + HZ;
5753                         add_timer(&device->start_resync_timer);
5754                 }
5755         }
5756         rcu_read_unlock();
5757
5758         return 0;
5759 }
5760
5761 static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
5762 {
5763         struct drbd_peer_device *peer_device;
5764         struct drbd_device *device;
5765         struct p_block_ack *p = pi->data;
5766         struct drbd_device_work *dw;
5767         sector_t sector;
5768         int size;
5769
5770         peer_device = conn_peer_device(connection, pi->vnr);
5771         if (!peer_device)
5772                 return -EIO;
5773         device = peer_device->device;
5774
5775         sector = be64_to_cpu(p->sector);
5776         size = be32_to_cpu(p->blksize);
5777
5778         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5779
5780         if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
5781                 drbd_ov_out_of_sync_found(device, sector, size);
5782         else
5783                 ov_out_of_sync_print(device);
5784
5785         if (!get_ldev(device))
5786                 return 0;
5787
5788         drbd_rs_complete_io(device, sector);
5789         dec_rs_pending(device);
5790
5791         --device->ov_left;
5792
5793         /* let's advance progress step marks only for every other megabyte */
5794         if ((device->ov_left & 0x200) == 0x200)
5795                 drbd_advance_rs_marks(device, device->ov_left);
5796
5797         if (device->ov_left == 0) {
5798                 dw = kmalloc(sizeof(*dw), GFP_NOIO);
5799                 if (dw) {
5800                         dw->w.cb = w_ov_finished;
5801                         dw->device = device;
5802                         drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
5803                 } else {
5804                         drbd_err(device, "kmalloc(dw) failed.");
5805                         ov_out_of_sync_print(device);
5806                         drbd_resync_finished(device);
5807                 }
5808         }
5809         put_ldev(device);
5810         return 0;
5811 }
5812
/* Handler for packets that are accepted and ignored; always returns 0. */
static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
{
        return 0;
}
5817
/*
 * One entry of the meta socket dispatch table.  The table initializes
 * entries positionally, so the field order must not change.
 */
struct meta_sock_cmd {
        /* Packet size for this command; presumably the expected payload length. */
        size_t pkt_size;
        /* Handler invoked for the packet. */
        int (*fn)(struct drbd_connection *connection, struct packet_info *pi);
};
5822
5823 static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
5824 {
5825         long t;
5826         struct net_conf *nc;
5827
5828         rcu_read_lock();
5829         nc = rcu_dereference(connection->net_conf);
5830         t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5831         rcu_read_unlock();
5832
5833         t *= HZ;
5834         if (ping_timeout)
5835                 t /= 10;
5836
5837         connection->meta.socket->sk->sk_rcvtimeo = t;
5838 }
5839
5840 static void set_ping_timeout(struct drbd_connection *connection)
5841 {
5842         set_rcvtimeo(connection, 1);
5843 }
5844
5845 static void set_idle_timeout(struct drbd_connection *connection)
5846 {
5847         set_rcvtimeo(connection, 0);
5848 }
5849
5850 static struct meta_sock_cmd ack_receiver_tbl[] = {
5851         [P_PING]            = { 0, got_Ping },
5852         [P_PING_ACK]        = { 0, got_PingAck },
5853         [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
5854         [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
5855         [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
5856         [P_SUPERSEDED]   = { sizeof(struct p_block_ack), got_BlockAck },
5857         [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
5858         [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
5859         [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
5860         [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
5861         [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
5862         [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5863         [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
5864         [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
5865         [P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
5866         [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
5867         [P_RETRY_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
5868 };
5869
5870 int drbd_ack_receiver(struct drbd_thread *thi)
5871 {
5872         struct drbd_connection *connection = thi->connection;
5873         struct meta_sock_cmd *cmd = NULL;
5874         struct packet_info pi;
5875         unsigned long pre_recv_jif;
5876         int rv;
5877         void *buf    = connection->meta.rbuf;
5878         int received = 0;
5879         unsigned int header_size = drbd_header_size(connection);
5880         int expect   = header_size;
5881         bool ping_timeout_active = false;
5882         struct sched_param param = { .sched_priority = 2 };
5883
5884         rv = sched_setscheduler(current, SCHED_RR, &param);
5885         if (rv < 0)
5886                 drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
5887
5888         while (get_t_state(thi) == RUNNING) {
5889                 drbd_thread_current_set_cpu(thi);
5890
5891                 conn_reclaim_net_peer_reqs(connection);
5892
5893                 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5894                         if (drbd_send_ping(connection)) {
5895                                 drbd_err(connection, "drbd_send_ping has failed\n");
5896                                 goto reconnect;
5897                         }
5898                         set_ping_timeout(connection);
5899                         ping_timeout_active = true;
5900                 }
5901
5902                 pre_recv_jif = jiffies;
5903                 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
5904
5905                 /* Note:
5906                  * -EINTR        (on meta) we got a signal
5907                  * -EAGAIN       (on meta) rcvtimeo expired
5908                  * -ECONNRESET   other side closed the connection
5909                  * -ERESTARTSYS  (on data) we got a signal
5910                  * rv <  0       other than above: unexpected error!
5911                  * rv == expected: full header or command
5912                  * rv <  expected: "woken" by signal during receive
5913                  * rv == 0       : "connection shut down by peer"
5914                  */
5915                 if (likely(rv > 0)) {
5916                         received += rv;
5917                         buf      += rv;
5918                 } else if (rv == 0) {
5919                         if (test_bit(DISCONNECT_SENT, &connection->flags)) {
5920                                 long t;
5921                                 rcu_read_lock();
5922                                 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
5923                                 rcu_read_unlock();
5924
5925                                 t = wait_event_timeout(connection->ping_wait,
5926                                                        connection->cstate < C_WF_REPORT_PARAMS,
5927                                                        t);
5928                                 if (t)
5929                                         break;
5930                         }
5931                         drbd_err(connection, "meta connection shut down by peer.\n");
5932                         goto reconnect;
5933                 } else if (rv == -EAGAIN) {
5934                         /* If the data socket received something meanwhile,
5935                          * that is good enough: peer is still alive. */
5936                         if (time_after(connection->last_received, pre_recv_jif))
5937                                 continue;
5938                         if (ping_timeout_active) {
5939                                 drbd_err(connection, "PingAck did not arrive in time.\n");
5940                                 goto reconnect;
5941                         }
5942                         set_bit(SEND_PING, &connection->flags);
5943                         continue;
5944                 } else if (rv == -EINTR) {
5945                         /* maybe drbd_thread_stop(): the while condition will notice.
5946                          * maybe woken for send_ping: we'll send a ping above,
5947                          * and change the rcvtimeo */
5948                         flush_signals(current);
5949                         continue;
5950                 } else {
5951                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
5952                         goto reconnect;
5953                 }
5954
5955                 if (received == expect && cmd == NULL) {
5956                         if (decode_header(connection, connection->meta.rbuf, &pi))
5957                                 goto reconnect;
5958                         cmd = &ack_receiver_tbl[pi.cmd];
5959                         if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
5960                                 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
5961                                          cmdname(pi.cmd), pi.cmd);
5962                                 goto disconnect;
5963                         }
5964                         expect = header_size + cmd->pkt_size;
5965                         if (pi.size != expect - header_size) {
5966                                 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
5967                                         pi.cmd, pi.size);
5968                                 goto reconnect;
5969                         }
5970                 }
5971                 if (received == expect) {
5972                         bool err;
5973
5974                         err = cmd->fn(connection, &pi);
5975                         if (err) {
5976                                 drbd_err(connection, "%pf failed\n", cmd->fn);
5977                                 goto reconnect;
5978                         }
5979
5980                         connection->last_received = jiffies;
5981
5982                         if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
5983                                 set_idle_timeout(connection);
5984                                 ping_timeout_active = false;
5985                         }
5986
5987                         buf      = connection->meta.rbuf;
5988                         received = 0;
5989                         expect   = header_size;
5990                         cmd      = NULL;
5991                 }
5992         }
5993
5994         if (0) {
5995 reconnect:
5996                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5997                 conn_md_sync(connection);
5998         }
5999         if (0) {
6000 disconnect:
6001                 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
6002         }
6003
6004         drbd_info(connection, "ack_receiver terminated\n");
6005
6006         return 0;
6007 }
6008
6009 void drbd_send_acks_wf(struct work_struct *ws)
6010 {
6011         struct drbd_peer_device *peer_device =
6012                 container_of(ws, struct drbd_peer_device, send_acks_work);
6013         struct drbd_connection *connection = peer_device->connection;
6014         struct drbd_device *device = peer_device->device;
6015         struct net_conf *nc;
6016         int tcp_cork, err;
6017
6018         rcu_read_lock();
6019         nc = rcu_dereference(connection->net_conf);
6020         tcp_cork = nc->tcp_cork;
6021         rcu_read_unlock();
6022
6023         if (tcp_cork)
6024                 drbd_tcp_cork(connection->meta.socket);
6025
6026         err = drbd_finish_peer_reqs(device);
6027         kref_put(&device->kref, drbd_destroy_device);
6028         /* get is in drbd_endio_write_sec_final(). That is necessary to keep the
6029            struct work_struct send_acks_work alive, which is in the peer_device object */
6030
6031         if (err) {
6032                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6033                 return;
6034         }
6035
6036         if (tcp_cork)
6037                 drbd_tcp_uncork(connection->meta.socket);
6038
6039         return;
6040 }