]>
git.ipfire.org Git - thirdparty/linux.git/blob - drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
40 ksocknal_lib_get_conn_addrs(ksock_conn_t
*conn
)
42 int rc
= lnet_sock_getaddr(conn
->ksnc_sock
, 1, &conn
->ksnc_ipaddr
,
45 /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
46 LASSERT(!conn
->ksnc_closing
);
49 CERROR("Error %d getting sock peer IP\n", rc
);
53 rc
= lnet_sock_getaddr(conn
->ksnc_sock
, 0, &conn
->ksnc_myipaddr
, NULL
);
55 CERROR("Error %d getting sock local IP\n", rc
);
63 ksocknal_lib_zc_capable(ksock_conn_t
*conn
)
65 int caps
= conn
->ksnc_sock
->sk
->sk_route_caps
;
67 if (conn
->ksnc_proto
== &ksocknal_protocol_v1x
)
71 * ZC if the socket supports scatter/gather and doesn't need software
74 return ((caps
& NETIF_F_SG
) && (caps
& NETIF_F_CSUM_MASK
));
78 ksocknal_lib_send_iov(ksock_conn_t
*conn
, ksock_tx_t
*tx
)
80 struct socket
*sock
= conn
->ksnc_sock
;
84 if (*ksocknal_tunables
.ksnd_enable_csum
&& /* checksum enabled */
85 conn
->ksnc_proto
== &ksocknal_protocol_v2x
&& /* V2.x connection */
86 tx
->tx_nob
== tx
->tx_resid
&& /* frist sending */
87 !tx
->tx_msg
.ksm_csum
) /* not checksummed */
88 ksocknal_lib_csum_tx(tx
);
91 * NB we can't trust socket ops to either consume our iovs
92 * or leave them alone.
95 #if SOCKNAL_SINGLE_FRAG_TX
97 struct kvec
*scratchiov
= &scratch
;
98 unsigned int niov
= 1;
100 struct kvec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
101 unsigned int niov
= tx
->tx_niov
;
103 struct msghdr msg
= {.msg_flags
= MSG_DONTWAIT
};
106 for (nob
= i
= 0; i
< niov
; i
++) {
107 scratchiov
[i
] = tx
->tx_iov
[i
];
108 nob
+= scratchiov
[i
].iov_len
;
111 if (!list_empty(&conn
->ksnc_tx_queue
) ||
113 msg
.msg_flags
|= MSG_MORE
;
115 rc
= kernel_sendmsg(sock
, &msg
, scratchiov
, niov
, nob
);
121 ksocknal_lib_send_kiov(ksock_conn_t
*conn
, ksock_tx_t
*tx
)
123 struct socket
*sock
= conn
->ksnc_sock
;
124 lnet_kiov_t
*kiov
= tx
->tx_kiov
;
128 /* Not NOOP message */
129 LASSERT(tx
->tx_lnetmsg
);
132 * NB we can't trust socket ops to either consume our iovs
133 * or leave them alone.
135 if (tx
->tx_msg
.ksm_zc_cookies
[0]) {
136 /* Zero copy is enabled */
137 struct sock
*sk
= sock
->sk
;
138 struct page
*page
= kiov
->kiov_page
;
139 int offset
= kiov
->kiov_offset
;
140 int fragsize
= kiov
->kiov_len
;
141 int msgflg
= MSG_DONTWAIT
;
143 CDEBUG(D_NET
, "page %p + offset %x for %d\n",
144 page
, offset
, kiov
->kiov_len
);
146 if (!list_empty(&conn
->ksnc_tx_queue
) ||
147 fragsize
< tx
->tx_resid
)
150 if (sk
->sk_prot
->sendpage
) {
151 rc
= sk
->sk_prot
->sendpage(sk
, page
,
152 offset
, fragsize
, msgflg
);
154 rc
= tcp_sendpage(sk
, page
, offset
, fragsize
, msgflg
);
157 #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
159 struct kvec
*scratchiov
= &scratch
;
160 unsigned int niov
= 1;
162 #ifdef CONFIG_HIGHMEM
163 #warning "XXX risk of kmap deadlock on multiple frags..."
165 struct kvec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
166 unsigned int niov
= tx
->tx_nkiov
;
168 struct msghdr msg
= {.msg_flags
= MSG_DONTWAIT
};
171 for (nob
= i
= 0; i
< niov
; i
++) {
172 scratchiov
[i
].iov_base
= kmap(kiov
[i
].kiov_page
) +
174 nob
+= scratchiov
[i
].iov_len
= kiov
[i
].kiov_len
;
177 if (!list_empty(&conn
->ksnc_tx_queue
) ||
179 msg
.msg_flags
|= MSG_MORE
;
181 rc
= kernel_sendmsg(sock
, &msg
, (struct kvec
*)scratchiov
, niov
, nob
);
183 for (i
= 0; i
< niov
; i
++)
184 kunmap(kiov
[i
].kiov_page
);
190 ksocknal_lib_eager_ack(ksock_conn_t
*conn
)
193 struct socket
*sock
= conn
->ksnc_sock
;
196 * Remind the socket to ACK eagerly. If I don't, the socket might
197 * think I'm about to send something it could piggy-back the ACK
198 * on, introducing delay in completing zero-copy sends in my
201 kernel_setsockopt(sock
, SOL_TCP
, TCP_QUICKACK
, (char *)&opt
,
206 ksocknal_lib_recv_iov(ksock_conn_t
*conn
)
208 #if SOCKNAL_SINGLE_FRAG_RX
210 struct kvec
*scratchiov
= &scratch
;
211 unsigned int niov
= 1;
213 struct kvec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
214 unsigned int niov
= conn
->ksnc_rx_niov
;
216 struct kvec
*iov
= conn
->ksnc_rx_iov
;
217 struct msghdr msg
= {
228 * NB we can't trust socket ops to either consume our iovs
229 * or leave them alone.
233 for (nob
= i
= 0; i
< niov
; i
++) {
234 scratchiov
[i
] = iov
[i
];
235 nob
+= scratchiov
[i
].iov_len
;
237 LASSERT(nob
<= conn
->ksnc_rx_nob_wanted
);
239 rc
= kernel_recvmsg(conn
->ksnc_sock
, &msg
, scratchiov
, niov
, nob
,
243 if (conn
->ksnc_proto
== &ksocknal_protocol_v2x
) {
244 saved_csum
= conn
->ksnc_msg
.ksm_csum
;
245 conn
->ksnc_msg
.ksm_csum
= 0;
249 /* accumulate checksum */
250 for (i
= 0, sum
= rc
; sum
> 0; i
++, sum
-= fragnob
) {
253 fragnob
= iov
[i
].iov_len
;
257 conn
->ksnc_rx_csum
= ksocknal_csum(conn
->ksnc_rx_csum
,
258 iov
[i
].iov_base
, fragnob
);
260 conn
->ksnc_msg
.ksm_csum
= saved_csum
;
/*
 * Undo a ksocknal_lib_kiov_vmap(): no-op when the mapping was never
 * established (@addr == NULL).
 * NOTE(review): body reconstructed from a fragmentary capture — verify
 * against upstream staging/lustre socklnd_lib.c.
 */
static void
ksocknal_lib_kiov_vunmap(void *addr)
{
	if (!addr)
		return;

	vunmap(addr);
}
276 ksocknal_lib_kiov_vmap(lnet_kiov_t
*kiov
, int niov
,
277 struct kvec
*iov
, struct page
**pages
)
283 if (!*ksocknal_tunables
.ksnd_zc_recv
|| !pages
)
286 LASSERT(niov
<= LNET_MAX_IOV
);
289 niov
< *ksocknal_tunables
.ksnd_zc_recv_min_nfrags
)
292 for (nob
= i
= 0; i
< niov
; i
++) {
293 if ((kiov
[i
].kiov_offset
&& i
> 0) ||
294 (kiov
[i
].kiov_offset
+ kiov
[i
].kiov_len
!= PAGE_SIZE
&& i
< niov
- 1))
297 pages
[i
] = kiov
[i
].kiov_page
;
298 nob
+= kiov
[i
].kiov_len
;
301 addr
= vmap(pages
, niov
, VM_MAP
, PAGE_KERNEL
);
305 iov
->iov_base
= addr
+ kiov
[0].kiov_offset
;
312 ksocknal_lib_recv_kiov(ksock_conn_t
*conn
)
314 #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
316 struct kvec
*scratchiov
= &scratch
;
317 struct page
**pages
= NULL
;
318 unsigned int niov
= 1;
320 #ifdef CONFIG_HIGHMEM
321 #warning "XXX risk of kmap deadlock on multiple frags..."
323 struct kvec
*scratchiov
= conn
->ksnc_scheduler
->kss_scratch_iov
;
324 struct page
**pages
= conn
->ksnc_scheduler
->kss_rx_scratch_pgs
;
325 unsigned int niov
= conn
->ksnc_rx_nkiov
;
327 lnet_kiov_t
*kiov
= conn
->ksnc_rx_kiov
;
328 struct msghdr msg
= {
341 * NB we can't trust socket ops to either consume our iovs
342 * or leave them alone.
344 addr
= ksocknal_lib_kiov_vmap(kiov
, niov
, scratchiov
, pages
);
346 nob
= scratchiov
[0].iov_len
;
350 for (nob
= i
= 0; i
< niov
; i
++) {
351 nob
+= scratchiov
[i
].iov_len
= kiov
[i
].kiov_len
;
352 scratchiov
[i
].iov_base
= kmap(kiov
[i
].kiov_page
) +
358 LASSERT(nob
<= conn
->ksnc_rx_nob_wanted
);
360 rc
= kernel_recvmsg(conn
->ksnc_sock
, &msg
, (struct kvec
*)scratchiov
,
361 n
, nob
, MSG_DONTWAIT
);
363 if (conn
->ksnc_msg
.ksm_csum
) {
364 for (i
= 0, sum
= rc
; sum
> 0; i
++, sum
-= fragnob
) {
368 * Dang! have to kmap again because I have nowhere to
369 * stash the mapped address. But by doing it while the
370 * page is still mapped, the kernel just bumps the map
371 * count and returns me the address it stashed.
373 base
= kmap(kiov
[i
].kiov_page
) + kiov
[i
].kiov_offset
;
374 fragnob
= kiov
[i
].kiov_len
;
378 conn
->ksnc_rx_csum
= ksocknal_csum(conn
->ksnc_rx_csum
,
381 kunmap(kiov
[i
].kiov_page
);
386 ksocknal_lib_kiov_vunmap(addr
);
388 for (i
= 0; i
< niov
; i
++)
389 kunmap(kiov
[i
].kiov_page
);
396 ksocknal_lib_csum_tx(ksock_tx_t
*tx
)
402 LASSERT(tx
->tx_iov
[0].iov_base
== &tx
->tx_msg
);
403 LASSERT(tx
->tx_conn
);
404 LASSERT(tx
->tx_conn
->ksnc_proto
== &ksocknal_protocol_v2x
);
406 tx
->tx_msg
.ksm_csum
= 0;
408 csum
= ksocknal_csum(~0, tx
->tx_iov
[0].iov_base
,
409 tx
->tx_iov
[0].iov_len
);
412 for (i
= 0; i
< tx
->tx_nkiov
; i
++) {
413 base
= kmap(tx
->tx_kiov
[i
].kiov_page
) +
414 tx
->tx_kiov
[i
].kiov_offset
;
416 csum
= ksocknal_csum(csum
, base
, tx
->tx_kiov
[i
].kiov_len
);
418 kunmap(tx
->tx_kiov
[i
].kiov_page
);
421 for (i
= 1; i
< tx
->tx_niov
; i
++)
422 csum
= ksocknal_csum(csum
, tx
->tx_iov
[i
].iov_base
,
423 tx
->tx_iov
[i
].iov_len
);
426 if (*ksocknal_tunables
.ksnd_inject_csum_error
) {
428 *ksocknal_tunables
.ksnd_inject_csum_error
= 0;
431 tx
->tx_msg
.ksm_csum
= csum
;
435 ksocknal_lib_get_conn_tunables(ksock_conn_t
*conn
, int *txmem
, int *rxmem
, int *nagle
)
437 struct socket
*sock
= conn
->ksnc_sock
;
441 rc
= ksocknal_connsock_addref(conn
);
443 LASSERT(conn
->ksnc_closing
);
444 *txmem
= *rxmem
= *nagle
= 0;
448 rc
= lnet_sock_getbuf(sock
, txmem
, rxmem
);
450 len
= sizeof(*nagle
);
451 rc
= kernel_getsockopt(sock
, SOL_TCP
, TCP_NODELAY
,
452 (char *)nagle
, &len
);
455 ksocknal_connsock_decref(conn
);
460 *txmem
= *rxmem
= *nagle
= 0;
466 ksocknal_lib_setup_sock(struct socket
*sock
)
474 struct linger linger
;
476 sock
->sk
->sk_allocation
= GFP_NOFS
;
479 * Ensure this socket aborts active sends immediately when we close
485 rc
= kernel_setsockopt(sock
, SOL_SOCKET
, SO_LINGER
, (char *)&linger
,
488 CERROR("Can't set SO_LINGER: %d\n", rc
);
493 rc
= kernel_setsockopt(sock
, SOL_TCP
, TCP_LINGER2
, (char *)&option
,
496 CERROR("Can't set SO_LINGER2: %d\n", rc
);
500 if (!*ksocknal_tunables
.ksnd_nagle
) {
503 rc
= kernel_setsockopt(sock
, SOL_TCP
, TCP_NODELAY
,
504 (char *)&option
, sizeof(option
));
506 CERROR("Can't disable nagle: %d\n", rc
);
511 rc
= lnet_sock_setbuf(sock
, *ksocknal_tunables
.ksnd_tx_buffer_size
,
512 *ksocknal_tunables
.ksnd_rx_buffer_size
);
514 CERROR("Can't set buffer tx %d, rx %d buffers: %d\n",
515 *ksocknal_tunables
.ksnd_tx_buffer_size
,
516 *ksocknal_tunables
.ksnd_rx_buffer_size
, rc
);
520 /* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
522 /* snapshot tunables */
523 keep_idle
= *ksocknal_tunables
.ksnd_keepalive_idle
;
524 keep_count
= *ksocknal_tunables
.ksnd_keepalive_count
;
525 keep_intvl
= *ksocknal_tunables
.ksnd_keepalive_intvl
;
527 do_keepalive
= (keep_idle
> 0 && keep_count
> 0 && keep_intvl
> 0);
529 option
= (do_keepalive
? 1 : 0);
530 rc
= kernel_setsockopt(sock
, SOL_SOCKET
, SO_KEEPALIVE
, (char *)&option
,
533 CERROR("Can't set SO_KEEPALIVE: %d\n", rc
);
540 rc
= kernel_setsockopt(sock
, SOL_TCP
, TCP_KEEPIDLE
, (char *)&keep_idle
,
543 CERROR("Can't set TCP_KEEPIDLE: %d\n", rc
);
547 rc
= kernel_setsockopt(sock
, SOL_TCP
, TCP_KEEPINTVL
,
548 (char *)&keep_intvl
, sizeof(keep_intvl
));
550 CERROR("Can't set TCP_KEEPINTVL: %d\n", rc
);
554 rc
= kernel_setsockopt(sock
, SOL_TCP
, TCP_KEEPCNT
, (char *)&keep_count
,
557 CERROR("Can't set TCP_KEEPCNT: %d\n", rc
);
565 ksocknal_lib_push_conn(ksock_conn_t
*conn
)
573 rc
= ksocknal_connsock_addref(conn
);
574 if (rc
) /* being shut down */
577 sk
= conn
->ksnc_sock
->sk
;
581 nonagle
= tp
->nonagle
;
585 rc
= kernel_setsockopt(conn
->ksnc_sock
, SOL_TCP
, TCP_NODELAY
,
586 (char *)&val
, sizeof(val
));
590 tp
->nonagle
= nonagle
;
593 ksocknal_connsock_decref(conn
);
597 * socket call back in Linux
600 ksocknal_data_ready(struct sock
*sk
)
604 /* interleave correctly with closing sockets... */
606 read_lock(&ksocknal_data
.ksnd_global_lock
);
608 conn
= sk
->sk_user_data
;
609 if (!conn
) { /* raced with ksocknal_terminate_conn */
610 LASSERT(sk
->sk_data_ready
!= &ksocknal_data_ready
);
611 sk
->sk_data_ready(sk
);
613 ksocknal_read_callback(conn
);
616 read_unlock(&ksocknal_data
.ksnd_global_lock
);
620 ksocknal_write_space(struct sock
*sk
)
626 /* interleave correctly with closing sockets... */
628 read_lock(&ksocknal_data
.ksnd_global_lock
);
630 conn
= sk
->sk_user_data
;
631 wspace
= sk_stream_wspace(sk
);
632 min_wpace
= sk_stream_min_wspace(sk
);
634 CDEBUG(D_NET
, "sk %p wspace %d low water %d conn %p%s%s%s\n",
635 sk
, wspace
, min_wpace
, conn
,
636 !conn
? "" : (conn
->ksnc_tx_ready
?
637 " ready" : " blocked"),
638 !conn
? "" : (conn
->ksnc_tx_scheduled
?
639 " scheduled" : " idle"),
640 !conn
? "" : (list_empty(&conn
->ksnc_tx_queue
) ?
641 " empty" : " queued"));
643 if (!conn
) { /* raced with ksocknal_terminate_conn */
644 LASSERT(sk
->sk_write_space
!= &ksocknal_write_space
);
645 sk
->sk_write_space(sk
);
647 read_unlock(&ksocknal_data
.ksnd_global_lock
);
651 if (wspace
>= min_wpace
) { /* got enough space */
652 ksocknal_write_callback(conn
);
655 * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
656 * ENOMEM check in ksocknal_transmit is race-free (think about
659 clear_bit(SOCK_NOSPACE
, &sk
->sk_socket
->flags
);
662 read_unlock(&ksocknal_data
.ksnd_global_lock
);
666 ksocknal_lib_save_callback(struct socket
*sock
, ksock_conn_t
*conn
)
668 conn
->ksnc_saved_data_ready
= sock
->sk
->sk_data_ready
;
669 conn
->ksnc_saved_write_space
= sock
->sk
->sk_write_space
;
673 ksocknal_lib_set_callback(struct socket
*sock
, ksock_conn_t
*conn
)
675 sock
->sk
->sk_user_data
= conn
;
676 sock
->sk
->sk_data_ready
= ksocknal_data_ready
;
677 sock
->sk
->sk_write_space
= ksocknal_write_space
;
682 ksocknal_lib_reset_callback(struct socket
*sock
, ksock_conn_t
*conn
)
685 * Remove conn's network callbacks.
686 * NB I _have_ to restore the callback, rather than storing a noop,
687 * since the socket could survive past this module being unloaded!!
689 sock
->sk
->sk_data_ready
= conn
->ksnc_saved_data_ready
;
690 sock
->sk
->sk_write_space
= conn
->ksnc_saved_write_space
;
693 * A callback could be in progress already; they hold a read lock
694 * on ksnd_global_lock (to serialise with me) and NOOP if
695 * sk_user_data is NULL.
697 sock
->sk
->sk_user_data
= NULL
;
703 ksocknal_lib_memory_pressure(ksock_conn_t
*conn
)
706 ksock_sched_t
*sched
;
708 sched
= conn
->ksnc_scheduler
;
709 spin_lock_bh(&sched
->kss_lock
);
711 if (!test_bit(SOCK_NOSPACE
, &conn
->ksnc_sock
->flags
) &&
712 !conn
->ksnc_tx_ready
) {
714 * SOCK_NOSPACE is set when the socket fills
715 * and cleared in the write_space callback
716 * (which also sets ksnc_tx_ready). If
717 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
718 * zero, I didn't fill the socket and
719 * write_space won't reschedule me, so I
720 * return -ENOMEM to get my caller to retry
726 spin_unlock_bh(&sched
->kss_lock
);