OpenCloudOS-Kernel/net/tls/tls_main.c

926 lines
22 KiB
C
Raw Normal View History

/*
* Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
* Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/module.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/highmem.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/inetdevice.h>
#include <linux/inet_diag.h>
#include <net/tls.h>
MODULE_AUTHOR("Mellanox Technologies");
MODULE_DESCRIPTION("Transport Layer Security Support");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_TCP_ULP("tls");
enum {
TLSV4,
TLSV6,
TLS_NUM_PROTS,
};
static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
static struct proto *saved_tcpv4_prot;
static DEFINE_MUTEX(tcpv4_prot_mutex);
static LIST_HEAD(device_list);
static DEFINE_SPINLOCK(device_spinlock);
static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
static struct proto_ops tls_sw_proto_ops;
static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
struct proto *base);
static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf];
}
int wait_on_pending_writer(struct sock *sk, long *timeo)
{
int rc = 0;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
if (!*timeo) {
rc = -EAGAIN;
break;
}
if (signal_pending(current)) {
rc = sock_intr_errno(*timeo);
break;
}
if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait))
break;
}
remove_wait_queue(sk_sleep(sk), &wait);
return rc;
}
int tls_push_sg(struct sock *sk,
struct tls_context *ctx,
struct scatterlist *sg,
u16 first_offset,
int flags)
{
int sendpage_flags = flags | MSG_SENDPAGE_NOTLAST;
int ret = 0;
struct page *p;
size_t size;
int offset = first_offset;
size = sg->length - offset;
offset += sg->offset;
ctx->in_tcp_sendpages = true;
while (1) {
if (sg_is_last(sg))
sendpage_flags = flags;
/* is sending application-limited? */
tcp_rate_check_app_limited(sk);
p = sg_page(sg);
retry:
ret = do_tcp_sendpages(sk, p, offset, size, sendpage_flags);
if (ret != size) {
if (ret > 0) {
offset += ret;
size -= ret;
goto retry;
}
offset -= sg->offset;
ctx->partially_sent_offset = offset;
ctx->partially_sent_record = (void *)sg;
ctx->in_tcp_sendpages = false;
return ret;
}
put_page(p);
sk_mem_uncharge(sk, sg->length);
sg = sg_next(sg);
if (!sg)
break;
offset = sg->offset;
size = sg->length;
}
ctx->in_tcp_sendpages = false;
return 0;
}
static int tls_handle_open_record(struct sock *sk, int flags)
{
struct tls_context *ctx = tls_get_ctx(sk);
if (tls_is_pending_open_record(ctx))
return ctx->push_pending_record(sk, flags);
return 0;
}
int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
unsigned char *record_type)
{
struct cmsghdr *cmsg;
int rc = -EINVAL;
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
if (cmsg->cmsg_level != SOL_TLS)
continue;
switch (cmsg->cmsg_type) {
case TLS_SET_RECORD_TYPE:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(*record_type)))
return -EINVAL;
if (msg->msg_flags & MSG_MORE)
return -EINVAL;
rc = tls_handle_open_record(sk, msg->msg_flags);
if (rc)
return rc;
*record_type = *(unsigned char *)CMSG_DATA(cmsg);
rc = 0;
break;
default:
return -EINVAL;
}
}
return rc;
}
net/tls: Add support for async encryption of records for performance In current implementation, tls records are encrypted & transmitted serially. Till the time the previously submitted user data is encrypted, the implementation waits and on finish starts transmitting the record. This approach of encrypt-one record at a time is inefficient when asynchronous crypto accelerators are used. For each record, there are overheads of interrupts, driver softIRQ scheduling etc. Also the crypto accelerator sits idle most of time while an encrypted record's pages are handed over to tcp stack for transmission. This patch enables encryption of multiple records in parallel when an async capable crypto accelerator is present in system. This is achieved by allowing the user space application to send more data using sendmsg() even while previously issued data is being processed by crypto accelerator. This requires returning the control back to user space application after submitting encryption request to accelerator. This also means that zero-copy mode of encryption cannot be used with async accelerator as we must be done with user space application buffer before returning from sendmsg(). There can be multiple records in flight to/from the accelerator. Each of the record is represented by 'struct tls_rec'. This is used to store the memory pages for the record. After the records are encrypted, they are added in a linked list called tx_ready_list which contains encrypted tls records sorted as per tls sequence number. The records from tx_ready_list are transmitted using a newly introduced function called tls_tx_records(). The tx_ready_list is polled for any record ready to be transmitted in sendmsg(), sendpage() after initiating encryption of new tls records. This achieves parallel encryption and transmission of records when async accelerator is present. There could be situation when crypto accelerator completes encryption later than polling of tx_ready_list by sendmsg()/sendpage(). Therefore we need a deferred work context to be able to transmit records from tx_ready_list. The deferred work context gets scheduled if applications are not sending much data through the socket. If the applications issue sendmsg()/sendpage() in quick succession, then the scheduling of tx_work_handler gets cancelled as the tx_ready_list would be polled from application's context itself. This saves scheduling overhead of deferred work. The patch also brings some side benefit. We are able to get rid of the concept of CLOSED record. This is because the records once closed are either encrypted and then placed into tx_ready_list or if encryption fails, the socket error is set. This simplifies the kernel tls sendpath. However since tls_device.c is still using macros, accessory functions for CLOSED records have been retained. Signed-off-by: Vakul Garg <vakul.garg@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-21 12:16:13 +08:00
int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
int flags)
{
struct scatterlist *sg;
u16 offset;
sg = ctx->partially_sent_record;
offset = ctx->partially_sent_offset;
ctx->partially_sent_record = NULL;
return tls_push_sg(sk, ctx, sg, offset, flags);
}
bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx)
{
struct scatterlist *sg;
sg = ctx->partially_sent_record;
if (!sg)
return false;
while (1) {
put_page(sg_page(sg));
sk_mem_uncharge(sk, sg->length);
if (sg_is_last(sg))
break;
sg++;
}
ctx->partially_sent_record = NULL;
return true;
}
static void tls_write_space(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
/* If in_tcp_sendpages call lower protocol write space handler
* to ensure we wake up any waiting operations there. For example
* if do_tcp_sendpages where to call sk_wait_event.
*/
if (ctx->in_tcp_sendpages) {
ctx->sk_write_space(sk);
return;
}
#ifdef CONFIG_TLS_DEVICE
if (ctx->tx_conf == TLS_HW)
tls_device_write_space(sk, ctx);
else
#endif
tls_sw_write_space(sk, ctx);
ctx->sk_write_space(sk);
}
/**
* tls_ctx_free() - free TLS ULP context
* @sk: socket to with @ctx is attached
* @ctx: TLS context structure
*
* Free TLS context. If @sk is %NULL caller guarantees that the socket
* to which @ctx was attached has no outstanding references.
*/
void tls_ctx_free(struct sock *sk, struct tls_context *ctx)
{
if (!ctx)
return;
memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send));
memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv));
net/tls: add a TX lock TLS TX needs to release and re-acquire the socket lock if send buffer fills up. TLS SW TX path currently depends on only allowing one thread to enter the function by the abuse of sk_write_pending. If another writer is already waiting for memory no new ones are allowed in. This has two problems: - writers don't wake other threads up when they leave the kernel; meaning that this scheme works for single extra thread (second application thread or delayed work) because memory becoming available will send a wake up request, but as Mallesham and Pooja report with larger number of threads it leads to threads being put to sleep indefinitely; - the delayed work does not get _scheduled_ but it may _run_ when other writers are present leading to crashes as writers don't expect state to change under their feet (same records get pushed and freed multiple times); it's hard to reliably bail from the work, however, because the mere presence of a writer does not guarantee that the writer will push pending records before exiting. Ensuring wakeups always happen will make the code basically open code a mutex. Just use a mutex. The TLS HW TX path does not have any locking (not even the sk_write_pending hack), yet it uses a per-socket sg_tx_data array to push records. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Reported-by: Mallesham Jatharakonda <mallesh537@gmail.com> Reported-by: Pooja Trivedi <poojatrivedi@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Simon Horman <simon.horman@netronome.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-06 06:24:35 +08:00
mutex_destroy(&ctx->tx_lock);
if (sk)
kfree_rcu(ctx, rcu);
else
kfree(ctx);
}
static void tls_sk_proto_cleanup(struct sock *sk,
struct tls_context *ctx, long timeo)
{
net/tls: fix page double free on TX cleanup With commit 94850257cf0f ("tls: Fix tls_device handling of partial records") a new path was introduced to cleanup partial records during sk_proto_close. This path does not handle the SW KTLS tx_list cleanup. This is unnecessary though since the free_resources calls for both SW and offload paths will cleanup a partial record. The visible effect is the following warning, but this bug also causes a page double free. WARNING: CPU: 7 PID: 4000 at net/core/stream.c:206 sk_stream_kill_queues+0x103/0x110 RIP: 0010:sk_stream_kill_queues+0x103/0x110 RSP: 0018:ffffb6df87e07bd0 EFLAGS: 00010206 RAX: 0000000000000000 RBX: ffff8c21db4971c0 RCX: 0000000000000007 RDX: ffffffffffffffa0 RSI: 000000000000001d RDI: ffff8c21db497270 RBP: ffff8c21db497270 R08: ffff8c29f4748600 R09: 000000010020001a R10: ffffb6df87e07aa0 R11: ffffffff9a445600 R12: 0000000000000007 R13: 0000000000000000 R14: ffff8c21f03f2900 R15: ffff8c21f03b8df0 Call Trace: inet_csk_destroy_sock+0x55/0x100 tcp_close+0x25d/0x400 ? tcp_check_oom+0x120/0x120 tls_sk_proto_close+0x127/0x1c0 inet_release+0x3c/0x60 __sock_release+0x3d/0xb0 sock_close+0x11/0x20 __fput+0xd8/0x210 task_work_run+0x84/0xa0 do_exit+0x2dc/0xb90 ? release_sock+0x43/0x90 do_group_exit+0x3a/0xa0 get_signal+0x295/0x720 do_signal+0x36/0x610 ? SYSC_recvfrom+0x11d/0x130 exit_to_usermode_loop+0x69/0xb0 do_syscall_64+0x173/0x180 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x7fe9b9abc10d RSP: 002b:00007fe9b19a1d48 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 0000000000000006 RCX: 00007fe9b9abc10d RDX: 0000000000000002 RSI: 0000000000000080 RDI: 00007fe948003430 RBP: 00007fe948003410 R08: 00007fe948003430 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00005603739d9080 R13: 00007fe9b9ab9f90 R14: 00007fe948003430 R15: 0000000000000000 Fixes: 94850257cf0f ("tls: Fix tls_device handling of partial records") Signed-off-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-24 12:26:58 +08:00
if (unlikely(sk->sk_write_pending) &&
!wait_on_pending_writer(sk, &timeo))
tls_handle_open_record(sk, 0);
/* We need these for tls_sw_fallback handling of other packets */
if (ctx->tx_conf == TLS_SW) {
kfree(ctx->tx.rec_seq);
kfree(ctx->tx.iv);
tls_sw_release_resources_tx(sk);
} else if (ctx->tx_conf == TLS_HW) {
tls_device_free_resources_tx(sk);
}
if (ctx->rx_conf == TLS_SW)
tls_sw_release_resources_rx(sk);
else if (ctx->rx_conf == TLS_HW)
tls_device_offload_cleanup_rx(sk);
}
static void tls_sk_proto_close(struct sock *sk, long timeout)
{
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx = tls_get_ctx(sk);
long timeo = sock_sndtimeo(sk, 0);
bool free_ctx;
if (ctx->tx_conf == TLS_SW)
tls_sw_cancel_work_tx(ctx);
lock_sock(sk);
free_ctx = ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW;
if (ctx->tx_conf != TLS_BASE || ctx->rx_conf != TLS_BASE)
tls_sk_proto_cleanup(sk, ctx, timeo);
net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com> Signed-off-by: Boris Pismenny <borisp@mellanox.com> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-30 15:16:16 +08:00
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
write_lock_bh(&sk->sk_callback_lock);
if (free_ctx)
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
net/tls: fix transition through disconnect with close It is possible (via shutdown()) for TCP socks to go through TCP_CLOSE state via tcp_disconnect() without actually calling tcp_close which would then call the tls close callback. Because of this a user could disconnect a socket then put it in a LISTEN state which would break our assumptions about sockets always being ESTABLISHED state. More directly because close() can call unhash() and unhash is implemented by sockmap if a sockmap socket has TLS enabled we can incorrectly destroy the psock from unhash() and then call its close handler again. But because the psock (sockmap socket representation) is already destroyed we call close handler in sk->prot. However, in some cases (TLS BASE/BASE case) this will still point at the sockmap close handler resulting in a circular call and crash reported by syzbot. To fix both above issues implement the unhash() routine for TLS. v4: - add note about tls offload still needing the fix; - move sk_proto to the cold cache line; - split TX context free into "release" and "free", otherwise the GC work itself is in already freed memory; - more TX before RX for consistency; - reuse tls_ctx_free(); - schedule the GC work after we're done with context to avoid UAF; - don't set the unhash in all modes, all modes "inherit" TLS_BASE's callbacks anyway; - disable the unhash hook for TLS_HW. Fixes: 3c4d7559159bf ("tls: kernel TLS support") Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:18 +08:00
sk->sk_prot = ctx->sk_proto;
if (sk->sk_write_space == tls_write_space)
sk->sk_write_space = ctx->sk_write_space;
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
write_unlock_bh(&sk->sk_callback_lock);
release_sock(sk);
if (ctx->tx_conf == TLS_SW)
tls_sw_free_ctx_tx(ctx);
if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW)
tls_sw_strparser_done(ctx);
if (ctx->rx_conf == TLS_SW)
tls_sw_free_ctx_rx(ctx);
ctx->sk_proto->close(sk, timeout);
tls: fix use after free in tls_sk_proto_close syzbot reported a use-after-free in tls_sk_proto_close Add a boolean value to cleanup a bit this function. BUG: KASAN: use-after-free in tls_sk_proto_close+0x8ab/0x9c0 net/tls/tls_main.c:297 Read of size 1 at addr ffff8801ae40a858 by task syz-executor363/4503 CPU: 0 PID: 4503 Comm: syz-executor363 Not tainted 4.17.0-rc3+ #34 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430 tls_sk_proto_close+0x8ab/0x9c0 net/tls/tls_main.c:297 inet_release+0x104/0x1f0 net/ipv4/af_inet.c:427 inet6_release+0x50/0x70 net/ipv6/af_inet6.c:460 sock_release+0x96/0x1b0 net/socket.c:594 sock_close+0x16/0x20 net/socket.c:1149 __fput+0x34d/0x890 fs/file_table.c:209 ____fput+0x15/0x20 fs/file_table.c:243 task_work_run+0x1e4/0x290 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x1aee/0x2730 kernel/exit.c:865 do_group_exit+0x16f/0x430 kernel/exit.c:968 get_signal+0x886/0x1960 kernel/signal.c:2469 do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4457b9 RSP: 002b:00007fdf4d766da8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 00000000006dac3c RCX: 00000000004457b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000006dac3c RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dac38 R13: 3692738801137283 R14: 6bf92c39443c4c1d R15: 0000000000000006 Allocated by task 4498: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620 kmalloc include/linux/slab.h:512 [inline] kzalloc include/linux/slab.h:701 [inline] create_ctx net/tls/tls_main.c:521 [inline] tls_init+0x1f9/0xb00 net/tls/tls_main.c:633 tcp_set_ulp+0x1bc/0x520 net/ipv4/tcp_ulp.c:153 do_tcp_setsockopt.isra.39+0x44a/0x2600 net/ipv4/tcp.c:2588 tcp_setsockopt+0xc1/0xe0 net/ipv4/tcp.c:2893 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3039 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903 __do_sys_setsockopt net/socket.c:1914 [inline] __se_sys_setsockopt net/socket.c:1911 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 4503: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xd9/0x260 mm/slab.c:3813 tls_sw_free_resources+0x2a3/0x360 net/tls/tls_sw.c:1037 tls_sk_proto_close+0x67c/0x9c0 net/tls/tls_main.c:288 inet_release+0x104/0x1f0 net/ipv4/af_inet.c:427 inet6_release+0x50/0x70 net/ipv6/af_inet6.c:460 sock_release+0x96/0x1b0 net/socket.c:594 sock_close+0x16/0x20 net/socket.c:1149 __fput+0x34d/0x890 fs/file_table.c:209 ____fput+0x15/0x20 fs/file_table.c:243 task_work_run+0x1e4/0x290 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x1aee/0x2730 kernel/exit.c:865 do_group_exit+0x16f/0x430 kernel/exit.c:968 get_signal+0x886/0x1960 kernel/signal.c:2469 do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8801ae40a800 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 88 bytes inside of 256-byte region [ffff8801ae40a800, ffff8801ae40a900) The buggy address belongs to the page: page:ffffea0006b90280 count:1 mapcount:0 mapping:ffff8801ae40a080 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffff8801ae40a080 0000000000000000 000000010000000c raw: ffffea0006bea9e0 ffffea0006bc94a0 ffff8801da8007c0 0000000000000000 page dumped because: kasan: bad access detected Fixes: dd0bed1665d6 ("tls: support for Inline tls record") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Atul Gupta <atul.gupta@chelsio.com> Cc: Steve Wise <swise@opengridcomputing.com> Cc: Ilya Lesokhin <ilyal@mellanox.com> Cc: Aviad Yehezkel <aviadye@mellanox.com> Cc: Dave Watson <davejwatson@fb.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-05 23:35:04 +08:00
if (free_ctx)
tls_ctx_free(sk, ctx);
}
static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
int __user *optlen)
{
int rc = 0;
struct tls_context *ctx = tls_get_ctx(sk);
struct tls_crypto_info *crypto_info;
int len;
if (get_user(len, optlen))
return -EFAULT;
if (!optval || (len < sizeof(*crypto_info))) {
rc = -EINVAL;
goto out;
}
if (!ctx) {
rc = -EBUSY;
goto out;
}
/* get user crypto info */
crypto_info = &ctx->crypto_send.info;
if (!TLS_CRYPTO_INFO_READY(crypto_info)) {
rc = -EBUSY;
goto out;
}
if (len == sizeof(*crypto_info)) {
if (copy_to_user(optval, crypto_info, sizeof(*crypto_info)))
rc = -EFAULT;
goto out;
}
switch (crypto_info->cipher_type) {
case TLS_CIPHER_AES_GCM_128: {
struct tls12_crypto_info_aes_gcm_128 *
crypto_info_aes_gcm_128 =
container_of(crypto_info,
struct tls12_crypto_info_aes_gcm_128,
info);
if (len != sizeof(*crypto_info_aes_gcm_128)) {
rc = -EINVAL;
goto out;
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_128->iv,
ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
TLS_CIPHER_AES_GCM_128_IV_SIZE);
memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq,
TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
crypto_info_aes_gcm_128,
sizeof(*crypto_info_aes_gcm_128)))
rc = -EFAULT;
break;
}
case TLS_CIPHER_AES_GCM_256: {
struct tls12_crypto_info_aes_gcm_256 *
crypto_info_aes_gcm_256 =
container_of(crypto_info,
struct tls12_crypto_info_aes_gcm_256,
info);
if (len != sizeof(*crypto_info_aes_gcm_256)) {
rc = -EINVAL;
goto out;
}
lock_sock(sk);
memcpy(crypto_info_aes_gcm_256->iv,
ctx->tx.iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE,
TLS_CIPHER_AES_GCM_256_IV_SIZE);
memcpy(crypto_info_aes_gcm_256->rec_seq, ctx->tx.rec_seq,
TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE);
release_sock(sk);
if (copy_to_user(optval,
crypto_info_aes_gcm_256,
sizeof(*crypto_info_aes_gcm_256)))
rc = -EFAULT;
break;
}
default:
rc = -EINVAL;
}
out:
return rc;
}
static int do_tls_getsockopt(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
int rc = 0;
switch (optname) {
case TLS_TX:
rc = do_tls_getsockopt_tx(sk, optval, optlen);
break;
default:
rc = -ENOPROTOOPT;
break;
}
return rc;
}
static int tls_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
return ctx->sk_proto->getsockopt(sk, level,
optname, optval, optlen);
return do_tls_getsockopt(sk, optname, optval, optlen);
}
static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
unsigned int optlen, int tx)
{
struct tls_crypto_info *crypto_info;
struct tls_crypto_info *alt_crypto_info;
struct tls_context *ctx = tls_get_ctx(sk);
size_t optsize;
int rc = 0;
int conf;
if (!optval || (optlen < sizeof(*crypto_info))) {
rc = -EINVAL;
goto out;
}
if (tx) {
crypto_info = &ctx->crypto_send.info;
alt_crypto_info = &ctx->crypto_recv.info;
} else {
crypto_info = &ctx->crypto_recv.info;
alt_crypto_info = &ctx->crypto_send.info;
}
/* Currently we don't support set crypto info more than one time */
if (TLS_CRYPTO_INFO_READY(crypto_info)) {
rc = -EBUSY;
goto out;
}
rc = copy_from_user(crypto_info, optval, sizeof(*crypto_info));
if (rc) {
rc = -EFAULT;
goto err_crypto_info;
}
/* check version */
if (crypto_info->version != TLS_1_2_VERSION &&
crypto_info->version != TLS_1_3_VERSION) {
rc = -ENOTSUPP;
goto err_crypto_info;
}
/* Ensure that TLS version and ciphers are same in both directions */
if (TLS_CRYPTO_INFO_READY(alt_crypto_info)) {
if (alt_crypto_info->version != crypto_info->version ||
alt_crypto_info->cipher_type != crypto_info->cipher_type) {
rc = -EINVAL;
goto err_crypto_info;
}
}
switch (crypto_info->cipher_type) {
case TLS_CIPHER_AES_GCM_128:
optsize = sizeof(struct tls12_crypto_info_aes_gcm_128);
break;
case TLS_CIPHER_AES_GCM_256: {
optsize = sizeof(struct tls12_crypto_info_aes_gcm_256);
break;
}
case TLS_CIPHER_AES_CCM_128:
optsize = sizeof(struct tls12_crypto_info_aes_ccm_128);
break;
default:
rc = -EINVAL;
goto err_crypto_info;
}
if (optlen != optsize) {
rc = -EINVAL;
goto err_crypto_info;
}
rc = copy_from_user(crypto_info + 1, optval + sizeof(*crypto_info),
optlen - sizeof(*crypto_info));
if (rc) {
rc = -EFAULT;
goto err_crypto_info;
}
if (tx) {
net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com> Signed-off-by: Boris Pismenny <borisp@mellanox.com> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-30 15:16:16 +08:00
rc = tls_set_device_offload(sk, ctx);
conf = TLS_HW;
if (rc) {
rc = tls_set_sw_offload(sk, ctx, 1);
if (rc)
goto err_crypto_info;
net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com> Signed-off-by: Boris Pismenny <borisp@mellanox.com> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-30 15:16:16 +08:00
conf = TLS_SW;
}
} else {
rc = tls_set_device_offload_rx(sk, ctx);
conf = TLS_HW;
if (rc) {
rc = tls_set_sw_offload(sk, ctx, 0);
if (rc)
goto err_crypto_info;
conf = TLS_SW;
}
tls_sw_strparser_arm(sk, ctx);
}
if (tx)
ctx->tx_conf = conf;
else
ctx->rx_conf = conf;
update_sk_prot(sk, ctx);
if (tx) {
ctx->sk_write_space = sk->sk_write_space;
sk->sk_write_space = tls_write_space;
} else {
sk->sk_socket->ops = &tls_sw_proto_ops;
}
goto out;
err_crypto_info:
memzero_explicit(crypto_info, sizeof(union tls_crypto_context));
out:
return rc;
}
static int do_tls_setsockopt(struct sock *sk, int optname,
char __user *optval, unsigned int optlen)
{
int rc = 0;
switch (optname) {
case TLS_TX:
case TLS_RX:
lock_sock(sk);
rc = do_tls_setsockopt_conf(sk, optval, optlen,
optname == TLS_TX);
release_sock(sk);
break;
default:
rc = -ENOPROTOOPT;
break;
}
return rc;
}
static int tls_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
return ctx->sk_proto->setsockopt(sk, level, optname, optval,
optlen);
return do_tls_setsockopt(sk, optname, optval, optlen);
}
static struct tls_context *create_ctx(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx;
ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
if (!ctx)
return NULL;
net/tls: add a TX lock TLS TX needs to release and re-acquire the socket lock if send buffer fills up. TLS SW TX path currently depends on only allowing one thread to enter the function by the abuse of sk_write_pending. If another writer is already waiting for memory no new ones are allowed in. This has two problems: - writers don't wake other threads up when they leave the kernel; meaning that this scheme works for single extra thread (second application thread or delayed work) because memory becoming available will send a wake up request, but as Mallesham and Pooja report with larger number of threads it leads to threads being put to sleep indefinitely; - the delayed work does not get _scheduled_ but it may _run_ when other writers are present leading to crashes as writers don't expect state to change under their feet (same records get pushed and freed multiple times); it's hard to reliably bail from the work, however, because the mere presence of a writer does not guarantee that the writer will push pending records before exiting. Ensuring wakeups always happen will make the code basically open code a mutex. Just use a mutex. The TLS HW TX path does not have any locking (not even the sk_write_pending hack), yet it uses a per-socket sg_tx_data array to push records. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Reported-by: Mallesham Jatharakonda <mallesh537@gmail.com> Reported-by: Pooja Trivedi <poojatrivedi@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Simon Horman <simon.horman@netronome.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-06 06:24:35 +08:00
mutex_init(&ctx->tx_lock);
rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
ctx->sk_proto = sk->sk_prot;
return ctx;
}
static void tls_build_proto(struct sock *sk)
{
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
/* Build IPv6 TLS whenever the address of tcpv6 _prot changes */
if (ip_ver == TLSV6 &&
unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
mutex_lock(&tcpv6_prot_mutex);
if (likely(sk->sk_prot != saved_tcpv6_prot)) {
build_protos(tls_prots[TLSV6], sk->sk_prot);
smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
}
mutex_unlock(&tcpv6_prot_mutex);
}
if (ip_ver == TLSV4 &&
unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) {
mutex_lock(&tcpv4_prot_mutex);
if (likely(sk->sk_prot != saved_tcpv4_prot)) {
build_protos(tls_prots[TLSV4], sk->sk_prot);
smp_store_release(&saved_tcpv4_prot, sk->sk_prot);
}
mutex_unlock(&tcpv4_prot_mutex);
}
}
static void tls_hw_sk_destruct(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
ctx->sk_destruct(sk);
/* Free ctx */
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
tls_ctx_free(sk, ctx);
}
static int tls_hw_prot(struct sock *sk)
{
struct tls_context *ctx;
struct tls_device *dev;
int rc = 0;
spin_lock_bh(&device_spinlock);
list_for_each_entry(dev, &device_list, dev_list) {
if (dev->feature && dev->feature(dev)) {
ctx = create_ctx(sk);
if (!ctx)
goto out;
spin_unlock_bh(&device_spinlock);
tls_build_proto(sk);
ctx->sk_destruct = sk->sk_destruct;
sk->sk_destruct = tls_hw_sk_destruct;
ctx->rx_conf = TLS_HW_RECORD;
ctx->tx_conf = TLS_HW_RECORD;
update_sk_prot(sk, ctx);
spin_lock_bh(&device_spinlock);
rc = 1;
break;
}
}
out:
spin_unlock_bh(&device_spinlock);
return rc;
}
static void tls_hw_unhash(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
struct tls_device *dev;
spin_lock_bh(&device_spinlock);
list_for_each_entry(dev, &device_list, dev_list) {
if (dev->unhash) {
kref_get(&dev->kref);
spin_unlock_bh(&device_spinlock);
dev->unhash(dev, sk);
kref_put(&dev->kref, dev->release);
spin_lock_bh(&device_spinlock);
}
}
spin_unlock_bh(&device_spinlock);
ctx->sk_proto->unhash(sk);
}
static int tls_hw_hash(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
struct tls_device *dev;
int err;
err = ctx->sk_proto->hash(sk);
spin_lock_bh(&device_spinlock);
list_for_each_entry(dev, &device_list, dev_list) {
if (dev->hash) {
kref_get(&dev->kref);
spin_unlock_bh(&device_spinlock);
err |= dev->hash(dev, sk);
kref_put(&dev->kref, dev->release);
spin_lock_bh(&device_spinlock);
}
}
spin_unlock_bh(&device_spinlock);
if (err)
tls_hw_unhash(sk);
return err;
}
static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
struct proto *base)
{
prot[TLS_BASE][TLS_BASE] = *base;
prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt;
prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt;
prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close;
prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg;
prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com> Signed-off-by: Boris Pismenny <borisp@mellanox.com> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-30 15:16:16 +08:00
#ifdef CONFIG_TLS_DEVICE
prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg;
prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage;
prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW];
prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW];
prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW];
net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com> Signed-off-by: Boris Pismenny <borisp@mellanox.com> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-30 15:16:16 +08:00
#endif
prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash;
prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash;
}
static int tls_init(struct sock *sk)
{
struct tls_context *ctx;
int rc = 0;
if (tls_hw_prot(sk))
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
return 0;
/* The TLS ulp is currently supported only for TCP sockets
* in ESTABLISHED state.
* Supporting sockets in LISTEN state will require us
* to modify the accept implementation to clone rather then
* share the ulp context.
*/
if (sk->sk_state != TCP_ESTABLISHED)
return -ENOTSUPP;
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
tls_build_proto(sk);
/* allocate tls context */
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
write_lock_bh(&sk->sk_callback_lock);
ctx = create_ctx(sk);
if (!ctx) {
rc = -ENOMEM;
goto out;
}
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
update_sk_prot(sk, ctx);
out:
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
write_unlock_bh(&sk->sk_callback_lock);
return rc;
}
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
static void tls_update(struct sock *sk, struct proto *p)
{
struct tls_context *ctx;
ctx = tls_get_ctx(sk);
if (likely(ctx))
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
ctx->sk_proto = p;
else
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
sk->sk_prot = p;
}
static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
{
u16 version, cipher_type;
struct tls_context *ctx;
struct nlattr *start;
int err;
start = nla_nest_start_noflag(skb, INET_ULP_INFO_TLS);
if (!start)
return -EMSGSIZE;
rcu_read_lock();
ctx = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
if (!ctx) {
err = 0;
goto nla_failure;
}
version = ctx->prot_info.version;
if (version) {
err = nla_put_u16(skb, TLS_INFO_VERSION, version);
if (err)
goto nla_failure;
}
cipher_type = ctx->prot_info.cipher_type;
if (cipher_type) {
err = nla_put_u16(skb, TLS_INFO_CIPHER, cipher_type);
if (err)
goto nla_failure;
}
err = nla_put_u16(skb, TLS_INFO_TXCONF, tls_user_config(ctx, true));
if (err)
goto nla_failure;
err = nla_put_u16(skb, TLS_INFO_RXCONF, tls_user_config(ctx, false));
if (err)
goto nla_failure;
rcu_read_unlock();
nla_nest_end(skb, start);
return 0;
nla_failure:
rcu_read_unlock();
nla_nest_cancel(skb, start);
return err;
}
static size_t tls_get_info_size(const struct sock *sk)
{
size_t size = 0;
size += nla_total_size(0) + /* INET_ULP_INFO_TLS */
nla_total_size(sizeof(u16)) + /* TLS_INFO_VERSION */
nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */
nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */
nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
0;
return size;
}
void tls_register_device(struct tls_device *device)
{
spin_lock_bh(&device_spinlock);
list_add_tail(&device->dev_list, &device_list);
spin_unlock_bh(&device_spinlock);
}
EXPORT_SYMBOL(tls_register_device);
void tls_unregister_device(struct tls_device *device)
{
spin_lock_bh(&device_spinlock);
list_del(&device->dev_list);
spin_unlock_bh(&device_spinlock);
}
EXPORT_SYMBOL(tls_unregister_device);
static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
.name = "tls",
.owner = THIS_MODULE,
.init = tls_init,
bpf: sockmap/tls, close can race with map free When a map free is called and in parallel a socket is closed we have two paths that can potentially reset the socket prot ops, the bpf close() path and the map free path. This creates a problem with which prot ops should be used from the socket closed side. If the map_free side completes first then we want to call the original lowest level ops. However, if the tls path runs first we want to call the sockmap ops. Additionally there was no locking around prot updates in TLS code paths so the prot ops could be changed multiple times once from TLS path and again from sockmap side potentially leaving ops pointed at either TLS or sockmap when psock and/or tls context have already been destroyed. To fix this race first only update ops inside callback lock so that TLS, sockmap and lowest level all agree on prot state. Second and a ULP callback update() so that lower layers can inform the upper layer when they are being removed allowing the upper layer to reset prot ops. This gets us close to allowing sockmap and tls to be stacked in arbitrary order but will save that patch for *next trees. v4: - make sure we don't free things for device; - remove the checks which swap the callbacks back only if TLS is at the top. Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress") Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-20 01:29:22 +08:00
.update = tls_update,
.get_info = tls_get_info,
.get_info_size = tls_get_info_size,
};
static int __init tls_register(void)
{
tls_sw_proto_ops = inet_stream_ops;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com> Signed-off-by: Boris Pismenny <borisp@mellanox.com> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-30 15:16:16 +08:00
tls_device_init();
tcp_register_ulp(&tcp_tls_ulp_ops);
return 0;
}
static void __exit tls_unregister(void)
{
tcp_unregister_ulp(&tcp_tls_ulp_ops);
net/tls: Add generic NIC offload infrastructure This patch adds a generic infrastructure to offload TLS crypto to a network device. It enables the kernel TLS socket to skip encryption and authentication operations on the transmit side of the data path. Leaving those computationally expensive operations to the NIC. The NIC offload infrastructure builds TLS records and pushes them to the TCP layer just like the SW KTLS implementation and using the same API. TCP segmentation is mostly unaffected. Currently the only exception is that we prevent mixed SKBs where only part of the payload requires offload. In the future we are likely to add a similar restriction following a change cipher spec record. The notable differences between SW KTLS and NIC offloaded TLS implementations are as follows: 1. The offloaded implementation builds "plaintext TLS record", those records contain plaintext instead of ciphertext and place holder bytes instead of authentication tags. 2. The offloaded implementation maintains a mapping from TCP sequence number to TLS records. Thus given a TCP SKB sent from a NIC offloaded TLS socket, we can use the tls NIC offload infrastructure to obtain enough context to encrypt the payload of the SKB. A TLS record is released when the last byte of the record is ack'ed, this is done through the new icsk_clean_acked callback. The infrastructure should be extendable to support various NIC offload implementations. However it is currently written with the implementation below in mind: The NIC assumes that packets from each offloaded stream are sent as plaintext and in-order. It keeps track of the TLS records in the TCP stream. When a packet marked for offload is transmitted, the NIC encrypts the payload in-place and puts authentication tags in the relevant place holders. The responsibility for handling out-of-order packets (i.e. TCP retransmission, qdisc drops) falls on the netdev driver. The netdev driver keeps track of the expected TCP SN from the NIC's perspective. If the next packet to transmit matches the expected TCP SN, the driver advances the expected TCP SN, and transmits the packet with TLS offload indication. If the next packet to transmit does not match the expected TCP SN. The driver calls the TLS layer to obtain the TLS record that includes the TCP of the packet for transmission. Using this TLS record, the driver posts a work entry on the transmit queue to reconstruct the NIC TLS state required for the offload of the out-of-order packet. It updates the expected TCP SN accordingly and transmits the now in-order packet. The same queue is used for packet transmission and TLS context reconstruction to avoid the need for flushing the transmit queue before issuing the context reconstruction request. Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com> Signed-off-by: Boris Pismenny <borisp@mellanox.com> Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-30 15:16:16 +08:00
tls_device_cleanup();
}
module_init(tls_register);
module_exit(tls_unregister);