From 765afea3ff4b25a4686b04be0e98dc95b8a15a7e Mon Sep 17 00:00:00 2001
From: Lindsay Stewart <slindsay@amazon.com>
Date: Wed, 13 Sep 2023 11:58:05 -0700
Subject: [PATCH] Reduce allocs in ktls app data send (#4181)

---
 tests/unit/s2n_ktls_io_test.c | 218 +++++++++++++++++++++++++++++-----
 tls/s2n_ktls_io.c             |  99 ++++++++++-----
 2 files changed, 258 insertions(+), 59 deletions(-)

diff --git a/tests/unit/s2n_ktls_io_test.c b/tests/unit/s2n_ktls_io_test.c
index 65dc4c8eca4..f58d89c361e 100644
--- a/tests/unit/s2n_ktls_io_test.c
+++ b/tests/unit/s2n_ktls_io_test.c
@@ -15,6 +15,7 @@
 
 #include "s2n_test.h"
 #include "testlib/s2n_ktls_test_utils.h"
+#include "testlib/s2n_mem_testlib.h"
 #include "testlib/s2n_testlib.h"
 #include "tls/s2n_ktls.h"
 #include "utils/s2n_random.h"
@@ -558,6 +559,9 @@ int main(int argc, char **argv)
             EXPECT_FAILURE_WITH_ERRNO(
                     s2n_ktls_sendv_with_offset(&conn, NULL, 1, 0, &blocked),
                     S2N_ERR_NULL);
+            EXPECT_FAILURE_WITH_ERRNO(
+                    s2n_ktls_sendv_with_offset(&conn, NULL, 1, 1, &blocked),
+                    S2N_ERR_NULL);
             EXPECT_FAILURE_WITH_ERRNO(
                     s2n_ktls_sendv_with_offset(&conn, &test_iovec, 1, 0, NULL),
                     S2N_ERR_NULL);
@@ -671,38 +675,20 @@ int main(int argc, char **argv)
             EXPECT_OK(s2n_test_validate_data(&out, test_data, sizeof(test_data)));
         };
 
-        /* Test: Send with very large number of iovecs */
-        {
-            DEFER_CLEANUP(struct s2n_connection *conn = s2n_connection_new(S2N_SERVER),
-                    s2n_connection_ptr_free);
-            EXPECT_NOT_NULL(conn);
-
-            DEFER_CLEANUP(struct s2n_test_ktls_io_stuffer out = { 0 },
-                    s2n_ktls_io_stuffer_free);
-            EXPECT_OK(s2n_test_init_ktls_io_stuffer_send(conn, &out));
-
-            size_t many_iov_lens[1000] = { 0 };
-            DEFER_CLEANUP(struct s2n_test_iovecs test_iovecs = { 0 }, s2n_test_iovecs_free);
-            EXPECT_OK(s2n_test_new_iovecs(&test_iovecs, &test_data_blob,
-                    many_iov_lens, s2n_array_len(many_iov_lens)));
-
-            s2n_blocked_status blocked = S2N_NOT_BLOCKED;
-            ssize_t result = s2n_ktls_sendv_with_offset(conn,
-                    test_iovecs.iovecs, test_iovecs.iovecs_count, 0, &blocked);
-            EXPECT_EQUAL(result, sizeof(test_data));
-
-            EXPECT_EQUAL(out.sendmsg_invoked_count, 1);
-            EXPECT_EQUAL(blocked, S2N_NOT_BLOCKED);
-            EXPECT_OK(s2n_test_validate_ancillary(&out, TLS_APPLICATION_DATA, sizeof(test_data)));
-            EXPECT_OK(s2n_test_validate_data(&out, test_data, sizeof(test_data)));
-        };
-
         /* Test: Send with offset */
         {
             DEFER_CLEANUP(struct s2n_test_iovecs test_iovecs = { 0 }, s2n_test_iovecs_free);
             EXPECT_OK(s2n_test_new_iovecs(&test_iovecs, &test_data_blob,
                     test_iov_lens, s2n_array_len(test_iov_lens)));
 
+            size_t large_test_iov_lens[100] = { 0 };
+            EXPECT_MEMCPY_SUCCESS(large_test_iov_lens, test_iov_lens, sizeof(test_iov_lens));
+
+            DEFER_CLEANUP(struct s2n_test_iovecs large_test_iovecs = { 0 },
+                    s2n_test_iovecs_free);
+            EXPECT_OK(s2n_test_new_iovecs(&large_test_iovecs, &test_data_blob,
+                    large_test_iov_lens, s2n_array_len(large_test_iov_lens)));
+
             /* Test: Send with invalid / too large offset */
             {
                 const size_t bad_offset = sizeof(test_data) + 1;
@@ -716,16 +702,38 @@ int main(int argc, char **argv)
                 EXPECT_OK(s2n_test_init_ktls_io_stuffer_send(conn, &out));
 
                 s2n_blocked_status blocked = S2N_NOT_BLOCKED;
-                ssize_t result = s2n_ktls_sendv_with_offset(conn,
-                        test_iovecs.iovecs, test_iovecs.iovecs_count, bad_offset, &blocked);
+                ssize_t result = s2n_ktls_sendv_with_offset(conn, large_test_iovecs.iovecs,
+                        large_test_iovecs.iovecs_count, bad_offset, &blocked);
                 EXPECT_FAILURE_WITH_ERRNO(result, S2N_ERR_INVALID_ARGUMENT);
 
                 EXPECT_EQUAL(out.sendmsg_invoked_count, 0);
                 EXPECT_EQUAL(blocked, S2N_NOT_BLOCKED);
                 EXPECT_OK(s2n_test_records_in_ancillary(&out, 0));
-            }
+            };
+
+            /* Test: Send with offset equal to total data size */
+            {
+                const size_t offset = sizeof(test_data);
+
+                DEFER_CLEANUP(struct s2n_connection *conn = s2n_connection_new(S2N_SERVER),
+                        s2n_connection_ptr_free);
+                EXPECT_NOT_NULL(conn);
 
-            /* Test: Send with all possible valid offsets */
+                DEFER_CLEANUP(struct s2n_test_ktls_io_stuffer out = { 0 },
+                        s2n_ktls_io_stuffer_free);
+                EXPECT_OK(s2n_test_init_ktls_io_stuffer_send(conn, &out));
+
+                s2n_blocked_status blocked = S2N_NOT_BLOCKED;
+                int written = s2n_ktls_sendv_with_offset(conn, large_test_iovecs.iovecs,
+                        large_test_iovecs.iovecs_count, offset, &blocked);
+                EXPECT_EQUAL(written, 0);
+
+                EXPECT_EQUAL(out.sendmsg_invoked_count, 1);
+                EXPECT_EQUAL(blocked, S2N_NOT_BLOCKED);
+                EXPECT_OK(s2n_test_records_in_ancillary(&out, 0));
+            };
+
+            /* Test: Send with small iovecs array and all possible valid offsets */
             for (size_t offset = 0; offset < sizeof(test_data); offset++) {
                 DEFER_CLEANUP(struct s2n_connection *conn = s2n_connection_new(S2N_SERVER),
                         s2n_connection_ptr_free);
@@ -748,6 +756,30 @@ int main(int argc, char **argv)
                 EXPECT_OK(s2n_test_validate_ancillary(&out, TLS_APPLICATION_DATA, expected_sent));
                 EXPECT_OK(s2n_test_validate_data(&out, test_data + offset, expected_sent));
             }
+
+            /* Test: Send with large iovecs array and all possible valid offsets */
+            for (size_t offset = 0; offset < sizeof(test_data); offset++) {
+                DEFER_CLEANUP(struct s2n_connection *conn = s2n_connection_new(S2N_SERVER),
+                        s2n_connection_ptr_free);
+                EXPECT_NOT_NULL(conn);
+
+                DEFER_CLEANUP(struct s2n_test_ktls_io_stuffer out = { 0 },
+                        s2n_ktls_io_stuffer_free);
+                EXPECT_OK(s2n_test_init_ktls_io_stuffer_send(conn, &out));
+
+                const size_t expected_sent = sizeof(test_data) - offset;
+                EXPECT_TRUE(expected_sent > 0);
+
+                s2n_blocked_status blocked = S2N_NOT_BLOCKED;
+                ssize_t result = s2n_ktls_sendv_with_offset(conn, large_test_iovecs.iovecs,
+                        large_test_iovecs.iovecs_count, offset, &blocked);
+                EXPECT_EQUAL(result, expected_sent);
+
+                EXPECT_EQUAL(out.sendmsg_invoked_count, 1);
+                EXPECT_EQUAL(blocked, S2N_NOT_BLOCKED);
+                EXPECT_OK(s2n_test_validate_ancillary(&out, TLS_APPLICATION_DATA, expected_sent));
+                EXPECT_OK(s2n_test_validate_data(&out, test_data + offset, expected_sent));
+            }
         };
 
         /* Test: Partial write */
@@ -799,6 +831,128 @@ int main(int argc, char **argv)
             EXPECT_EQUAL(io_ctx.invoked_count, 1);
             EXPECT_EQUAL(blocked, S2N_BLOCKED_ON_WRITE);
         };
+
+        /* Test: Memory usage */
+        {
+            const size_t iov_lens[100] = { 10, 5, 0, 1, 100, 100, 10 };
+            const size_t small_iov_lens_count = 10;
+            const size_t large_iov_lens_count = s2n_array_len(iov_lens);
+
+            DEFER_CLEANUP(struct s2n_test_iovecs small_iovecs = { 0 }, s2n_test_iovecs_free);
+            EXPECT_OK(s2n_test_new_iovecs(&small_iovecs, &test_data_blob,
+                    iov_lens, small_iov_lens_count));
+
+            DEFER_CLEANUP(struct s2n_test_iovecs large_iovecs = { 0 }, s2n_test_iovecs_free);
+            EXPECT_OK(s2n_test_new_iovecs(&large_iovecs, &test_data_blob,
+                    iov_lens, large_iov_lens_count));
+
+            const size_t one_iovec_size = sizeof(struct iovec);
+            const size_t large_iovecs_size = large_iovecs.iovecs_count * one_iovec_size;
+
+            struct {
+                struct s2n_test_iovecs *iovecs;
+                size_t offset;
+                uint32_t expected_malloc;
+                uint32_t expected_malloc_count;
+            } test_cases[] = {
+                /* Small iovecs never require an allocation */
+                {
+                        .iovecs = &small_iovecs,
+                        .offset = 1,
+                        .expected_malloc_count = 0,
+                },
+                {
+                        .iovecs = &small_iovecs,
+                        .offset = iov_lens[0],
+                        .expected_malloc_count = 0,
+                },
+                {
+                        .iovecs = &small_iovecs,
+                        .offset = iov_lens[0] + 1,
+                        .expected_malloc_count = 0,
+                },
+                /* Large iovecs with offset evenly divisible by the iov_lens do
+                 * not require an alloc.
+                 * Example: { x, y, z }, offset=x -> { y, z }
+                 */
+                {
+                        .iovecs = &large_iovecs,
+                        .offset = iov_lens[0],
+                        .expected_malloc_count = 0,
+                },
+                {
+                        .iovecs = &large_iovecs,
+                        .offset = iov_lens[0] + iov_lens[1],
+                        .expected_malloc_count = 0,
+                },
+                /* Large iovecs with offset not evenly divisible by the iov_lens
+                 * modify an entry so require an alloc.
+                 * Example: { x, y, z }, offset=1 -> { x-1, y, z }
+                 */
+                {
+                        .iovecs = &large_iovecs,
+                        .offset = 1,
+                        .expected_malloc_count = 1,
+                        .expected_malloc = large_iovecs_size,
+                },
+                {
+                        .iovecs = &large_iovecs,
+                        .offset = iov_lens[0] + 1,
+                        .expected_malloc_count = 1,
+                        .expected_malloc = large_iovecs_size - one_iovec_size,
+                },
+                /* Large iovecs that become small iovecs when the offset
+                 * is applied do not require an alloc.
+                 */
+                {
+                        .iovecs = &large_iovecs,
+                        .offset = sizeof(test_data) - 1,
+                        .expected_malloc_count = 0,
+                },
+                /* No alloc if the entire large iovec is skipped */
+                {
+                        .iovecs = &large_iovecs,
+                        .offset = sizeof(test_data),
+                        .expected_malloc_count = 0,
+                },
+            };
+
+            for (size_t i = 0; i < s2n_array_len(test_cases); i++) {
+                struct iovec *iovecs = test_cases[i].iovecs->iovecs;
+                const size_t iovecs_count = test_cases[i].iovecs->iovecs_count;
+                const size_t offset = test_cases[i].offset;
+
+                const size_t expected_send = sizeof(test_data) - offset;
+
+                DEFER_CLEANUP(struct s2n_connection *conn = s2n_connection_new(S2N_SERVER),
+                        s2n_connection_ptr_free);
+                EXPECT_NOT_NULL(conn);
+
+                DEFER_CLEANUP(struct s2n_test_ktls_io_stuffer out = { 0 },
+                        s2n_ktls_io_stuffer_free);
+                EXPECT_OK(s2n_test_init_ktls_io_stuffer_send(conn, &out));
+
+                /* Preemptively allocate sendmsg memory to avoid false positives */
+                EXPECT_SUCCESS(s2n_stuffer_resize(&out.data_buffer, expected_send));
+                EXPECT_SUCCESS(s2n_stuffer_resize(&out.ancillary_buffer, 100));
+
+                DEFER_CLEANUP(struct s2n_mem_test_cb_scope scope = { 0 },
+                        s2n_mem_test_free_callbacks);
+                EXPECT_OK(s2n_mem_test_init_callbacks(&scope));
+
+                s2n_blocked_status blocked = S2N_NOT_BLOCKED;
+                ssize_t result = s2n_ktls_sendv_with_offset(conn, iovecs, iovecs_count,
+                        offset, &blocked);
+                EXPECT_EQUAL(result, sizeof(test_data) - offset);
+
+                size_t malloc_count = test_cases[i].expected_malloc_count;
+                EXPECT_OK(s2n_mem_test_assert_malloc_count(malloc_count));
+                if (malloc_count) {
+                    EXPECT_OK(s2n_mem_test_assert_malloc(test_cases[i].expected_malloc));
+                }
+                EXPECT_OK(s2n_mem_test_assert_all_freed());
+            }
+        };
     };
 
     /* Test: s2n_ktls_send_cb */
@@ -814,8 +968,8 @@ int main(int argc, char **argv)
         /* Safety */
         {
             struct s2n_test_ktls_io_stuffer ctx = { 0 };
-            EXPECT_FAILURE_WITH_ERRNO(s2n_ktls_send_cb(NULL, test_data, 1), S2N_ERR_IO);
-            EXPECT_FAILURE_WITH_ERRNO(s2n_ktls_send_cb(&ctx, NULL, 1), S2N_ERR_IO);
+            EXPECT_FAILURE_WITH_ERRNO(s2n_ktls_send_cb(NULL, test_data, 1), S2N_ERR_NULL);
+            EXPECT_FAILURE_WITH_ERRNO(s2n_ktls_send_cb(&ctx, NULL, 1), S2N_ERR_NULL);
         };
 
         /* Test: Basic write succeeds */
diff --git a/tls/s2n_ktls_io.c b/tls/s2n_ktls_io.c
index b0448ff6303..04e210932aa 100644
--- a/tls/s2n_ktls_io.c
+++ b/tls/s2n_ktls_io.c
@@ -39,6 +39,9 @@
 #define S2N_KTLS_RECORD_TYPE_SIZE    (sizeof(uint8_t))
 #define S2N_KTLS_CONTROL_BUFFER_SIZE (CMSG_SPACE(S2N_KTLS_RECORD_TYPE_SIZE))
 
+#define S2N_MAX_STACK_IOVECS     16
+#define S2N_MAX_STACK_IOVECS_MEM (S2N_MAX_STACK_IOVECS * sizeof(struct iovec))
+
 /* Used to override sendmsg and recvmsg for testing. */
 static ssize_t s2n_ktls_default_sendmsg(void *io_context, const struct msghdr *msg);
 static ssize_t s2n_ktls_default_recvmsg(void *io_context, struct msghdr *msg);
@@ -274,32 +277,73 @@ S2N_RESULT s2n_ktls_recvmsg(void *io_context, uint8_t *record_type, uint8_t *buf
     return S2N_RESULT_OK;
 }
 
-static S2N_RESULT s2n_ktls_new_iovecs_with_offset(const struct iovec *bufs,
-        size_t count, size_t offs, struct s2n_blob *mem)
+/* The iovec array `bufs` is constant and owned by the application.
+ *
+ * However, we need to apply the given offset to `bufs`. That may involve
+ * updating the iov_base and iov_len of entries in `bufs` to reflect the bytes
+ * already sent. Because `bufs` is constant, we need to instead copy `bufs` and
+ * modify the copy.
+ *
+ * Since one of the primary benefits of kTLS is that we avoid buffering application
+ * data and can pass application data as-is to the kernel, we try to limit the
+ * situations where we need to copy `bufs` and use stack memory where possible.
+ *
+ * Note: We are copying an array of iovecs here, NOT the scattered application
+ * data the iovecs reference. On Linux, the maximum data copied would be
+ * 1024 (IOV_MAX on Linux) * 16 (sizeof(struct iovec)) = ~16KB.
+ *
+ * To avoid any copies when using a large number of iovecs, applications should
+ * call s2n_sendv instead of s2n_sendv_with_offset.
+ */
+static S2N_RESULT s2n_ktls_update_bufs_with_offset(const struct iovec **bufs, size_t *count,
+        size_t offs, struct s2n_blob *mem)
 {
-    RESULT_ENSURE(bufs != NULL || count == 0, S2N_ERR_NULL);
+    RESULT_ENSURE_REF(bufs);
+    RESULT_ENSURE_REF(count);
+    RESULT_ENSURE(*bufs != NULL || *count == 0, S2N_ERR_NULL);
     RESULT_ENSURE_REF(mem);
 
-    RESULT_GUARD_POSIX(s2n_realloc(mem, sizeof(struct iovec) * count));
-    struct iovec *new_bufs = (struct iovec *) (void *) mem->data;
-    RESULT_ENSURE_REF(new_bufs);
-
-    for (size_t i = 0; i < count; i++) {
-        size_t old_len = bufs[i].iov_len;
-        if (offs < old_len) {
-            new_bufs[i].iov_base = (uint8_t *) bufs[i].iov_base + offs;
-            new_bufs[i].iov_len = old_len - offs;
-            offs = 0;
-        } else {
-            /* Zero any iovec skipped by the offset.
-             * We could change the count of the copy instead, but this is simpler. */
-            new_bufs[i].iov_base = NULL;
-            new_bufs[i].iov_len = 0;
-            offs -= old_len;
+    size_t skipped = 0;
+    while (offs > 0) {
+        /* If we need to skip more iovecs than actually exist,
+         * then the offset is too large and therefore invalid.
+         */
+        RESULT_ENSURE(skipped < *count, S2N_ERR_INVALID_ARGUMENT);
+
+        size_t iov_len = (*bufs)[skipped].iov_len;
+
+        /* This is the last iovec affected by the offset. */
+        if (offs < iov_len) {
+            break;
         }
+
+        offs -= iov_len;
+        skipped++;
     }
-    /* The offset cannot be greater than the total size of all iovecs */
-    RESULT_ENSURE(offs == 0, S2N_ERR_INVALID_ARGUMENT);
+
+    *count = (*count) - skipped;
+    if (*count == 0) {
+        return S2N_RESULT_OK;
+    }
+
+    *bufs = &(*bufs)[skipped];
+    if (offs == 0) {
+        return S2N_RESULT_OK;
+    }
+
+    size_t size = (*count) * (sizeof(struct iovec));
+    /* If possible, use the existing stack memory in `mem` for the copy.
+     * Otherwise, we need to allocate sufficient new heap memory. */
+    if (size > mem->size) {
+        RESULT_GUARD_POSIX(s2n_alloc(mem, size));
+    }
+
+    struct iovec *new_bufs = (struct iovec *) (void *) mem->data;
+    RESULT_CHECKED_MEMCPY(new_bufs, *bufs, size);
+    new_bufs[0].iov_base = (uint8_t *) new_bufs[0].iov_base + offs;
+    new_bufs[0].iov_len = new_bufs[0].iov_len - offs;
+    *bufs = new_bufs;
+
     return S2N_RESULT_OK;
 }
 
@@ -312,13 +356,11 @@ ssize_t s2n_ktls_sendv_with_offset(struct s2n_connection *conn, const struct iov
     POSIX_ENSURE(offs_in >= 0, S2N_ERR_INVALID_ARGUMENT);
     size_t offs = offs_in;
 
-    DEFER_CLEANUP(struct s2n_blob new_bufs = { 0 }, s2n_free);
+    DEFER_CLEANUP(struct s2n_blob new_bufs = { 0 }, s2n_free_or_wipe);
+    uint8_t new_bufs_mem[S2N_MAX_STACK_IOVECS_MEM] = { 0 };
+    POSIX_GUARD(s2n_blob_init(&new_bufs, new_bufs_mem, sizeof(new_bufs_mem)));
     if (offs > 0) {
-        /* We can't modify the application-owned iovecs to reflect the offset.
-         * Therefore, we must alloc and modify a copy.
-         */
-        POSIX_GUARD_RESULT(s2n_ktls_new_iovecs_with_offset(bufs, count, offs, &new_bufs));
-        bufs = (const struct iovec *) (void *) new_bufs.data;
+        POSIX_GUARD_RESULT(s2n_ktls_update_bufs_with_offset(&bufs, &count, offs, &new_bufs));
     }
 
     size_t bytes_written = 0;
@@ -329,6 +371,9 @@ ssize_t s2n_ktls_sendv_with_offset(struct s2n_connection *conn, const struct iov
 
 int s2n_ktls_send_cb(void *io_context, const uint8_t *buf, uint32_t len)
 {
+    POSIX_ENSURE_REF(io_context);
+    POSIX_ENSURE_REF(buf);
+
     /* For now, all control records are assumed to be alerts.
      * We can set the record_type on the io_context in the future.
      */