From 229087f6f1dc2d0c38feba805770f28529980ec0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 4 Apr 2024 15:03:44 -0700 Subject: [PATCH 01/45] bpf, kconfig: Fix DEBUG_INFO_BTF_MODULES Kconfig definition Turns out that due to CONFIG_DEBUG_INFO_BTF_MODULES not having an explicitly specified "menu item name" in Kconfig, it's basically impossible to turn it off (see [0]). This patch fixes the issue by defining menu name for CONFIG_DEBUG_INFO_BTF_MODULES, which makes it actually adjustable and independent of CONFIG_DEBUG_INFO_BTF, in the sense that one can have DEBUG_INFO_BTF=y and DEBUG_INFO_BTF_MODULES=n. We still keep it as defaulting to Y, of course. Fixes: 5f9ae91f7c0d ("kbuild: Build kernel module BTFs if BTF is enabled and pahole supports it") Reported-by: Vincent Li Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/CAK3+h2xiFfzQ9UXf56nrRRP=p1+iUxGoEP5B+aq9MDT5jLXDSg@mail.gmail.com [0] Link: https://lore.kernel.org/bpf/20240404220344.3879270-1-andrii@kernel.org --- lib/Kconfig.debug | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c63a5fbf1f1c2b..291185f54ee4c2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -375,7 +375,7 @@ config DEBUG_INFO_SPLIT Incompatible with older versions of ccache. config DEBUG_INFO_BTF - bool "Generate BTF typeinfo" + bool "Generate BTF type information" depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL @@ -408,7 +408,8 @@ config PAHOLE_HAS_LANG_EXCLUDE using DEBUG_INFO_BTF_MODULES. config DEBUG_INFO_BTF_MODULES - def_bool y + bool "Generate BTF type information for kernel modules" + default y depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF help Generate compact split BTF type information for kernel modules. From cfddb048040b598fa7df0e51ca361289fc7abf28 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 5 Apr 2024 13:23:37 +0000 Subject: [PATCH 02/45] MAINTAINERS: Update email address for Puranjay Mohan I would like to use the kernel.org address for kernel development from now on. Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240405132337.71950-1-puranjay@kernel.org --- .mailmap | 1 + MAINTAINERS | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index 59c9a841bf7149..b6930c2f68f47c 100644 --- a/.mailmap +++ b/.mailmap @@ -496,6 +496,7 @@ Praveen BP Pradeep Kumar Chitrapu Prasad Sodagudi Punit Agrawal +Puranjay Mohan Qais Yousef Qais Yousef Quentin Monnet diff --git a/MAINTAINERS b/MAINTAINERS index 75381386fe4c38..0c08b44bde17b9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -553,7 +553,7 @@ F: Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml F: drivers/input/misc/adxl34x.c ADXL355 THREE-AXIS DIGITAL ACCELEROMETER DRIVER -M: Puranjay Mohan +M: Puranjay Mohan L: linux-iio@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/iio/accel/adi,adxl355.yaml @@ -3714,7 +3714,7 @@ F: drivers/iio/imu/bmi323/ BPF JIT for ARM M: Russell King -M: Puranjay Mohan +M: Puranjay Mohan L: bpf@vger.kernel.org S: Maintained F: arch/arm/net/ @@ -21934,7 +21934,7 @@ F: include/linux/soc/ti/ti_sci_inta_msi.h F: include/linux/soc/ti/ti_sci_protocol.h TEXAS INSTRUMENTS' TMP117 TEMPERATURE SENSOR DRIVER -M: Puranjay Mohan +M: Puranjay Mohan L: linux-iio@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/iio/temperature/ti,tmp117.yaml From 76cd338994778c552c51086fc056819b5cdda2e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 5 Apr 2024 14:33:51 +0200 Subject: [PATCH 03/45] MAINTAINERS: bpf: Add Lehui and Puranjay as riscv64 reviewers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lehui and Puranjay have been active RISC-V 64-bit BPF JIT contributors/reviewers for a long time! Let's make it more official by adding them as reviewers in MAINTAINERS. Thank you for your hard work! Signed-off-by: Björn Töpel Link: https://lore.kernel.org/r/20240405123352.2852393-1-bjorn@kernel.org Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0c08b44bde17b9..65e8351f978e2a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3764,6 +3764,8 @@ X: arch/riscv/net/bpf_jit_comp64.c BPF JIT for RISC-V (64-bit) M: Björn Töpel +R: Pu Lehui +R: Puranjay Mohan L: bpf@vger.kernel.org S: Maintained F: arch/riscv/net/ From 6648e613226e18897231ab5e42ffc29e63fa3365 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 4 Apr 2024 10:10:01 +0800 Subject: [PATCH 04/45] bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue Fix NULL pointer data-races in sk_psock_skb_ingress_enqueue() which syzbot reported [1]. [1] BUG: KCSAN: data-race in sk_psock_drop / sk_psock_skb_ingress_enqueue write to 0xffff88814b3278b8 of 8 bytes by task 10724 on cpu 1: sk_psock_stop_verdict net/core/skmsg.c:1257 [inline] sk_psock_drop+0x13e/0x1f0 net/core/skmsg.c:843 sk_psock_put include/linux/skmsg.h:459 [inline] sock_map_close+0x1a7/0x260 net/core/sock_map.c:1648 unix_release+0x4b/0x80 net/unix/af_unix.c:1048 __sock_release net/socket.c:659 [inline] sock_close+0x68/0x150 net/socket.c:1421 __fput+0x2c1/0x660 fs/file_table.c:422 __fput_sync+0x44/0x60 fs/file_table.c:507 __do_sys_close fs/open.c:1556 [inline] __se_sys_close+0x101/0x1b0 fs/open.c:1541 __x64_sys_close+0x1f/0x30 fs/open.c:1541 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 read to 0xffff88814b3278b8 of 8 bytes by task 10713 on cpu 0: sk_psock_data_ready include/linux/skmsg.h:464 [inline] sk_psock_skb_ingress_enqueue+0x32d/0x390 net/core/skmsg.c:555 sk_psock_skb_ingress_self+0x185/0x1e0 net/core/skmsg.c:606 sk_psock_verdict_apply net/core/skmsg.c:1008 [inline] sk_psock_verdict_recv+0x3e4/0x4a0 net/core/skmsg.c:1202 unix_read_skb net/unix/af_unix.c:2546 [inline] unix_stream_read_skb+0x9e/0xf0 net/unix/af_unix.c:2682 sk_psock_verdict_data_ready+0x77/0x220 net/core/skmsg.c:1223 unix_stream_sendmsg+0x527/0x860 net/unix/af_unix.c:2339 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x140/0x180 net/socket.c:745 ____sys_sendmsg+0x312/0x410 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x1e9/0x280 net/socket.c:2667 __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x46/0x50 net/socket.c:2674 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 value changed: 0xffffffff83d7feb0 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 10713 Comm: syz-executor.4 Tainted: G W 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Prior to this, commit 4cd12c6065df ("bpf, sockmap: Fix NULL pointer dereference in sk_psock_verdict_data_ready()") fixed one NULL pointer similarly due to no protection of saved_data_ready. Here is another different caller causing the same issue because of the same reason. So we should protect it with sk_callback_lock read lock because the writer side in the sk_psock_drop() uses "write_lock_bh(&sk->sk_callback_lock);". To avoid errors that could happen in future, I move those two pairs of lock into the sk_psock_data_ready(), which is suggested by John Fastabend. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: syzbot+aa8c8ec2538929f18f2d@syzkaller.appspotmail.com Signed-off-by: Jason Xing Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Closes: https://syzkaller.appspot.com/bug?extid=aa8c8ec2538929f18f2d Link: https://lore.kernel.org/all/20240329134037.92124-1-kerneljasonxing@gmail.com Link: https://lore.kernel.org/bpf/20240404021001.94815-1-kerneljasonxing@gmail.com --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 5 +---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e65ec3fd27998a..a509caf823d618 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -461,10 +461,12 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { + read_lock_bh(&sk->sk_callback_lock); if (psock->saved_data_ready) psock->saved_data_ready(sk); else sk->sk_data_ready(sk); + read_unlock_bh(&sk->sk_callback_lock); } static inline void psock_set_prog(struct bpf_prog **pprog, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4d75ef9d24bfa7..fd20aae30be23c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1226,11 +1226,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); - if (psock) { - read_lock_bh(&sk->sk_callback_lock); + if (psock) sk_psock_data_ready(sk, psock); - read_unlock_bh(&sk->sk_callback_lock); - } rcu_read_unlock(); } } From 37eacb9f6e89fb399a79e952bc9c78eb3e16290e Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 12 Apr 2024 16:11:00 +0200 Subject: [PATCH 05/45] bpf: Fix a verifier verbose message Long ago a map file descriptor in a pseudo ldimm64 instruction could only be present as an immediate value insn[0].imm, and thus this value was used in a verbose verifier message printed when the file descriptor wasn't valid. Since addition of BPF_PSEUDO_MAP_IDX_VALUE/BPF_PSEUDO_MAP_IDX the insn[0].imm field can also contain an index pointing to the file descriptor in the attr.fd_array array. However, if the file descriptor is invalid, the verifier still prints the verbose message containing value of insn[0].imm. Patch the verifier message to always print the actual file descriptor value. Fixes: 387544bfa291 ("bpf: Introduce fd_idx") Signed-off-by: Anton Protopopov Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240412141100.3562942-1-aspsk@isovalent.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98188379d5c77d..aa24beb65393c8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18289,8 +18289,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) f = fdget(fd); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose(env, "fd %d is not pointing to valid bpf_map\n", - insn[0].imm); + verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); return PTR_ERR(map); } From dc7d7447b56bcc9cf79a9c22e4edad200a298e4c Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Tue, 16 Apr 2024 14:42:07 +0800 Subject: [PATCH 06/45] bpf, arm64: Fix incorrect runtime stats When __bpf_prog_enter() returns zero, the arm64 register x20 that stores prog start time is not assigned to zero, causing incorrect runtime stats. To fix it, assign the return value of bpf_prog_enter() to x20 register immediately upon its return. Fixes: efc9909fdce0 ("bpf, arm64: Add bpf trampoline for arm64") Reported-by: Ivan Babrou Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Tested-by: Ivan Babrou Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240416064208.2919073-2-xukuohai@huaweicloud.com --- arch/arm64/net/bpf_jit_comp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 122021f9bdfc87..13eac43c632dfe 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1844,15 +1844,15 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, emit_call(enter_prog, ctx); + /* save return value to callee saved register x20 */ + emit(A64_MOV(1, A64_R(20), A64_R(0)), ctx); + /* if (__bpf_prog_enter(prog) == 0) * goto skip_exec_of_prog; */ branch = ctx->image + ctx->idx; emit(A64_NOP, ctx); - /* save return value to callee saved register x20 */ - emit(A64_MOV(1, A64_R(20), A64_R(0)), ctx); - emit(A64_ADD_I(1, A64_R(0), A64_SP, args_off), ctx); if (!p->jited) emit_addr_mov_i64(A64_R(1), (const u64)p->insnsi, ctx); From 10541b374aa05c8118cc6a529a615882e53f261b Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Tue, 16 Apr 2024 14:42:08 +0800 Subject: [PATCH 07/45] riscv, bpf: Fix incorrect runtime stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When __bpf_prog_enter() returns zero, the s1 register is not set to zero, resulting in incorrect runtime stats. Fix it by setting s1 immediately upon the return of __bpf_prog_enter(). Fixes: 49b5e77ae3e2 ("riscv, bpf: Add bpf trampoline support for RV64") Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Reviewed-by: Pu Lehui Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240416064208.2919073-3-xukuohai@huaweicloud.com --- arch/riscv/net/bpf_jit_comp64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 1adf2f39ce59cb..ec9d692838fca5 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -722,6 +722,9 @@ static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_of if (ret) return ret; + /* store prog start time */ + emit_mv(RV_REG_S1, RV_REG_A0, ctx); + /* if (__bpf_prog_enter(prog) == 0) * goto skip_exec_of_prog; */ @@ -729,9 +732,6 @@ static int invoke_bpf_prog(struct bpf_tramp_link *l, int args_off, int retval_of /* nop reserved for conditional jump */ emit(rv_nop(), ctx); - /* store prog start time */ - emit_mv(RV_REG_S1, RV_REG_A0, ctx); - /* arg1: &args_off */ emit_addi(RV_REG_A0, RV_REG_FP, -args_off, ctx); if (!p->jited) From c6f48506ba30c722dd9d89aa6a40eb1926277dff Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Apr 2024 18:28:32 +0000 Subject: [PATCH 08/45] arm32, bpf: Reimplement sign-extension mov instruction The current implementation of the mov instruction with sign extension has the following problems: 1. It clobbers the source register if it is not stacked because it sign extends the source and then moves it to the destination. 2. If the dst_reg is stacked, the current code doesn't write the value back in case of 64-bit mov. 3. There is room for improvement by emitting fewer instructions. The steps for fixing this and the instructions emitted by the JIT are explained below with examples in all combinations: Case A: offset == 32: ===================== Case A.1: src and dst are stacked registers: -------------------------------------------- 1. Load src_lo into tmp_lo 2. Store tmp_lo into dst_lo 3. Sign extend tmp_lo into tmp_hi 4. Store tmp_hi to dst_hi Example: r3 = (s32)r3 r3 is a stacked register ldr r6, [r11, #-16] // Load r3_lo into tmp_lo // str to dst_lo is not emitted because src_lo == dst_lo asr r7, r6, #31 // Sign extend tmp_lo into tmp_hi str r7, [r11, #-12] // Store tmp_hi into r3_hi Case A.2: src is stacked but dst is not: ---------------------------------------- 1. Load src_lo into dst_lo 2. Sign extend dst_lo into dst_hi Example: r6 = (s32)r3 r6 maps to {ARM_R5, ARM_R4} and r3 is stacked ldr r4, [r11, #-16] // Load r3_lo into r6_lo asr r5, r4, #31 // Sign extend r6_lo into r6_hi Case A.3: src is not stacked but dst is stacked: ------------------------------------------------ 1. Store src_lo into dst_lo 2. Sign extend src_lo into tmp_hi 3. Store tmp_hi to dst_hi Example: r3 = (s32)r6 r3 is stacked and r6 maps to {ARM_R5, ARM_R4} str r4, [r11, #-16] // Store r6_lo to r3_lo asr r7, r4, #31 // Sign extend r6_lo into tmp_hi str r7, [r11, #-12] // Store tmp_hi to dest_hi Case A.4: Both src and dst are not stacked: ------------------------------------------- 1. Mov src_lo into dst_lo 2. Sign extend src_lo into dst_hi Example: (bf) r6 = (s32)r6 r6 maps to {ARM_R5, ARM_R4} // Mov not emitted because dst == src asr r5, r4, #31 // Sign extend r6_lo into r6_hi Case B: offset != 32: ===================== Case B.1: src and dst are stacked registers: -------------------------------------------- 1. Load src_lo into tmp_lo 2. Sign extend tmp_lo according to offset. 3. Store tmp_lo into dst_lo 4. Sign extend tmp_lo into tmp_hi 5. Store tmp_hi to dst_hi Example: r9 = (s8)r3 r9 and r3 are both stacked registers ldr r6, [r11, #-16] // Load r3_lo into tmp_lo lsl r6, r6, #24 // Sign extend tmp_lo asr r6, r6, #24 // .. str r6, [r11, #-56] // Store tmp_lo to r9_lo asr r7, r6, #31 // Sign extend tmp_lo to tmp_hi str r7, [r11, #-52] // Store tmp_hi to r9_hi Case B.2: src is stacked but dst is not: ---------------------------------------- 1. Load src_lo into dst_lo 2. Sign extend dst_lo according to offset. 3. Sign extend tmp_lo into dst_hi Example: r6 = (s8)r3 r6 maps to {ARM_R5, ARM_R4} and r3 is stacked ldr r4, [r11, #-16] // Load r3_lo to r6_lo lsl r4, r4, #24 // Sign extend r6_lo asr r4, r4, #24 // .. asr r5, r4, #31 // Sign extend r6_lo into r6_hi Case B.3: src is not stacked but dst is stacked: ------------------------------------------------ 1. Sign extend src_lo into tmp_lo according to offset. 2. Store tmp_lo into dst_lo. 3. Sign extend src_lo into tmp_hi. 4. Store tmp_hi to dst_hi. Example: r3 = (s8)r1 r3 is stacked and r1 maps to {ARM_R3, ARM_R2} lsl r6, r2, #24 // Sign extend r1_lo to tmp_lo asr r6, r6, #24 // .. str r6, [r11, #-16] // Store tmp_lo to r3_lo asr r7, r6, #31 // Sign extend tmp_lo to tmp_hi str r7, [r11, #-12] // Store tmp_hi to r3_hi Case B.4: Both src and dst are not stacked: ------------------------------------------- 1. Sign extend src_lo into dst_lo according to offset. 2. Sign extend dst_lo into dst_hi. Example: r6 = (s8)r1 r6 maps to {ARM_R5, ARM_R4} and r1 maps to {ARM_R3, ARM_R2} lsl r4, r2, #24 // Sign extend r1_lo to r6_lo asr r4, r4, #24 // .. asr r5, r4, #31 // Sign extend r6_lo to r6_hi Fixes: fc832653fa0d ("arm32, bpf: add support for sign-extension mov instruction") Reported-by: syzbot+186522670e6722692d86@syzkaller.appspotmail.com Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Reviewed-by: Russell King (Oracle) Closes: https://lore.kernel.org/all/000000000000e9a8d80615163f2a@google.com Link: https://lore.kernel.org/bpf/20240419182832.27707-1-puranjay@kernel.org --- arch/arm/net/bpf_jit_32.c | 56 ++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 1d672457d02ff3..72b5cd697f5d94 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -871,16 +871,11 @@ static inline void emit_a32_alu_r64(const bool is64, const s8 dst[], } /* dst = src (4 bytes)*/ -static inline void emit_a32_mov_r(const s8 dst, const s8 src, const u8 off, - struct jit_ctx *ctx) { +static inline void emit_a32_mov_r(const s8 dst, const s8 src, struct jit_ctx *ctx) { const s8 *tmp = bpf2a32[TMP_REG_1]; s8 rt; rt = arm_bpf_get_reg32(src, tmp[0], ctx); - if (off && off != 32) { - emit(ARM_LSL_I(rt, rt, 32 - off), ctx); - emit(ARM_ASR_I(rt, rt, 32 - off), ctx); - } arm_bpf_put_reg32(dst, rt, ctx); } @@ -889,15 +884,15 @@ static inline void emit_a32_mov_r64(const bool is64, const s8 dst[], const s8 src[], struct jit_ctx *ctx) { if (!is64) { - emit_a32_mov_r(dst_lo, src_lo, 0, ctx); + emit_a32_mov_r(dst_lo, src_lo, ctx); if (!ctx->prog->aux->verifier_zext) /* Zero out high 4 bytes */ emit_a32_mov_i(dst_hi, 0, ctx); } else if (__LINUX_ARM_ARCH__ < 6 && ctx->cpu_architecture < CPU_ARCH_ARMv5TE) { /* complete 8 byte move */ - emit_a32_mov_r(dst_lo, src_lo, 0, ctx); - emit_a32_mov_r(dst_hi, src_hi, 0, ctx); + emit_a32_mov_r(dst_lo, src_lo, ctx); + emit_a32_mov_r(dst_hi, src_hi, ctx); } else if (is_stacked(src_lo) && is_stacked(dst_lo)) { const u8 *tmp = bpf2a32[TMP_REG_1]; @@ -917,17 +912,52 @@ static inline void emit_a32_mov_r64(const bool is64, const s8 dst[], static inline void emit_a32_movsx_r64(const bool is64, const u8 off, const s8 dst[], const s8 src[], struct jit_ctx *ctx) { const s8 *tmp = bpf2a32[TMP_REG_1]; - const s8 *rt; + s8 rs; + s8 rd; - rt = arm_bpf_get_reg64(dst, tmp, ctx); + if (is_stacked(dst_lo)) + rd = tmp[1]; + else + rd = dst_lo; + rs = arm_bpf_get_reg32(src_lo, rd, ctx); + /* rs may be one of src[1], dst[1], or tmp[1] */ + + /* Sign extend rs if needed. If off == 32, lower 32-bits of src are moved to dst and sign + * extension only happens in the upper 64 bits. + */ + if (off != 32) { + /* Sign extend rs into rd */ + emit(ARM_LSL_I(rd, rs, 32 - off), ctx); + emit(ARM_ASR_I(rd, rd, 32 - off), ctx); + } else { + rd = rs; + } + + /* Write rd to dst_lo + * + * Optimization: + * Assume: + * 1. dst == src and stacked. + * 2. off == 32 + * + * In this case src_lo was loaded into rd(tmp[1]) but rd was not sign extended as off==32. + * So, we don't need to write rd back to dst_lo as they have the same value. + * This saves us one str instruction. + */ + if (dst_lo != src_lo || off != 32) + arm_bpf_put_reg32(dst_lo, rd, ctx); - emit_a32_mov_r(dst_lo, src_lo, off, ctx); if (!is64) { if (!ctx->prog->aux->verifier_zext) /* Zero out high 4 bytes */ emit_a32_mov_i(dst_hi, 0, ctx); } else { - emit(ARM_ASR_I(rt[0], rt[1], 31), ctx); + if (is_stacked(dst_hi)) { + emit(ARM_ASR_I(tmp[0], rd, 31), ctx); + arm_bpf_put_reg32(dst_hi, tmp[0], ctx); + } else { + emit(ARM_ASR_I(dst_hi, rd, 31), ctx); + } } } From 5bcf0dcbf9066348058b88a510c57f70f384c92c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 18 Apr 2024 09:18:39 +0200 Subject: [PATCH 09/45] xdp: use flags field to disambiguate broadcast redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When redirecting a packet using XDP, the bpf_redirect_map() helper will set up the redirect destination information in struct bpf_redirect_info (using the __bpf_xdp_redirect_map() helper function), and the xdp_do_redirect() function will read this information after the XDP program returns and pass the frame on to the right redirect destination. When using the BPF_F_BROADCAST flag to do multicast redirect to a whole map, __bpf_xdp_redirect_map() sets the 'map' pointer in struct bpf_redirect_info to point to the destination map to be broadcast. And xdp_do_redirect() reacts to the value of this map pointer to decide whether it's dealing with a broadcast or a single-value redirect. However, if the destination map is being destroyed before xdp_do_redirect() is called, the map pointer will be cleared out (by bpf_clear_redirect_map()) without waiting for any XDP programs to stop running. This causes xdp_do_redirect() to think that the redirect was to a single target, but the target pointer is also NULL (since broadcast redirects don't have a single target), so this causes a crash when a NULL pointer is passed to dev_map_enqueue(). To fix this, change xdp_do_redirect() to react directly to the presence of the BPF_F_BROADCAST flag in the 'flags' value in struct bpf_redirect_info to disambiguate between a single-target and a broadcast redirect. And only read the 'map' pointer if the broadcast flag is set, aborting if that has been cleared out in the meantime. This prevents the crash, while keeping the atomic (cmpxchg-based) clearing of the map pointer itself, and without adding any more checks in the non-broadcast fast path. Fixes: e624d4ed4aa8 ("xdp: Extend xdp_redirect_map with broadcast support") Reported-and-tested-by: syzbot+af9492708df9797198d6@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Reviewed-by: Hangbin Liu Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20240418071840.156411-1-toke@redhat.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 8adf95765cdd96..ae5254f712c94b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4360,10 +4360,12 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { @@ -4375,11 +4377,20 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by bpf_clear_redirect_map() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } @@ -4442,9 +4453,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, - void *fwd, - enum bpf_map_type map_type, u32 map_id) + struct bpf_prog *xdp_prog, void *fwd, + enum bpf_map_type map_type, u32 map_id, + u32 flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map; @@ -4454,11 +4465,20 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: - map = READ_ONCE(ri->map); - if (unlikely(map)) { + if (unlikely(flags & BPF_F_BROADCAST)) { + map = READ_ONCE(ri->map); + + /* The map pointer is cleared when the map is being torn + * down by bpf_clear_redirect_map() + */ + if (unlikely(!map)) { + err = -ENOENT; + break; + } + WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, - ri->flags & BPF_F_EXCLUDE_INGRESS); + flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } @@ -4495,9 +4515,11 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { @@ -4517,7 +4539,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, return 0; } - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; From 666854ea9cad844f75a068f32812a2d78004914a Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 24 Apr 2024 21:44:18 +0700 Subject: [PATCH 10/45] ice: ensure the copied buf is NUL terminated Currently, we allocate a count-sized kernel buffer and copy count bytes from userspace to that buffer. Later, we use sscanf on this buffer but we don't ensure that the string is terminated inside the buffer, this can lead to OOB read when using sscanf. Fix this issue by using memdup_user_nul instead of memdup_user. Fixes: 96a9a9341cda ("ice: configure FW logging") Fixes: 73671c3162c8 ("ice: enable FW logging") Reviewed-by: Przemek Kitszel Signed-off-by: Bui Quang Minh Link: https://lore.kernel.org/r/20240424-fix-oob-read-v2-1-f1f1b53a10f4@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_debugfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_debugfs.c b/drivers/net/ethernet/intel/ice/ice_debugfs.c index d252d98218d084..9fc0fd95a13d8f 100644 --- a/drivers/net/ethernet/intel/ice/ice_debugfs.c +++ b/drivers/net/ethernet/intel/ice/ice_debugfs.c @@ -171,7 +171,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, if (*ppos != 0 || count > 8) return -EINVAL; - cmd_buf = memdup_user(buf, count); + cmd_buf = memdup_user_nul(buf, count); if (IS_ERR(cmd_buf)) return PTR_ERR(cmd_buf); @@ -257,7 +257,7 @@ ice_debugfs_nr_messages_write(struct file *filp, const char __user *buf, if (*ppos != 0 || count > 4) return -EINVAL; - cmd_buf = memdup_user(buf, count); + cmd_buf = memdup_user_nul(buf, count); if (IS_ERR(cmd_buf)) return PTR_ERR(cmd_buf); @@ -332,7 +332,7 @@ ice_debugfs_enable_write(struct file *filp, const char __user *buf, if (*ppos != 0 || count > 2) return -EINVAL; - cmd_buf = memdup_user(buf, count); + cmd_buf = memdup_user_nul(buf, count); if (IS_ERR(cmd_buf)) return PTR_ERR(cmd_buf); @@ -428,7 +428,7 @@ ice_debugfs_log_size_write(struct file *filp, const char __user *buf, if (*ppos != 0 || count > 5) return -EINVAL; - cmd_buf = memdup_user(buf, count); + cmd_buf = memdup_user_nul(buf, count); if (IS_ERR(cmd_buf)) return PTR_ERR(cmd_buf); From 8c34096c7fdf272fd4c0c37fe411cd2e3ed0ee9f Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 24 Apr 2024 21:44:19 +0700 Subject: [PATCH 11/45] bna: ensure the copied buf is NUL terminated Currently, we allocate a nbytes-sized kernel buffer and copy nbytes from userspace to that buffer. Later, we use sscanf on this buffer but we don't ensure that the string is terminated inside the buffer, this can lead to OOB read when using sscanf. Fix this issue by using memdup_user_nul instead of memdup_user. Fixes: 7afc5dbde091 ("bna: Add debugfs interface.") Signed-off-by: Bui Quang Minh Link: https://lore.kernel.org/r/20240424-fix-oob-read-v2-2-f1f1b53a10f4@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/brocade/bna/bnad_debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c index 7246e13dd559fc..97291bfbeea589 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c +++ b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c @@ -312,7 +312,7 @@ bnad_debugfs_write_regrd(struct file *file, const char __user *buf, void *kern_buf; /* Copy the user space buf */ - kern_buf = memdup_user(buf, nbytes); + kern_buf = memdup_user_nul(buf, nbytes); if (IS_ERR(kern_buf)) return PTR_ERR(kern_buf); @@ -372,7 +372,7 @@ bnad_debugfs_write_regwr(struct file *file, const char __user *buf, void *kern_buf; /* Copy the user space buf */ - kern_buf = memdup_user(buf, nbytes); + kern_buf = memdup_user_nul(buf, nbytes); if (IS_ERR(kern_buf)) return PTR_ERR(kern_buf); From f299ee709fb45036454ca11e90cb2810fe771878 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 24 Apr 2024 21:44:23 +0700 Subject: [PATCH 12/45] octeontx2-af: avoid off-by-one read from userspace We try to access count + 1 byte from userspace with memdup_user(buffer, count + 1). However, the userspace only provides buffer of count bytes and only these count bytes are verified to be okay to access. To ensure the copied buffer is NUL terminated, we use memdup_user_nul instead. Fixes: 3a2eb515d136 ("octeontx2-af: Fix an off by one in rvu_dbg_qsize_write()") Signed-off-by: Bui Quang Minh Link: https://lore.kernel.org/r/20240424-fix-oob-read-v2-6-f1f1b53a10f4@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index 2500f5ba4f5a42..881d704644fbee 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -999,12 +999,10 @@ static ssize_t rvu_dbg_qsize_write(struct file *filp, u16 pcifunc; int ret, lf; - cmd_buf = memdup_user(buffer, count + 1); + cmd_buf = memdup_user_nul(buffer, count); if (IS_ERR(cmd_buf)) return -ENOMEM; - cmd_buf[count] = '\0'; - cmd_buf_tmp = strchr(cmd_buf, '\n'); if (cmd_buf_tmp) { *cmd_buf_tmp = '\0'; From 4b911a9690d72641879ea6d13cce1de31d346d79 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Apr 2024 19:35:49 -0700 Subject: [PATCH 13/45] nsh: Restore skb->{protocol,data,mac_header} for outer header in nsh_gso_segment(). syzbot triggered various splats (see [0] and links) by a crafted GSO packet of VIRTIO_NET_HDR_GSO_UDP layering the following protocols: ETH_P_8021AD + ETH_P_NSH + ETH_P_IPV6 + IPPROTO_UDP NSH can encapsulate IPv4, IPv6, Ethernet, NSH, and MPLS. As the inner protocol can be Ethernet, NSH GSO handler, nsh_gso_segment(), calls skb_mac_gso_segment() to invoke inner protocol GSO handlers. nsh_gso_segment() does the following for the original skb before calling skb_mac_gso_segment() 1. reset skb->network_header 2. save the original skb->{mac_heaeder,mac_len} in a local variable 3. pull the NSH header 4. resets skb->mac_header 5. set up skb->mac_len and skb->protocol for the inner protocol. and does the following for the segmented skb 6. set ntohs(ETH_P_NSH) to skb->protocol 7. push the NSH header 8. restore skb->mac_header 9. set skb->mac_header + mac_len to skb->network_header 10. restore skb->mac_len There are two problems in 6-7 and 8-9. (a) After 6 & 7, skb->data points to the NSH header, so the outer header (ETH_P_8021AD in this case) is stripped when skb is sent out of netdev. Also, if NSH is encapsulated by NSH + Ethernet (so NSH-Ethernet-NSH), skb_pull() in the first nsh_gso_segment() will make skb->data point to the middle of the outer NSH or Ethernet header because the Ethernet header is not pulled by the second nsh_gso_segment(). (b) While restoring skb->{mac_header,network_header} in 8 & 9, nsh_gso_segment() does not assume that the data in the linear buffer is shifted. However, udp6_ufo_fragment() could shift the data and change skb->mac_header accordingly as demonstrated by syzbot. If this happens, even the restored skb->mac_header points to the middle of the outer header. It seems nsh_gso_segment() has never worked with outer headers so far. At the end of nsh_gso_segment(), the outer header must be restored for the segmented skb, instead of the NSH header. To do that, let's calculate the outer header position relatively from the inner header and set skb->{data,mac_header,protocol} properly. [0]: BUG: KMSAN: uninit-value in ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:524 [inline] BUG: KMSAN: uninit-value in ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] BUG: KMSAN: uninit-value in ipvlan_queue_xmit+0xf44/0x16b0 drivers/net/ipvlan/ipvlan_core.c:668 ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:524 [inline] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] ipvlan_queue_xmit+0xf44/0x16b0 drivers/net/ipvlan/ipvlan_core.c:668 ipvlan_start_xmit+0x5c/0x1a0 drivers/net/ipvlan/ipvlan_main.c:222 __netdev_start_xmit include/linux/netdevice.h:4989 [inline] netdev_start_xmit include/linux/netdevice.h:5003 [inline] xmit_one net/core/dev.c:3547 [inline] dev_hard_start_xmit+0x244/0xa10 net/core/dev.c:3563 __dev_queue_xmit+0x33ed/0x51c0 net/core/dev.c:4351 dev_queue_xmit include/linux/netdevice.h:3171 [inline] packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x8aef/0x9f10 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] __sys_sendto+0x735/0xa10 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1c0 net/socket.c:2199 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook mm/slub.c:3819 [inline] slab_alloc_node mm/slub.c:3860 [inline] __do_kmalloc_node mm/slub.c:3980 [inline] __kmalloc_node_track_caller+0x705/0x1000 mm/slub.c:4001 kmalloc_reserve+0x249/0x4a0 net/core/skbuff.c:582 __alloc_skb+0x352/0x790 net/core/skbuff.c:651 skb_segment+0x20aa/0x7080 net/core/skbuff.c:4647 udp6_ufo_fragment+0xcab/0x1150 net/ipv6/udp_offload.c:109 ipv6_gso_segment+0x14be/0x2ca0 net/ipv6/ip6_offload.c:152 skb_mac_gso_segment+0x3e8/0x760 net/core/gso.c:53 nsh_gso_segment+0x6f4/0xf70 net/nsh/nsh.c:108 skb_mac_gso_segment+0x3e8/0x760 net/core/gso.c:53 __skb_gso_segment+0x4b0/0x730 net/core/gso.c:124 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x107f/0x1930 net/core/dev.c:3628 __dev_queue_xmit+0x1f28/0x51c0 net/core/dev.c:4343 dev_queue_xmit include/linux/netdevice.h:3171 [inline] packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x8aef/0x9f10 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] __sys_sendto+0x735/0xa10 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1c0 net/socket.c:2199 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 1 PID: 5101 Comm: syz-executor421 Not tainted 6.8.0-rc5-syzkaller-00297-gf2e367d6ad3b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 Fixes: c411ed854584 ("nsh: add GSO support") Reported-and-tested-by: syzbot+42a0dc856239de4de60e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=42a0dc856239de4de60e Reported-and-tested-by: syzbot+c298c9f0e46a3c86332b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c298c9f0e46a3c86332b Link: https://lore.kernel.org/netdev/20240415222041.18537-1-kuniyu@amazon.com/ Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240424023549.21862-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/nsh/nsh.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index f4a38bd6a7e04f..bfb7758063f315 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -77,13 +77,15 @@ EXPORT_SYMBOL_GPL(nsh_pop); static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { + unsigned int outer_hlen, mac_len, nsh_len; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; - unsigned int nsh_len, mac_len; - __be16 proto; + __be16 outer_proto, proto; skb_reset_network_header(skb); + outer_proto = skb->protocol; + outer_hlen = skb_mac_header_len(skb); mac_len = skb->mac_len; if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) @@ -113,10 +115,10 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, } for (skb = segs; skb; skb = skb->next) { - skb->protocol = htons(ETH_P_NSH); - __skb_push(skb, nsh_len); - skb->mac_header = mac_offset; - skb->network_header = skb->mac_header + mac_len; + skb->protocol = outer_proto; + __skb_push(skb, nsh_len + outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, outer_hlen); skb->mac_len = mac_len; } From 42f853b42899d9b445763b55c3c8adc72be0f0e1 Mon Sep 17 00:00:00 2001 From: David Bauer Date: Wed, 24 Apr 2024 19:11:10 +0200 Subject: [PATCH 14/45] net l2tp: drop flow hash on forward Drop the flow-hash of the skb when forwarding to the L2TP netdev. This avoids the L2TP qdisc from using the flow-hash from the outer packet, which is identical for every flow within the tunnel. This does not affect every platform but is specific for the ethernet driver. It depends on the platform including L4 information in the flow-hash. One such example is the Mediatek Filogic MT798x family of networking processors. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Acked-by: James Chapman Signed-off-by: David Bauer Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240424171110.13701-1-mail@david-bauer.net Signed-off-by: Paolo Abeni --- net/l2tp/l2tp_eth.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 39e487ccc46881..8ba00ad433c21b 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -127,6 +127,9 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, /* checksums verified by L2TP */ skb->ip_summed = CHECKSUM_NONE; + /* drop outer flow-hash */ + skb_clear_hash(skb); + skb_dst_drop(skb); nf_reset_ct(skb); From 66e13b615a0ce76b785d780ecc9776ba71983629 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:08 +0000 Subject: [PATCH 15/45] bpf: verifier: prevent userspace memory access With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To thwart invalid memory accesses, the JITs add an exception table entry for all such accesses. But in case the src_reg + offset is a userspace address, the BPF program might read that memory if the user has mapped it. Make the verifier add guard instructions around such memory accesses and skip the load if the address falls into the userspace region. The JITs need to implement bpf_arch_uaddress_limit() to define where the userspace addresses end for that architecture or TASK_SIZE is taken as default. The implementation is as follows: REG_AX = SRC_REG if(offset) REG_AX += offset; REG_AX >>= 32; if (REG_AX <= (uaddress_limit >> 32)) DST_REG = 0; else DST_REG = *(size *)(SRC_REG + offset); Comparing just the upper 32 bits of the load address with the upper 32 bits of uaddress_limit implies that the values are being aligned down to a 4GB boundary before comparison. The above means that all loads with address <= uaddress_limit + 4GB are skipped. This is acceptable because there is a large hole (much larger than 4GB) between userspace and kernel space memory, therefore a correctly functioning BPF program should not access this 4GB memory above the userspace. Let's analyze what this patch does to the following fentry program dereferencing an untrusted pointer: SEC("fentry/tcp_v4_connect") int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) { *(volatile long *)sk; return 0; } BPF Program before | BPF Program after ------------------ | ----------------- 0: (79) r1 = *(u64 *)(r1 +0) 0: (79) r1 = *(u64 *)(r1 +0) ----------------------------------------------------------------------- 1: (79) r1 = *(u64 *)(r1 +0) --\ 1: (bf) r11 = r1 ----------------------------\ \ 2: (77) r11 >>= 32 2: (b7) r0 = 0 \ \ 3: (b5) if r11 <= 0x8000 goto pc+2 3: (95) exit \ \-> 4: (79) r1 = *(u64 *)(r1 +0) \ 5: (05) goto pc+1 \ 6: (b7) r1 = 0 \-------------------------------------- 7: (b7) r0 = 0 8: (95) exit As you can see from above, in the best case (off=0), 5 extra instructions are emitted. Now, we analyze the same program after it has gone through the JITs of ARM64 and RISC-V architectures. We follow the single load instruction that has the untrusted pointer and see what instrumentation has been added around it. x86-64 JIT ========== JIT's Instrumentation (upstream) --------------------- 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 7: push %rbp 8: mov %rsp,%rbp b: mov 0x0(%rdi),%rdi --------------------------------- f: movabs $0x800000000000,%r11 19: cmp %r11,%rdi 1c: jb 0x000000000000002a 1e: mov %rdi,%r11 21: add $0x0,%r11 28: jae 0x000000000000002e 2a: xor %edi,%edi 2c: jmp 0x0000000000000032 2e: mov 0x0(%rdi),%rdi --------------------------------- 32: xor %eax,%eax 34: leave 35: ret The x86-64 JIT already emits some instructions to protect against user memory access. This patch doesn't make any changes for the x86-64 JIT. ARM64 JIT ========= No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: add x9, x30, #0x0 0: add x9, x30, #0x0 4: nop 4: nop 8: paciasp 8: paciasp c: stp x29, x30, [sp, #-16]! c: stp x29, x30, [sp, #-16]! 10: mov x29, sp 10: mov x29, sp 14: stp x19, x20, [sp, #-16]! 14: stp x19, x20, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 18: stp x21, x22, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 1c: stp x25, x26, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 20: stp x27, x28, [sp, #-16]! 24: mov x25, sp 24: mov x25, sp 28: mov x26, #0x0 28: mov x26, #0x0 2c: sub x27, x25, #0x0 2c: sub x27, x25, #0x0 30: sub sp, sp, #0x0 30: sub sp, sp, #0x0 34: ldr x0, [x0] 34: ldr x0, [x0] -------------------------------------------------------------------------------- 38: ldr x0, [x0] ----------\ 38: add x9, x0, #0x0 -----------------------------------\\ 3c: lsr x9, x9, #32 3c: mov x7, #0x0 \\ 40: cmp x9, #0x10, lsl #12 40: mov sp, sp \\ 44: b.ls 0x0000000000000050 44: ldp x27, x28, [sp], #16 \\--> 48: ldr x0, [x0] 48: ldp x25, x26, [sp], #16 \ 4c: b 0x0000000000000054 4c: ldp x21, x22, [sp], #16 \ 50: mov x0, #0x0 50: ldp x19, x20, [sp], #16 \--------------------------------------- 54: ldp x29, x30, [sp], #16 54: mov x7, #0x0 58: add x0, x7, #0x0 58: mov sp, sp 5c: autiasp 5c: ldp x27, x28, [sp], #16 60: ret 60: ldp x25, x26, [sp], #16 64: nop 64: ldp x21, x22, [sp], #16 68: ldr x10, 0x0000000000000070 68: ldp x19, x20, [sp], #16 6c: br x10 6c: ldp x29, x30, [sp], #16 70: add x0, x7, #0x0 74: autiasp 78: ret 7c: nop 80: ldr x10, 0x0000000000000088 84: br x10 There are 6 extra instructions added in ARM64 in the best case. This will become 7 in the worst case (off != 0). RISC-V JIT (RISCV_ISA_C Disabled) ========== No Intrumentation Verifier's Instrumentation (upstream) (This patch) ----------------- -------------------------- 0: nop 0: nop 4: nop 4: nop 8: li a6, 33 8: li a6, 33 c: addi sp, sp, -16 c: addi sp, sp, -16 10: sd s0, 8(sp) 10: sd s0, 8(sp) 14: addi s0, sp, 16 14: addi s0, sp, 16 18: ld a0, 0(a0) 18: ld a0, 0(a0) --------------------------------------------------------------- 1c: ld a0, 0(a0) --\ 1c: mv t0, a0 --------------------------\ \ 20: srli t0, t0, 32 20: li a5, 0 \ \ 24: lui t1, 4096 24: ld s0, 8(sp) \ \ 28: sext.w t1, t1 28: addi sp, sp, 16 \ \ 2c: bgeu t1, t0, 12 2c: sext.w a0, a5 \ \--> 30: ld a0, 0(a0) 30: ret \ 34: j 8 \ 38: li a0, 0 \------------------------------ 3c: li a5, 0 40: ld s0, 8(sp) 44: addi sp, sp, 16 48: sext.w a0, a5 4c: ret There are 7 extra instructions added in RISC-V. Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables") Reported-by: Breno Leitao Suggested-by: Alexei Starovoitov Acked-by: Ilya Leoshkevich Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 6 ++++++ include/linux/filter.h | 1 + kernel/bpf/core.c | 9 +++++++++ kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index df5fac428408fe..b520b66ad14b1a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3473,3 +3473,9 @@ bool bpf_jit_supports_ptr_xchg(void) { return true; } + +/* x86-64 JIT emits its own code to filter user addresses so return 0 here */ +u64 bpf_arch_uaddress_limit(void) +{ + return 0; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index c99bc3df2d28e3..219ee7a7687443 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -963,6 +963,7 @@ bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); +u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 696bc55de8e82e..1ea5ce5bb59933 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2942,6 +2942,15 @@ bool __weak bpf_jit_supports_arena(void) return false; } +u64 __weak bpf_arch_uaddress_limit(void) +{ +#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) + return TASK_SIZE; +#else + return 0; +#endif +} + /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa24beb65393c8..cb7ad1f795e18b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19675,6 +19675,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } + /* Make it impossible to de-reference a userspace address */ + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { + struct bpf_insn *patch = &insn_buf[0]; + u64 uaddress_limit = bpf_arch_uaddress_limit(); + + if (!uaddress_limit) + goto next_insn; + + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + if (insn->off) + *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); + *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); + *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); + *patch++ = *insn; + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ if (BPF_CLASS(insn->code) == BPF_LD && (BPF_MODE(insn->code) == BPF_ABS || From b599d7d26d6ad1fc9975218574bc2ca6d0293cfd Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:09 +0000 Subject: [PATCH 16/45] bpf, x86: Fix PROBE_MEM runtime load check When a load is marked PROBE_MEM - e.g. due to PTR_UNTRUSTED access - the address being loaded from is not necessarily valid. The BPF jit sets up exception handlers for each such load which catch page faults and 0 out the destination register. If the address for the load is outside kernel address space, the load will escape the exception handling and crash the kernel. To prevent this from happening, the emits some instruction to verify that addr is > end of userspace addresses. x86 has a legacy vsyscall ABI where a page at address 0xffffffffff600000 is mapped with user accessible permissions. The addresses in this page are considered userspace addresses by the fault handler. Therefore, a BPF program accessing this page will crash the kernel. This patch fixes the runtime checks to also check that the PROBE_MEM address is below VSYSCALL_ADDR. Example BPF program: SEC("fentry/tcp_v4_connect") int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) { *(volatile unsigned long *)&sk->sk_tsq_flags; return 0; } BPF Assembly: 0: (79) r1 = *(u64 *)(r1 +0) 1: (79) r1 = *(u64 *)(r1 +344) 2: (b7) r0 = 0 3: (95) exit x86-64 JIT ========== BEFORE AFTER ------ ----- 0: nopl 0x0(%rax,%rax,1) 0: nopl 0x0(%rax,%rax,1) 5: xchg %ax,%ax 5: xchg %ax,%ax 7: push %rbp 7: push %rbp 8: mov %rsp,%rbp 8: mov %rsp,%rbp b: mov 0x0(%rdi),%rdi b: mov 0x0(%rdi),%rdi ------------------------------------------------------------------------------- f: movabs $0x100000000000000,%r11 f: movabs $0xffffffffff600000,%r10 19: add $0x2a0,%rdi 19: mov %rdi,%r11 20: cmp %r11,%rdi 1c: add $0x2a0,%r11 23: jae 0x0000000000000029 23: sub %r10,%r11 25: xor %edi,%edi 26: movabs $0x100000000a00000,%r10 27: jmp 0x000000000000002d 30: cmp %r10,%r11 29: mov 0x0(%rdi),%rdi 33: ja 0x0000000000000039 --------------------------------\ 35: xor %edi,%edi 2d: xor %eax,%eax \ 37: jmp 0x0000000000000040 2f: leave \ 39: mov 0x2a0(%rdi),%rdi 30: ret \-------------------------------------------- 40: xor %eax,%eax 42: leave 43: ret Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 57 ++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b520b66ad14b1a..59cbc94b6e6903 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1807,36 +1807,41 @@ st: if (is_imm8(insn->off)) if (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX) { /* Conservatively check that src_reg + insn->off is a kernel address: - * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE - * src_reg is used as scratch for src_reg += insn->off and restored - * after emit_ldx if necessary + * src_reg + insn->off > TASK_SIZE_MAX + PAGE_SIZE + * and + * src_reg + insn->off < VSYSCALL_ADDR */ - u64 limit = TASK_SIZE_MAX + PAGE_SIZE; + u64 limit = TASK_SIZE_MAX + PAGE_SIZE - VSYSCALL_ADDR; u8 *end_of_jmp; - /* At end of these emitted checks, insn->off will have been added - * to src_reg, so no need to do relative load with insn->off offset - */ - insn_off = 0; + /* movabsq r10, VSYSCALL_ADDR */ + emit_mov_imm64(&prog, BPF_REG_AX, (long)VSYSCALL_ADDR >> 32, + (u32)(long)VSYSCALL_ADDR); - /* movabsq r11, limit */ - EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG)); - EMIT((u32)limit, 4); - EMIT(limit >> 32, 4); + /* mov src_reg, r11 */ + EMIT_mov(AUX_REG, src_reg); if (insn->off) { - /* add src_reg, insn->off */ - maybe_emit_1mod(&prog, src_reg, true); - EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off); + /* add r11, insn->off */ + maybe_emit_1mod(&prog, AUX_REG, true); + EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off); } - /* cmp src_reg, r11 */ - maybe_emit_mod(&prog, src_reg, AUX_REG, true); - EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG)); + /* sub r11, r10 */ + maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true); + EMIT2(0x29, add_2reg(0xC0, AUX_REG, BPF_REG_AX)); + + /* movabsq r10, limit */ + emit_mov_imm64(&prog, BPF_REG_AX, (long)limit >> 32, + (u32)(long)limit); + + /* cmp r10, r11 */ + maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true); + EMIT2(0x39, add_2reg(0xC0, AUX_REG, BPF_REG_AX)); - /* if unsigned '>=', goto load */ - EMIT2(X86_JAE, 0); + /* if unsigned '>', goto load */ + EMIT2(X86_JA, 0); end_of_jmp = prog; /* xor dst_reg, dst_reg */ @@ -1862,18 +1867,6 @@ st: if (is_imm8(insn->off)) /* populate jmp_offset for JMP above */ start_of_ldx[-1] = prog - start_of_ldx; - if (insn->off && src_reg != dst_reg) { - /* sub src_reg, insn->off - * Restore src_reg after "add src_reg, insn->off" in prev - * if statement. But if src_reg == dst_reg, emit_ldx - * above already clobbered src_reg, so no need to restore. - * If add src_reg, insn->off was unnecessary, no need to - * restore either. - */ - maybe_emit_1mod(&prog, src_reg, true); - EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off); - } - if (!bpf_prog->aux->extable) break; From 7cd6750d9a560fa69bb640a7280479d6a67999ad Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 24 Apr 2024 10:02:10 +0000 Subject: [PATCH 17/45] selftests/bpf: Test PROBE_MEM of VSYSCALL_ADDR on x86-64 The vsyscall is a legacy API for fast execution of system calls. It maps a page at address VSYSCALL_ADDR into the userspace program. This address is in the top 10MB of the address space: ffffffffff600000 - ffffffffff600fff | 4 kB | legacy vsyscall ABI The last commit fixes the x86-64 BPF JIT to skip accessing addresses in this memory region. Add this address to bpf_testmod_return_ptr() so we can make sure that it is fixed. After this change and without the previous commit, subprogs_extable selftest will crash the kernel. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240424100210.11982-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index 39ad96a18123f6..edcd26106557b4 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -205,6 +205,9 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg) case 5: return (void *)~(1ull << 30); /* trigger extable */ case 6: return &f; /* valid addr */ case 7: return (void *)((long)&f | 1); /* kernel tricks */ +#ifdef CONFIG_X86_64 + case 8: return (void *)VSYSCALL_ADDR; /* vsyscall page address */ +#endif default: return NULL; } } From 6a30653b604aaad1bf0f2e74b068ceb8b6fc7aea Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Apr 2024 09:39:32 +0100 Subject: [PATCH 18/45] Fix a potential infinite loop in extract_user_to_sg() Fix extract_user_to_sg() so that it will break out of the loop if iov_iter_extract_pages() returns 0 rather than looping around forever. [Note that I've included two fixes lines as the function got moved to a different file and renamed] Fixes: 85dd2c8ff368 ("netfs: Add a function to extract a UBUF or IOVEC into a BVEC iterator") Fixes: f5f82cd18732 ("Move netfs_extract_iter_to_sg() to lib/scatterlist.c") Signed-off-by: David Howells cc: Jeff Layton cc: Steve French cc: Herbert Xu cc: netfs@lists.linux.dev Link: https://lore.kernel.org/r/1967121.1714034372@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- lib/scatterlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 68b45c82c37a69..7bc2220fea8058 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -1124,7 +1124,7 @@ static ssize_t extract_user_to_sg(struct iov_iter *iter, do { res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, extraction_flags, &off); - if (res < 0) + if (res <= 0) goto failed; len = res; From d85cf67a339685beae1d0aee27b7f61da95455be Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Thu, 25 Apr 2024 15:27:19 -0700 Subject: [PATCH 19/45] net: bcmgenet: synchronize EXT_RGMII_OOB_CTRL access The EXT_RGMII_OOB_CTRL register can be written from different contexts. It is predominantly written from the adjust_link handler which is synchronized by the phydev->lock, but can also be written from a different context when configuring the mii in bcmgenet_mii_config(). The chances of contention are quite low, but it is conceivable that adjust_link could occur during resume when WoL is enabled so use the phydev->lock synchronizer in bcmgenet_mii_config() to be sure. Fixes: afe3f907d20f ("net: bcmgenet: power on MII block for all MII modes") Cc: stable@vger.kernel.org Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmmii.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 9ada8935574755..86a4aa72b3d4b3 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -2,7 +2,7 @@ /* * Broadcom GENET MDIO routines * - * Copyright (c) 2014-2017 Broadcom + * Copyright (c) 2014-2024 Broadcom */ #include @@ -275,6 +275,7 @@ int bcmgenet_mii_config(struct net_device *dev, bool init) * block for the interface to work, unconditionally clear the * Out-of-band disable since we do not need it. */ + mutex_lock(&phydev->lock); reg = bcmgenet_ext_readl(priv, EXT_RGMII_OOB_CTRL); reg &= ~OOB_DISABLE; if (priv->ext_phy) { @@ -286,6 +287,7 @@ int bcmgenet_mii_config(struct net_device *dev, bool init) reg |= RGMII_MODE_EN; } bcmgenet_ext_writel(priv, reg, EXT_RGMII_OOB_CTRL); + mutex_unlock(&phydev->lock); if (init) dev_info(kdev, "configuring instance for %s\n", phy_name); From 2dbe5f19368caae63b1f59f5bc2af78c7d522b3a Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Thu, 25 Apr 2024 15:27:20 -0700 Subject: [PATCH 20/45] net: bcmgenet: synchronize use of bcmgenet_set_rx_mode() The ndo_set_rx_mode function is synchronized with the netif_addr_lock spinlock and BHs disabled. Since this function is also invoked directly from the driver the same synchronization should be applied. Fixes: 72f96347628e ("net: bcmgenet: set Rx mode before starting netif") Cc: stable@vger.kernel.org Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index b1f84b37032a78..5452b7dc6e6ae2 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2,7 +2,7 @@ /* * Broadcom GENET (Gigabit Ethernet) controller driver * - * Copyright (c) 2014-2020 Broadcom + * Copyright (c) 2014-2024 Broadcom */ #define pr_fmt(fmt) "bcmgenet: " fmt @@ -3334,7 +3334,9 @@ static void bcmgenet_netif_start(struct net_device *dev) struct bcmgenet_priv *priv = netdev_priv(dev); /* Start the network engine */ + netif_addr_lock_bh(dev); bcmgenet_set_rx_mode(dev); + netif_addr_unlock_bh(dev); bcmgenet_enable_rx_napi(priv); umac_enable_set(priv, CMD_TX_EN | CMD_RX_EN, true); From 0d5e2a82232605b337972fb2c7d0cbc46898aca1 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Thu, 25 Apr 2024 15:27:21 -0700 Subject: [PATCH 21/45] net: bcmgenet: synchronize UMAC_CMD access The UMAC_CMD register is written from different execution contexts and has insufficient synchronization protections to prevent possible corruption. Of particular concern are the acceses from the phy_device delayed work context used by the adjust_link call and the BH context that may be used by the ndo_set_rx_mode call. A spinlock is added to the driver to protect contended register accesses (i.e. reg_lock) and it is used to synchronize accesses to UMAC_CMD. Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file") Cc: stable@vger.kernel.org Signed-off-by: Doug Berger Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 12 +++++++++++- drivers/net/ethernet/broadcom/genet/bcmgenet.h | 4 +++- drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c | 8 +++++++- drivers/net/ethernet/broadcom/genet/bcmmii.c | 2 ++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 5452b7dc6e6ae2..c7e7dac057a336 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2467,14 +2467,18 @@ static void umac_enable_set(struct bcmgenet_priv *priv, u32 mask, bool enable) { u32 reg; + spin_lock_bh(&priv->reg_lock); reg = bcmgenet_umac_readl(priv, UMAC_CMD); - if (reg & CMD_SW_RESET) + if (reg & CMD_SW_RESET) { + spin_unlock_bh(&priv->reg_lock); return; + } if (enable) reg |= mask; else reg &= ~mask; bcmgenet_umac_writel(priv, reg, UMAC_CMD); + spin_unlock_bh(&priv->reg_lock); /* UniMAC stops on a packet boundary, wait for a full-size packet * to be processed @@ -2490,8 +2494,10 @@ static void reset_umac(struct bcmgenet_priv *priv) udelay(10); /* issue soft reset and disable MAC while updating its registers */ + spin_lock_bh(&priv->reg_lock); bcmgenet_umac_writel(priv, CMD_SW_RESET, UMAC_CMD); udelay(2); + spin_unlock_bh(&priv->reg_lock); } static void bcmgenet_intr_disable(struct bcmgenet_priv *priv) @@ -3597,16 +3603,19 @@ static void bcmgenet_set_rx_mode(struct net_device *dev) * 3. The number of filters needed exceeds the number filters * supported by the hardware. */ + spin_lock(&priv->reg_lock); reg = bcmgenet_umac_readl(priv, UMAC_CMD); if ((dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) || (nfilter > MAX_MDF_FILTER)) { reg |= CMD_PROMISC; bcmgenet_umac_writel(priv, reg, UMAC_CMD); + spin_unlock(&priv->reg_lock); bcmgenet_umac_writel(priv, 0, UMAC_MDF_CTRL); return; } else { reg &= ~CMD_PROMISC; bcmgenet_umac_writel(priv, reg, UMAC_CMD); + spin_unlock(&priv->reg_lock); } /* update MDF filter */ @@ -4005,6 +4014,7 @@ static int bcmgenet_probe(struct platform_device *pdev) goto err; } + spin_lock_init(&priv->reg_lock); spin_lock_init(&priv->lock); /* Set default pause parameters */ diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h index 7523b60b3c1c01..43b923c48b14f4 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2014-2020 Broadcom + * Copyright (c) 2014-2024 Broadcom */ #ifndef __BCMGENET_H__ @@ -573,6 +573,8 @@ struct bcmgenet_rxnfc_rule { /* device context */ struct bcmgenet_priv { void __iomem *base; + /* reg_lock: lock to serialize access to shared registers */ + spinlock_t reg_lock; enum bcmgenet_version version; struct net_device *dev; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index 7a41cad5788f4e..1248792d7fd4d2 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -2,7 +2,7 @@ /* * Broadcom GENET (Gigabit Ethernet) Wake-on-LAN support * - * Copyright (c) 2014-2020 Broadcom + * Copyright (c) 2014-2024 Broadcom */ #define pr_fmt(fmt) "bcmgenet_wol: " fmt @@ -151,6 +151,7 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, } /* Can't suspend with WoL if MAC is still in reset */ + spin_lock_bh(&priv->reg_lock); reg = bcmgenet_umac_readl(priv, UMAC_CMD); if (reg & CMD_SW_RESET) reg &= ~CMD_SW_RESET; @@ -158,6 +159,7 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, /* disable RX */ reg &= ~CMD_RX_EN; bcmgenet_umac_writel(priv, reg, UMAC_CMD); + spin_unlock_bh(&priv->reg_lock); mdelay(10); if (priv->wolopts & (WAKE_MAGIC | WAKE_MAGICSECURE)) { @@ -203,6 +205,7 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, } /* Enable CRC forward */ + spin_lock_bh(&priv->reg_lock); reg = bcmgenet_umac_readl(priv, UMAC_CMD); priv->crc_fwd_en = 1; reg |= CMD_CRC_FWD; @@ -210,6 +213,7 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv, /* Receiver must be enabled for WOL MP detection */ reg |= CMD_RX_EN; bcmgenet_umac_writel(priv, reg, UMAC_CMD); + spin_unlock_bh(&priv->reg_lock); reg = UMAC_IRQ_MPD_R; if (hfb_enable) @@ -256,7 +260,9 @@ void bcmgenet_wol_power_up_cfg(struct bcmgenet_priv *priv, } /* Disable CRC Forward */ + spin_lock_bh(&priv->reg_lock); reg = bcmgenet_umac_readl(priv, UMAC_CMD); reg &= ~CMD_CRC_FWD; bcmgenet_umac_writel(priv, reg, UMAC_CMD); + spin_unlock_bh(&priv->reg_lock); } diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 86a4aa72b3d4b3..c4a3698cef66f6 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -76,6 +76,7 @@ static void bcmgenet_mac_config(struct net_device *dev) reg |= RGMII_LINK; bcmgenet_ext_writel(priv, reg, EXT_RGMII_OOB_CTRL); + spin_lock_bh(&priv->reg_lock); reg = bcmgenet_umac_readl(priv, UMAC_CMD); reg &= ~((CMD_SPEED_MASK << CMD_SPEED_SHIFT) | CMD_HD_EN | @@ -88,6 +89,7 @@ static void bcmgenet_mac_config(struct net_device *dev) reg |= CMD_TX_EN | CMD_RX_EN; } bcmgenet_umac_writel(priv, reg, UMAC_CMD); + spin_unlock_bh(&priv->reg_lock); active = phy_init_eee(phydev, 0) >= 0; bcmgenet_eee_enable_set(dev, From 16f50301a804a4b86c75fdd0dfb50ec263ed41b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Apr 2024 11:37:59 -0700 Subject: [PATCH 22/45] MAINTAINERS: add an explicit entry for YNL Donald has been contributing to YNL a lot. Let's create a dedicated MAINTAINERS entry and add make his involvement official :) Signed-off-by: Jakub Kicinski Acked-by: Donald Hunter Signed-off-by: David S. Miller --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index aff81a252865d9..4a2e864fce5168 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24468,6 +24468,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/har F: Documentation/admin-guide/LSM/Yama.rst F: security/yama/ +YAML NETLINK (YNL) +M: Donald Hunter +M: Jakub Kicinski +F: Documentation/netlink/ +F: Documentation/userspace-api/netlink/intro-specs.rst +F: Documentation/userspace-api/netlink/specs.rst +F: tools/net/ynl/ + YEALINK PHONE DRIVER M: Henk Vergonet L: usbb2k-api-dev@nongnu.org From e25714466abd9d96901b15efddf82c60a38abd86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Fri, 26 Apr 2024 09:12:23 +0000 Subject: [PATCH 23/45] net: qede: sanitize 'rc' in qede_add_tc_flower_fltr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicitly set 'rc' (return code), before jumping to the unlock and return path. By not having any code depend on that 'rc' remains at it's initial value of -EINVAL, then we can re-use 'rc' for the return code of function calls in subsequent patches. Only compile tested. Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_filter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index a5ac21a0ee33ff..8ecdfa36a68544 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1868,8 +1868,8 @@ int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto, struct flow_cls_offload *f) { struct qede_arfs_fltr_node *n; - int min_hlen, rc = -EINVAL; struct qede_arfs_tuple t; + int min_hlen, rc; __qede_lock(edev); @@ -1879,8 +1879,10 @@ int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto, } /* parse flower attribute and prepare filter */ - if (qede_parse_flow_attr(edev, proto, f->rule, &t)) + if (qede_parse_flow_attr(edev, proto, f->rule, &t)) { + rc = -EINVAL; goto unlock; + } /* Validate profile mode and number of filters */ if ((edev->arfs->filter_count && edev->arfs->mode != t.mode) || @@ -1888,12 +1890,15 @@ int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto, DP_NOTICE(edev, "Filter configuration invalidated, filter mode=0x%x, configured mode=0x%x, filter count=0x%x\n", t.mode, edev->arfs->mode, edev->arfs->filter_count); + rc = -EINVAL; goto unlock; } /* parse tc actions and get the vf_id */ - if (qede_parse_actions(edev, &f->rule->action, f->common.extack)) + if (qede_parse_actions(edev, &f->rule->action, f->common.extack)) { + rc = -EINVAL; goto unlock; + } if (qede_flow_find_fltr(edev, &t)) { rc = -EEXIST; From fcee2065a178f78be6fd516302830378b17dba3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Fri, 26 Apr 2024 09:12:24 +0000 Subject: [PATCH 24/45] net: qede: use return from qede_parse_flow_attr() for flower MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In qede_add_tc_flower_fltr(), when calling qede_parse_flow_attr() then the return code was only used for a non-zero check, and then -EINVAL was returned. qede_parse_flow_attr() can currently fail with: * -EINVAL * -EOPNOTSUPP * -EPROTONOSUPPORT This patch changes the code to use the actual return code, not just return -EINVAL. The blaimed commit introduced these functions. Only compile tested. Fixes: 2ce9c93eaca6 ("qede: Ingress tc flower offload (drop action) support.") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 8ecdfa36a68544..25ef0f4258cb11 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1879,10 +1879,9 @@ int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto, } /* parse flower attribute and prepare filter */ - if (qede_parse_flow_attr(edev, proto, f->rule, &t)) { - rc = -EINVAL; + rc = qede_parse_flow_attr(edev, proto, f->rule, &t); + if (rc) goto unlock; - } /* Validate profile mode and number of filters */ if ((edev->arfs->filter_count && edev->arfs->mode != t.mode) || From 27b44414a34b108c5a37cd5b4894f606061d86e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Fri, 26 Apr 2024 09:12:25 +0000 Subject: [PATCH 25/45] net: qede: use return from qede_parse_flow_attr() for flow_spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In qede_flow_spec_to_rule(), when calling qede_parse_flow_attr() then the return code was only used for a non-zero check, and then -EINVAL was returned. qede_parse_flow_attr() can currently fail with: * -EINVAL * -EOPNOTSUPP * -EPROTONOSUPPORT This patch changes the code to use the actual return code, not just return -EINVAL. The blaimed commit introduced qede_flow_spec_to_rule(), and this call to qede_parse_flow_attr(), it looks like it just duplicated how it was already used. Only compile tested. Fixes: 37c5d3efd7f8 ("qede: use ethtool_rx_flow_rule() to remove duplicated parser code") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 25ef0f4258cb11..377d661f70f783 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -2002,10 +2002,9 @@ static int qede_flow_spec_to_rule(struct qede_dev *edev, if (IS_ERR(flow)) return PTR_ERR(flow); - if (qede_parse_flow_attr(edev, proto, flow->rule, t)) { - err = -EINVAL; + err = qede_parse_flow_attr(edev, proto, flow->rule, t); + if (err) goto err_out; - } /* Make sure location is valid and filter isn't already set */ err = qede_flow_spec_validate(edev, &flow->rule->action, t, From f26f719a36e56381a1f4230e5364e7ad4d485888 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Fri, 26 Apr 2024 09:12:26 +0000 Subject: [PATCH 26/45] net: qede: use return from qede_parse_actions() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling qede_parse_actions() then the return code was only used for a non-zero check, and then -EINVAL was returned. qede_parse_actions() can currently fail with: * -EINVAL * -EOPNOTSUPP This patch changes the code to use the actual return code, not just return -EINVAL. The blaimed commit broke the implicit assumption that only -EINVAL would ever be returned. Only compile tested. Fixes: 319a1d19471e ("flow_offload: check for basic action hw stats type") Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede_filter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 377d661f70f783..cb6b33a228ea20 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1894,10 +1894,9 @@ int qede_add_tc_flower_fltr(struct qede_dev *edev, __be16 proto, } /* parse tc actions and get the vf_id */ - if (qede_parse_actions(edev, &f->rule->action, f->common.extack)) { - rc = -EINVAL; + rc = qede_parse_actions(edev, &f->rule->action, f->common.extack); + if (rc) goto unlock; - } if (qede_flow_find_fltr(edev, &t)) { rc = -EEXIST; From 6dee402daba4eb8677a9438ebdcd8fe90ddd4326 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 26 Apr 2024 17:27:17 +0200 Subject: [PATCH 27/45] vxlan: Fix racy device stats updates. VXLAN devices update their stats locklessly. Therefore these counters should either be stored in per-cpu data structures or the updates should be done using atomic increments. Since the net_device_core_stats infrastructure is already used in vxlan_rcv(), use it for the other rx_dropped and tx_dropped counter updates. Update the other counters atomically using DEV_STATS_INC(). Fixes: d342894c5d2f ("vxlan: virtual extensible lan") Signed-off-by: Guillaume Nault Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index ba319fc2195719..0cd9e44c7be85f 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1766,8 +1766,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) skb_reset_network_header(skb); if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { - ++vxlan->dev->stats.rx_frame_errors; - ++vxlan->dev->stats.rx_errors; + DEV_STATS_INC(vxlan->dev, rx_frame_errors); + DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; @@ -1837,7 +1837,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { - dev->stats.tx_dropped++; + dev_core_stats_tx_dropped_inc(dev); goto out; } parp = arp_hdr(skb); @@ -1893,7 +1893,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { - dev->stats.rx_dropped++; + dev_core_stats_rx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2052,7 +2052,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; if (netif_rx(reply) == NET_RX_DROP) { - dev->stats.rx_dropped++; + dev_core_stats_rx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2263,7 +2263,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, len); } else { drop: - dev->stats.rx_dropped++; + dev_core_stats_rx_dropped_inc(dev); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } @@ -2295,7 +2295,7 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, addr_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); @@ -2559,7 +2559,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, return; drop: - dev->stats.tx_dropped++; + dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return; @@ -2567,11 +2567,11 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tx_error: rcu_read_unlock(); if (err == -ELOOP) - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_release(ndst); - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); } @@ -2604,7 +2604,7 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, return; drop: - dev->stats.tx_dropped++; + dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); @@ -2642,7 +2642,7 @@ static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, return NETDEV_TX_OK; drop: - dev->stats.tx_dropped++; + dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); @@ -2739,7 +2739,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); - dev->stats.tx_dropped++; + dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb(skb); From b22ea4ef4c3438817fcb604255b55b0058ed8c64 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 26 Apr 2024 17:27:19 +0200 Subject: [PATCH 28/45] vxlan: Add missing VNI filter counter update in arp_reduce(). VXLAN stores per-VNI statistics using vxlan_vnifilter_count(). These statistics were not updated when arp_reduce() failed its pskb_may_pull() call. Use vxlan_vnifilter_count() to update the VNI counter when that happens. Fixes: 4095e0e1328a ("drivers: vxlan: vnifilter: per vni stats") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 0cd9e44c7be85f..c9e4e03ad214f3 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1838,6 +1838,8 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) if (!pskb_may_pull(skb, arp_hdr_len(dev))) { dev_core_stats_tx_dropped_inc(dev); + vxlan_vnifilter_count(vxlan, vni, NULL, + VXLAN_VNI_STATS_TX_DROPS, 0); goto out; } parp = arp_hdr(skb); From 9f8eeea1643c213c0e1ad2e546a15536200d216b Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Sun, 28 Apr 2024 19:16:38 +0800 Subject: [PATCH 29/45] rxrpc: Fix using alignmask being zero for __page_frag_alloc_align() rxrpc_alloc_data_txbuf() may be called with data_align being zero in none_alloc_txbuf() and rxkad_alloc_txbuf(), data_align is supposed to be an order-based alignment value, but zero is not a valid order-based alignment value, and '~(data_align - 1)' doesn't result in a valid mask-based alignment value for __page_frag_alloc_align(). Fix it by passing a valid order-based alignment value in none_alloc_txbuf() and rxkad_alloc_txbuf(). Also use page_frag_alloc_align() expecting an order-based alignment value in rxrpc_alloc_data_txbuf() to avoid doing the alignment converting operation and to catch possible invalid alignment value in the future. Remove the 'if (data_align)' checking too, as it is always true for a valid order-based alignment value. Fixes: 6b2536462fd4 ("rxrpc: Fix use of changed alignment param to page_frag_alloc_align()") Fixes: 49489bb03a50 ("rxrpc: Do zerocopy using MSG_SPLICE_PAGES and page frags") CC: Alexander Duyck Signed-off-by: Yunsheng Lin Acked-by: David Howells Link: https://lore.kernel.org/r/20240428111640.27306-1-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- net/rxrpc/insecure.c | 2 +- net/rxrpc/rxkad.c | 2 +- net/rxrpc/txbuf.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index f2701068ed9e4c..6716c021a532e1 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -19,7 +19,7 @@ static int none_init_connection_security(struct rxrpc_connection *conn, */ static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { - return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 0, gfp); + return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp); } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index f1a68270862db9..48a1475e6b0634 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -155,7 +155,7 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem switch (call->conn->security_level) { default: space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - return rxrpc_alloc_data_txbuf(call, space, 0, gfp); + return rxrpc_alloc_data_txbuf(call, space, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index e0679658d9de0e..c3913d8a50d340 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -21,20 +21,20 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ { struct rxrpc_wire_header *whdr; struct rxrpc_txbuf *txb; - size_t total, hoff = 0; + size_t total, hoff; void *buf; txb = kmalloc(sizeof(*txb), gfp); if (!txb) return NULL; - if (data_align) - hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); + hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); total = hoff + sizeof(*whdr) + data_size; + data_align = umax(data_align, L1_CACHE_BYTES); mutex_lock(&call->conn->tx_data_alloc_lock); - buf = __page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, - ~(data_align - 1) & ~(L1_CACHE_BYTES - 1)); + buf = page_frag_alloc_align(&call->conn->tx_data_alloc, total, gfp, + data_align); mutex_unlock(&call->conn->tx_data_alloc_lock); if (!buf) { kfree(txb); From 9067eccdd7849dd120d5495dbd5a686fa6ed2c1a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 29 Apr 2024 11:11:47 +0200 Subject: [PATCH 30/45] cxgb4: Properly lock TX queue for the selftest. The selftest for the driver sends a dummy packet and checks if the packet will be received properly as it should be. The regular TX path and the selftest can use the same network queue so locking is required and was missing in the selftest path. This was addressed in the commit cited below. Unfortunately locking the TX queue requires BH to be disabled which is not the case in selftest path which is invoked in process context. Lockdep should be complaining about this. Use __netif_tx_lock_bh() for TX queue locking. Fixes: c650e04898072 ("cxgb4: Fix race between loopback and normal Tx path") Reported-by: "John B. Wyatt IV" Closes: https://lore.kernel.org/all/Zic0ot5aGgR-V4Ks@thinkpad2021/ Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20240429091147.YWAaal4v@linutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 49d5808b7d11d2..de52bcb884c417 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2670,12 +2670,12 @@ int cxgb4_selftest_lb_pkt(struct net_device *netdev) lb->loopback = 1; q = &adap->sge.ethtxq[pi->first_qset]; - __netif_tx_lock(q->txq, smp_processor_id()); + __netif_tx_lock_bh(q->txq); reclaim_completed_tx(adap, &q->q, -1, true); credits = txq_avail(&q->q) - ndesc; if (unlikely(credits < 0)) { - __netif_tx_unlock(q->txq); + __netif_tx_unlock_bh(q->txq); return -ENOMEM; } @@ -2710,7 +2710,7 @@ int cxgb4_selftest_lb_pkt(struct net_device *netdev) init_completion(&lb->completion); txq_advance(&q->q, ndesc); cxgb4_ring_tx_db(adap, &q->q, ndesc); - __netif_tx_unlock(q->txq); + __netif_tx_unlock_bh(q->txq); /* wait for the pkt to return */ ret = wait_for_completion_timeout(&lb->completion, 10 * HZ); From b9a61c20179fda7bdfe2c1210aa72451991ab81a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 29 Apr 2024 15:38:32 +0200 Subject: [PATCH 31/45] net: dsa: mv88e6xxx: Fix number of databases for 88E6141 / 88E6341 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Topaz family (88E6141 and 88E6341) only support 256 Forwarding Information Tables. Fixes: a75961d0ebfd ("net: dsa: mv88e6xxx: Add support for ethernet switch 88E6341") Fixes: 1558727a1c1b ("net: dsa: mv88e6xxx: Add support for ethernet switch 88E6141") Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20240429133832.9547-1-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 59b5dd0e2f41d2..14daf432f30b5c 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5705,7 +5705,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .prod_num = MV88E6XXX_PORT_SWITCH_ID_PROD_6141, .family = MV88E6XXX_FAMILY_6341, .name = "Marvell 88E6141", - .num_databases = 4096, + .num_databases = 256, .num_macs = 2048, .num_ports = 6, .num_internal_phys = 5, @@ -6164,7 +6164,7 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .prod_num = MV88E6XXX_PORT_SWITCH_ID_PROD_6341, .family = MV88E6XXX_FAMILY_6341, .name = "Marvell 88E6341", - .num_databases = 4096, + .num_databases = 256, .num_macs = 2048, .num_internal_phys = 5, .num_ports = 6, From 387f295cb2150ed164905b648d76dfcbd3621778 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Mon, 29 Apr 2024 10:10:40 -0700 Subject: [PATCH 32/45] e1000e: change usleep_range to udelay in PHY mdic access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a partial revert of commit 6dbdd4de0362 ("e1000e: Workaround for sporadic MDI error on Meteor Lake systems"). The referenced commit used usleep_range inside the PHY access routines, which are sometimes called from an atomic context. This can lead to a kernel panic in some scenarios, such as cable disconnection and reconnection on vPro systems. Solve this by changing the usleep_range calls back to udelay. Fixes: 6dbdd4de0362 ("e1000e: Workaround for sporadic MDI error on Meteor Lake systems") Cc: stable@vger.kernel.org Reported-by: Jérôme Carretero Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218740 Closes: https://lore.kernel.org/lkml/a7eb665c74b5efb5140e6979759ed243072cb24a.camel@zougloub.eu/ Co-developed-by: Sasha Neftin Signed-off-by: Sasha Neftin Signed-off-by: Vitaly Lifshits Tested-by: Dima Ruinskiy Signed-off-by: Tony Nguyen Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240429171040.1152516-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/e1000e/phy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c index 93544f1cc2a51b..f7ae0e0aa4a4f5 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.c +++ b/drivers/net/ethernet/intel/e1000e/phy.c @@ -157,7 +157,7 @@ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) * the lower time out */ for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - usleep_range(50, 60); + udelay(50); mdic = er32(MDIC); if (mdic & E1000_MDIC_READY) break; @@ -181,7 +181,7 @@ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) * reading duplicate data in the next MDIC transaction. */ if (hw->mac.type == e1000_pch2lan) - usleep_range(100, 150); + udelay(100); if (success) { *data = (u16)mdic; @@ -237,7 +237,7 @@ s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data) * the lower time out */ for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - usleep_range(50, 60); + udelay(50); mdic = er32(MDIC); if (mdic & E1000_MDIC_READY) break; @@ -261,7 +261,7 @@ s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data) * reading duplicate data in the next MDIC transaction. */ if (hw->mac.type == e1000_pch2lan) - usleep_range(100, 150); + udelay(100); if (success) return 0; From fb7a0d334894206ae35f023a82cad5a290fd7386 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 29 Apr 2024 20:00:31 +0200 Subject: [PATCH 33/45] mptcp: ensure snd_nxt is properly initialized on connect Christoph reported a splat hinting at a corrupted snd_una: WARNING: CPU: 1 PID: 38 at net/mptcp/protocol.c:1005 __mptcp_clean_una+0x4b3/0x620 net/mptcp/protocol.c:1005 Modules linked in: CPU: 1 PID: 38 Comm: kworker/1:1 Not tainted 6.9.0-rc1-gbbeac67456c9 #59 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 Workqueue: events mptcp_worker RIP: 0010:__mptcp_clean_una+0x4b3/0x620 net/mptcp/protocol.c:1005 Code: be 06 01 00 00 bf 06 01 00 00 e8 a8 12 e7 fe e9 00 fe ff ff e8 8e 1a e7 fe 0f b7 ab 3e 02 00 00 e9 d3 fd ff ff e8 7d 1a e7 fe <0f> 0b 4c 8b bb e0 05 00 00 e9 74 fc ff ff e8 6a 1a e7 fe 0f 0b e9 RSP: 0018:ffffc9000013fd48 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff8881029bd280 RCX: ffffffff82382fe4 RDX: ffff8881003cbd00 RSI: ffffffff823833c3 RDI: 0000000000000001 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: fefefefefefefeff R12: ffff888138ba8000 R13: 0000000000000106 R14: ffff8881029bd908 R15: ffff888126560000 FS: 0000000000000000(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f604a5dae38 CR3: 0000000101dac002 CR4: 0000000000170ef0 Call Trace: __mptcp_clean_una_wakeup net/mptcp/protocol.c:1055 [inline] mptcp_clean_una_wakeup net/mptcp/protocol.c:1062 [inline] __mptcp_retrans+0x7f/0x7e0 net/mptcp/protocol.c:2615 mptcp_worker+0x434/0x740 net/mptcp/protocol.c:2767 process_one_work+0x1e0/0x560 kernel/workqueue.c:3254 process_scheduled_works kernel/workqueue.c:3335 [inline] worker_thread+0x3c7/0x640 kernel/workqueue.c:3416 kthread+0x121/0x170 kernel/kthread.c:388 ret_from_fork+0x44/0x50 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:243 When fallback to TCP happens early on a client socket, snd_nxt is not yet initialized and any incoming ack will copy such value into snd_una. If the mptcp worker (dumbly) tries mptcp-level re-injection after such ack, that would unconditionally trigger a send buffer cleanup using 'bad' snd_una values. We could easily disable re-injection for fallback sockets, but such dumb behavior already helped catching a few subtle issues and a very low to zero impact in practice. Instead address the issue always initializing snd_nxt (and write_seq, for consistency) at connect time. Fixes: 8fd738049ac3 ("mptcp: fallback in case of simultaneous connect") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/485 Tested-by: Christoph Paasch Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240429-upstream-net-20240429-mptcp-snd_nxt-init-connect-v1-1-59ceac0a7dcb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7e74b812e366ae..965eb69dc5de32 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3723,6 +3723,9 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } + + WRITE_ONCE(msk->write_seq, subflow->idsn); + WRITE_ONCE(msk->snd_nxt, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); From 59c878cbcdd80ed39315573b3511d0acfd3501b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 27 Apr 2024 20:24:18 +0200 Subject: [PATCH 34/45] net: bridge: fix multicast-to-unicast with fraglist GSO Calling skb_copy on a SKB_GSO_FRAGLIST skb is not valid, since it returns an invalid linearized skb. This code only needs to change the ethernet header, so pskb_copy is the right function to call here. Fixes: 6db6f0eae605 ("bridge: multicast to unicast") Signed-off-by: Felix Fietkau Acked-by: Paolo Abeni Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 7431f89e897b95..d7c35f55bd69fb 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -266,7 +266,7 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, if (skb->dev == p->dev && ether_addr_equal(src, addr)) return; - skb = skb_copy(skb, GFP_ATOMIC); + skb = pskb_copy(skb, GFP_ATOMIC); if (!skb) { DEV_STATS_INC(dev, tx_dropped); return; From d091e579b864fa790dd6a0cd537a22c383126681 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 27 Apr 2024 20:24:19 +0200 Subject: [PATCH 35/45] net: core: reject skb_copy(_expand) for fraglist GSO skbs SKB_GSO_FRAGLIST skbs must not be linearized, otherwise they become invalid. Return NULL if such an skb is passed to skb_copy or skb_copy_expand, in order to prevent a crash on a potential later call to skb_gso_segment. Fixes: 3a1296a38d0c ("net: Support GRO/GSO fraglist chaining.") Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- net/core/skbuff.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b99127712e6704..4096e679f61c76 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2123,11 +2123,17 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb_headroom(skb); - unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = __alloc_skb(size, gfp_mask, - skb_alloc_rx_flag(skb), NUMA_NO_NODE); + struct sk_buff *n; + unsigned int size; + int headerlen; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + headerlen = skb_headroom(skb); + size = skb_end_offset(skb) + skb->data_len; + n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -2455,12 +2461,17 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask, skb_alloc_rx_flag(skb), - NUMA_NO_NODE); - int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; + struct sk_buff *n; + int oldheadroom; + + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) + return NULL; + oldheadroom = skb_headroom(skb); + n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) return NULL; From 8953285d7bd63c12b007432a9b4587fa2fad49fb Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Fri, 19 Apr 2024 13:30:57 -0300 Subject: [PATCH 36/45] rxrpc: Clients must accept conn from any address The find connection logic of Transarc's Rx was modified in the mid-1990s to support multi-homed servers which might send a response packet from an address other than the destination address in the received packet. The rules for accepting a packet by an Rx initiator (RX_CLIENT_CONNECTION) were altered to permit acceptance of a packet from any address provided that the port number was unchanged and all of the connection identifiers matched (Epoch, CID, SecurityClass, ...). This change applies the same rules to the Linux implementation which makes it consistent with IBM AFS 3.6, Arla, OpenAFS and AuriStorFS. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: Jeffrey Altman Acked-by: David Howells Signed-off-by: Marc Dionne Link: https://lore.kernel.org/r/20240419163057.4141728-1-marc.dionne@auristor.com Signed-off-by: Jakub Kicinski --- net/rxrpc/conn_object.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 0af4642aeec4bc..1539d315afe74a 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -119,18 +119,13 @@ struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *lo switch (srx->transport.family) { case AF_INET: if (peer->srx.transport.sin.sin_port != - srx->transport.sin.sin_port || - peer->srx.transport.sin.sin_addr.s_addr != - srx->transport.sin.sin_addr.s_addr) + srx->transport.sin.sin_port) goto not_found; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (peer->srx.transport.sin6.sin6_port != - srx->transport.sin6.sin6_port || - memcmp(&peer->srx.transport.sin6.sin6_addr, - &srx->transport.sin6.sin6_addr, - sizeof(struct in6_addr)) != 0) + srx->transport.sin6.sin6_port) goto not_found; break; #endif From 080cbb890286cd794f1ee788bbc5463e2deb7c2b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Apr 2024 15:53:37 +0200 Subject: [PATCH 37/45] tipc: fix UAF in error path Sam Page (sam4k) working with Trend Micro Zero Day Initiative reported a UAF in the tipc_buf_append() error path: BUG: KASAN: slab-use-after-free in kfree_skb_list_reason+0x47e/0x4c0 linux/net/core/skbuff.c:1183 Read of size 8 at addr ffff88804d2a7c80 by task poc/8034 CPU: 1 PID: 8034 Comm: poc Not tainted 6.8.2 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014 Call Trace: __dump_stack linux/lib/dump_stack.c:88 dump_stack_lvl+0xd9/0x1b0 linux/lib/dump_stack.c:106 print_address_description linux/mm/kasan/report.c:377 print_report+0xc4/0x620 linux/mm/kasan/report.c:488 kasan_report+0xda/0x110 linux/mm/kasan/report.c:601 kfree_skb_list_reason+0x47e/0x4c0 linux/net/core/skbuff.c:1183 skb_release_data+0x5af/0x880 linux/net/core/skbuff.c:1026 skb_release_all linux/net/core/skbuff.c:1094 __kfree_skb linux/net/core/skbuff.c:1108 kfree_skb_reason+0x12d/0x210 linux/net/core/skbuff.c:1144 kfree_skb linux/./include/linux/skbuff.h:1244 tipc_buf_append+0x425/0xb50 linux/net/tipc/msg.c:186 tipc_link_input+0x224/0x7c0 linux/net/tipc/link.c:1324 tipc_link_rcv+0x76e/0x2d70 linux/net/tipc/link.c:1824 tipc_rcv+0x45f/0x10f0 linux/net/tipc/node.c:2159 tipc_udp_recv+0x73b/0x8f0 linux/net/tipc/udp_media.c:390 udp_queue_rcv_one_skb+0xad2/0x1850 linux/net/ipv4/udp.c:2108 udp_queue_rcv_skb+0x131/0xb00 linux/net/ipv4/udp.c:2186 udp_unicast_rcv_skb+0x165/0x3b0 linux/net/ipv4/udp.c:2346 __udp4_lib_rcv+0x2594/0x3400 linux/net/ipv4/udp.c:2422 ip_protocol_deliver_rcu+0x30c/0x4e0 linux/net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x2e4/0x520 linux/net/ipv4/ip_input.c:233 NF_HOOK linux/./include/linux/netfilter.h:314 NF_HOOK linux/./include/linux/netfilter.h:308 ip_local_deliver+0x18e/0x1f0 linux/net/ipv4/ip_input.c:254 dst_input linux/./include/net/dst.h:461 ip_rcv_finish linux/net/ipv4/ip_input.c:449 NF_HOOK linux/./include/linux/netfilter.h:314 NF_HOOK linux/./include/linux/netfilter.h:308 ip_rcv+0x2c5/0x5d0 linux/net/ipv4/ip_input.c:569 __netif_receive_skb_one_core+0x199/0x1e0 linux/net/core/dev.c:5534 __netif_receive_skb+0x1f/0x1c0 linux/net/core/dev.c:5648 process_backlog+0x101/0x6b0 linux/net/core/dev.c:5976 __napi_poll.constprop.0+0xba/0x550 linux/net/core/dev.c:6576 napi_poll linux/net/core/dev.c:6645 net_rx_action+0x95a/0xe90 linux/net/core/dev.c:6781 __do_softirq+0x21f/0x8e7 linux/kernel/softirq.c:553 do_softirq linux/kernel/softirq.c:454 do_softirq+0xb2/0xf0 linux/kernel/softirq.c:441 __local_bh_enable_ip+0x100/0x120 linux/kernel/softirq.c:381 local_bh_enable linux/./include/linux/bottom_half.h:33 rcu_read_unlock_bh linux/./include/linux/rcupdate.h:851 __dev_queue_xmit+0x871/0x3ee0 linux/net/core/dev.c:4378 dev_queue_xmit linux/./include/linux/netdevice.h:3169 neigh_hh_output linux/./include/net/neighbour.h:526 neigh_output linux/./include/net/neighbour.h:540 ip_finish_output2+0x169f/0x2550 linux/net/ipv4/ip_output.c:235 __ip_finish_output linux/net/ipv4/ip_output.c:313 __ip_finish_output+0x49e/0x950 linux/net/ipv4/ip_output.c:295 ip_finish_output+0x31/0x310 linux/net/ipv4/ip_output.c:323 NF_HOOK_COND linux/./include/linux/netfilter.h:303 ip_output+0x13b/0x2a0 linux/net/ipv4/ip_output.c:433 dst_output linux/./include/net/dst.h:451 ip_local_out linux/net/ipv4/ip_output.c:129 ip_send_skb+0x3e5/0x560 linux/net/ipv4/ip_output.c:1492 udp_send_skb+0x73f/0x1530 linux/net/ipv4/udp.c:963 udp_sendmsg+0x1a36/0x2b40 linux/net/ipv4/udp.c:1250 inet_sendmsg+0x105/0x140 linux/net/ipv4/af_inet.c:850 sock_sendmsg_nosec linux/net/socket.c:730 __sock_sendmsg linux/net/socket.c:745 __sys_sendto+0x42c/0x4e0 linux/net/socket.c:2191 __do_sys_sendto linux/net/socket.c:2203 __se_sys_sendto linux/net/socket.c:2199 __x64_sys_sendto+0xe0/0x1c0 linux/net/socket.c:2199 do_syscall_x64 linux/arch/x86/entry/common.c:52 do_syscall_64+0xd8/0x270 linux/arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6f/0x77 linux/arch/x86/entry/entry_64.S:120 RIP: 0033:0x7f3434974f29 Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 37 8f 0d 00 f7 d8 64 89 01 48 RSP: 002b:00007fff9154f2b8 EFLAGS: 00000212 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3434974f29 RDX: 00000000000032c8 RSI: 00007fff9154f300 RDI: 0000000000000003 RBP: 00007fff915532e0 R08: 00007fff91553360 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000212 R12: 000055ed86d261d0 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 In the critical scenario, either the relevant skb is freed or its ownership is transferred into a frag_lists. In both cases, the cleanup code must not free it again: we need to clear the skb reference earlier. Fixes: 1149557d64c9 ("tipc: eliminate unnecessary linearization of incoming buffers") Cc: stable@vger.kernel.org Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-23852 Acked-by: Xin Long Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/752f1ccf762223d109845365d07f55414058e5a3.1714484273.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/tipc/msg.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 5c9fd4791c4ba1..9a6e9bcbf69402 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -156,6 +156,11 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (!head) goto err; + /* Either the input skb ownership is transferred to headskb + * or the input skb is freed, clear the reference to avoid + * bad access on error path. + */ + *buf = NULL; if (skb_try_coalesce(head, frag, &headstolen, &delta)) { kfree_skb_partial(frag, headstolen); } else { @@ -179,7 +184,6 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) *headbuf = NULL; return 1; } - *buf = NULL; return 0; err: kfree_skb(*buf); From 97bf6f81b29a8efaf5d0983251a7450e5794370d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 30 Apr 2024 10:03:38 -0400 Subject: [PATCH 38/45] tipc: fix a possible memleak in tipc_buf_append __skb_linearize() doesn't free the skb when it fails, so move '*buf = NULL' after __skb_linearize(), so that the skb can be freed on the err path. Fixes: b7df21cf1b79 ("tipc: skb_linearize the head skb when reassembling msgs") Reported-by: Paolo Abeni Signed-off-by: Xin Long Reviewed-by: Simon Horman Reviewed-by: Tung Nguyen Link: https://lore.kernel.org/r/90710748c29a1521efac4f75ea01b3b7e61414cf.1714485818.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/tipc/msg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 9a6e9bcbf69402..76284fc538ebdd 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -142,9 +142,9 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - *buf = NULL; if (skb_has_frag_list(frag) && __skb_linearize(frag)) goto err; + *buf = NULL; frag = skb_unshare(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; From f7789419137b18e3847d0cc41afd788c3c00663d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 30 Apr 2024 18:50:13 +0200 Subject: [PATCH 39/45] vxlan: Pull inner IP header in vxlan_rcv(). Ensure the inner IP header is part of skb's linear data before reading its ECN bits. Otherwise we might read garbage. One symptom is the system erroneously logging errors like "vxlan: non-ECT from xxx.xxx.xxx.xxx with TOS=xxxx". Similar bugs have been fixed in geneve, ip_tunnel and ip6_tunnel (see commit 1ca1ba465e55 ("geneve: make sure to pull inner header in geneve_rx()") for example). So let's reuse the same code structure for consistency. Maybe we'll can add a common helper in the future. Fixes: d342894c5d2f ("vxlan: virtual extensible lan") Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Reviewed-by: Sabrina Dubroca Link: https://lore.kernel.org/r/1239c8db54efec341dd6455c77e0380f58923a3c.1714495737.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c9e4e03ad214f3..3a9148fb1422ba 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1674,6 +1674,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) bool raw_proto = false; void *oiph; __be32 vni = 0; + int nh; /* Need UDP and VXLAN header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -1762,9 +1763,25 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; } - oiph = skb_network_header(skb); + /* Save offset of outer header relative to skb->head, + * because we are going to reset the network header to the inner header + * and might change skb->head. + */ + nh = skb_network_header(skb) - skb->head; + skb_reset_network_header(skb); + if (!pskb_inet_may_pull(skb)) { + DEV_STATS_INC(vxlan->dev, rx_length_errors); + DEV_STATS_INC(vxlan->dev, rx_errors); + vxlan_vnifilter_count(vxlan, vni, vninode, + VXLAN_VNI_STATS_RX_ERRORS, 0); + goto drop; + } + + /* Get the outer header. */ + oiph = skb->head + nh; + if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { DEV_STATS_INC(vxlan->dev, rx_frame_errors); DEV_STATS_INC(vxlan->dev, rx_errors); From 8a2e4d37afb8500b276e5ee903dee06f50ab0494 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Tue, 30 Apr 2024 11:10:04 +0200 Subject: [PATCH 40/45] s390/qeth: Fix kernel panic after setting hsuid Symptom: When the hsuid attribute is set for the first time on an IQD Layer3 device while the corresponding network interface is already UP, the kernel will try to execute a napi function pointer that is NULL. Example: --------------------------------------------------------------------------- [ 2057.572696] illegal operation: 0001 ilc:1 [#1] SMP [ 2057.572702] Modules linked in: af_iucv qeth_l3 zfcp scsi_transport_fc sunrpc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nf_tables_set nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables libcrc32c nfnetlink ghash_s390 prng xts aes_s390 des_s390 de s_generic sha3_512_s390 sha3_256_s390 sha512_s390 vfio_ccw vfio_mdev mdev vfio_iommu_type1 eadm_sch vfio ext4 mbcache jbd2 qeth_l2 bridge stp llc dasd_eckd_mod qeth dasd_mod qdio ccwgroup pkey zcrypt [ 2057.572739] CPU: 6 PID: 60182 Comm: stress_client Kdump: loaded Not tainted 4.18.0-541.el8.s390x #1 [ 2057.572742] Hardware name: IBM 3931 A01 704 (LPAR) [ 2057.572744] Krnl PSW : 0704f00180000000 0000000000000002 (0x2) [ 2057.572748] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:3 PM:0 RI:0 EA:3 [ 2057.572751] Krnl GPRS: 0000000000000004 0000000000000000 00000000a3b008d8 0000000000000000 [ 2057.572754] 00000000a3b008d8 cb923a29c779abc5 0000000000000000 00000000814cfd80 [ 2057.572756] 000000000000012c 0000000000000000 00000000a3b008d8 00000000a3b008d8 [ 2057.572758] 00000000bab6d500 00000000814cfd80 0000000091317e46 00000000814cfc68 [ 2057.572762] Krnl Code:#0000000000000000: 0000 illegal >0000000000000002: 0000 illegal 0000000000000004: 0000 illegal 0000000000000006: 0000 illegal 0000000000000008: 0000 illegal 000000000000000a: 0000 illegal 000000000000000c: 0000 illegal 000000000000000e: 0000 illegal [ 2057.572800] Call Trace: [ 2057.572801] ([<00000000ec639700>] 0xec639700) [ 2057.572803] [<00000000913183e2>] net_rx_action+0x2ba/0x398 [ 2057.572809] [<0000000091515f76>] __do_softirq+0x11e/0x3a0 [ 2057.572813] [<0000000090ce160c>] do_softirq_own_stack+0x3c/0x58 [ 2057.572817] ([<0000000090d2cbd6>] do_softirq.part.1+0x56/0x60) [ 2057.572822] [<0000000090d2cc60>] __local_bh_enable_ip+0x80/0x98 [ 2057.572825] [<0000000091314706>] __dev_queue_xmit+0x2be/0xd70 [ 2057.572827] [<000003ff803dd6d6>] afiucv_hs_send+0x24e/0x300 [af_iucv] [ 2057.572830] [<000003ff803dd88a>] iucv_send_ctrl+0x102/0x138 [af_iucv] [ 2057.572833] [<000003ff803de72a>] iucv_sock_connect+0x37a/0x468 [af_iucv] [ 2057.572835] [<00000000912e7e90>] __sys_connect+0xa0/0xd8 [ 2057.572839] [<00000000912e9580>] sys_socketcall+0x228/0x348 [ 2057.572841] [<0000000091514e1a>] system_call+0x2a6/0x2c8 [ 2057.572843] Last Breaking-Event-Address: [ 2057.572844] [<0000000091317e44>] __napi_poll+0x4c/0x1d8 [ 2057.572846] [ 2057.572847] Kernel panic - not syncing: Fatal exception in interrupt ------------------------------------------------------------------------------------------- Analysis: There is one napi structure per out_q: card->qdio.out_qs[i].napi The napi.poll functions are set during qeth_open(). Since commit 1cfef80d4c2b ("s390/qeth: Don't call dev_close/dev_open (DOWN/UP)") qeth_set_offline()/qeth_set_online() no longer call dev_close()/ dev_open(). So if qeth_free_qdio_queues() cleared card->qdio.out_qs[i].napi.poll while the network interface was UP and the card was offline, they are not set again. Reproduction: chzdev -e $devno layer2=0 ip link set dev $network_interface up echo 0 > /sys/bus/ccwgroup/devices/0.0.$devno/online echo foo > /sys/bus/ccwgroup/devices/0.0.$devno/hsuid echo 1 > /sys/bus/ccwgroup/devices/0.0.$devno/online -> Crash (can be enforced e.g. by af_iucv connect(), ip link down/up, ...) Note that a Completion Queue (CQ) is only enabled or disabled, when hsuid is set for the first time or when it is removed. Workarounds: - Set hsuid before setting the device online for the first time or - Use chzdev -d $devno; chzdev $devno hsuid=xxx; chzdev -e $devno; to set hsuid on an existing device. (this will remove and recreate the network interface) Fix: There is no need to free the output queues when a completion queue is added or removed. card->qdio.state now indicates whether the inbound buffer pool and the outbound queues are allocated. card->qdio.c_q indicates whether a CQ is allocated. Fixes: 1cfef80d4c2b ("s390/qeth: Don't call dev_close/dev_open (DOWN/UP)") Signed-off-by: Alexandra Winter Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240430091004.2265683-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/qeth_core_main.c | 61 ++++++++++++++----------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index f0b8b709649f29..a3adaec5504e45 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -364,30 +364,33 @@ static int qeth_cq_init(struct qeth_card *card) return rc; } +static void qeth_free_cq(struct qeth_card *card) +{ + if (card->qdio.c_q) { + qeth_free_qdio_queue(card->qdio.c_q); + card->qdio.c_q = NULL; + } +} + static int qeth_alloc_cq(struct qeth_card *card) { if (card->options.cq == QETH_CQ_ENABLED) { QETH_CARD_TEXT(card, 2, "cqon"); - card->qdio.c_q = qeth_alloc_qdio_queue(); if (!card->qdio.c_q) { - dev_err(&card->gdev->dev, "Failed to create completion queue\n"); - return -ENOMEM; + card->qdio.c_q = qeth_alloc_qdio_queue(); + if (!card->qdio.c_q) { + dev_err(&card->gdev->dev, + "Failed to create completion queue\n"); + return -ENOMEM; + } } } else { QETH_CARD_TEXT(card, 2, "nocq"); - card->qdio.c_q = NULL; + qeth_free_cq(card); } return 0; } -static void qeth_free_cq(struct qeth_card *card) -{ - if (card->qdio.c_q) { - qeth_free_qdio_queue(card->qdio.c_q); - card->qdio.c_q = NULL; - } -} - static enum iucv_tx_notify qeth_compute_cq_notification(int sbalf15, int delayed) { @@ -2628,6 +2631,10 @@ static int qeth_alloc_qdio_queues(struct qeth_card *card) QETH_CARD_TEXT(card, 2, "allcqdbf"); + /* completion */ + if (qeth_alloc_cq(card)) + goto out_err; + if (atomic_cmpxchg(&card->qdio.state, QETH_QDIO_UNINITIALIZED, QETH_QDIO_ALLOCATED) != QETH_QDIO_UNINITIALIZED) return 0; @@ -2663,10 +2670,6 @@ static int qeth_alloc_qdio_queues(struct qeth_card *card) queue->priority = QETH_QIB_PQUE_PRIO_DEFAULT; } - /* completion */ - if (qeth_alloc_cq(card)) - goto out_freeoutq; - return 0; out_freeoutq: @@ -2677,6 +2680,8 @@ static int qeth_alloc_qdio_queues(struct qeth_card *card) qeth_free_buffer_pool(card); out_buffer_pool: atomic_set(&card->qdio.state, QETH_QDIO_UNINITIALIZED); + qeth_free_cq(card); +out_err: return -ENOMEM; } @@ -2684,11 +2689,12 @@ static void qeth_free_qdio_queues(struct qeth_card *card) { int i, j; + qeth_free_cq(card); + if (atomic_xchg(&card->qdio.state, QETH_QDIO_UNINITIALIZED) == QETH_QDIO_UNINITIALIZED) return; - qeth_free_cq(card); for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j) { if (card->qdio.in_q->bufs[j].rx_skb) { consume_skb(card->qdio.in_q->bufs[j].rx_skb); @@ -3742,24 +3748,11 @@ static void qeth_qdio_poll(struct ccw_device *cdev, unsigned long card_ptr) int qeth_configure_cq(struct qeth_card *card, enum qeth_cq cq) { - int rc; - - if (card->options.cq == QETH_CQ_NOTAVAILABLE) { - rc = -1; - goto out; - } else { - if (card->options.cq == cq) { - rc = 0; - goto out; - } - - qeth_free_qdio_queues(card); - card->options.cq = cq; - rc = 0; - } -out: - return rc; + if (card->options.cq == QETH_CQ_NOTAVAILABLE) + return -1; + card->options.cq = cq; + return 0; } EXPORT_SYMBOL_GPL(qeth_configure_cq); From fc1092f51567277509563800a3c56732070b6aa4 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Tue, 30 Apr 2024 21:39:45 +0900 Subject: [PATCH 41/45] ipv4: Fix uninit-value access in __ip_make_skb() KMSAN reported uninit-value access in __ip_make_skb() [1]. __ip_make_skb() tests HDRINCL to know if the skb has icmphdr. However, HDRINCL can cause a race condition. If calling setsockopt(2) with IP_HDRINCL changes HDRINCL while __ip_make_skb() is running, the function will access icmphdr in the skb even if it is not included. This causes the issue reported by KMSAN. Check FLOWI_FLAG_KNOWN_NH on fl4->flowi4_flags instead of testing HDRINCL on the socket. Also, fl4->fl4_icmp_type and fl4->fl4_icmp_code are not initialized. These are union in struct flowi4 and are implicitly initialized by flowi4_init_output(), but we should not rely on specific union layout. Initialize these explicitly in raw_sendmsg(). [1] BUG: KMSAN: uninit-value in __ip_make_skb+0x2b74/0x2d20 net/ipv4/ip_output.c:1481 __ip_make_skb+0x2b74/0x2d20 net/ipv4/ip_output.c:1481 ip_finish_skb include/net/ip.h:243 [inline] ip_push_pending_frames+0x4c/0x5c0 net/ipv4/ip_output.c:1508 raw_sendmsg+0x2381/0x2690 net/ipv4/raw.c:654 inet_sendmsg+0x27b/0x2a0 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x274/0x3c0 net/socket.c:745 __sys_sendto+0x62c/0x7b0 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x130/0x200 net/socket.c:2199 do_syscall_64+0xd8/0x1f0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x5f6/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35a/0x7c0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] __ip_append_data+0x49ab/0x68c0 net/ipv4/ip_output.c:1128 ip_append_data+0x1e7/0x260 net/ipv4/ip_output.c:1365 raw_sendmsg+0x22b1/0x2690 net/ipv4/raw.c:648 inet_sendmsg+0x27b/0x2a0 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x274/0x3c0 net/socket.c:745 __sys_sendto+0x62c/0x7b0 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x130/0x200 net/socket.c:2199 do_syscall_64+0xd8/0x1f0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 1 PID: 15709 Comm: syz-executor.7 Not tainted 6.8.0-11567-gb3603fcb79b1 #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-1.fc39 04/01/2014 Fixes: 99e5acae193e ("ipv4: Fix potential uninit variable access bug in __ip_make_skb()") Reported-by: syzkaller Signed-off-by: Shigeru Yoshida Link: https://lore.kernel.org/r/20240430123945.2057348-1-syoshida@redhat.com Signed-off-by: Paolo Abeni --- net/ipv4/ip_output.c | 2 +- net/ipv4/raw.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1fe794967211e2..39229fd0601a11 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1473,7 +1473,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * by icmp_hdr(skb)->type. */ if (sk->sk_type == SOCK_RAW && - !inet_test_bit(HDRINCL, sk)) + !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dcb11f22cbf2b4..4cb43401e0e06c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -612,6 +612,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); + fl4.fl4_icmp_type = 0; + fl4.fl4_icmp_code = 0; + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; From 5ef31ea5d053a8f493a772ebad3f3ce82c35d845 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 30 Apr 2024 16:35:54 +0200 Subject: [PATCH 42/45] net: gro: fix udp bad offset in socket lookup by adding {inner_}network_offset to napi_gro_cb Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp: additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the complete phase of gro. The functions always return skb->network_header, which in the case of encapsulated packets at the gro complete phase, is always set to the innermost L3 of the packet. That means that calling {ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_ L3/L4 may return an unexpected value. This incorrect usage leads to a bug in GRO's UDP socket lookup. udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These *_hdr functions return network_header which will point to the innermost L3, resulting in the wrong offset being used in __udp{4,6}_lib_lookup with encapsulated packets. This patch adds network_offset and inner_network_offset to napi_gro_cb, and makes sure both are set correctly. To fix the issue, network_offsets union is used inside napi_gro_cb, in which both the outer and the inner network offsets are saved. Reproduction example: Endpoint configuration example (fou + local address bind) # ip fou add port 6666 ipproto 4 # ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip # ip link set tun1 up # ip a add 1.1.1.2/24 dev tun1 Netperf TCP_STREAM result on net-next before patch is applied: net-next main, GRO enabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.28 2.37 net-next main, GRO disabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.01 2745.06 patch applied, GRO enabled: $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5 Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 131072 16384 16384 5.01 2877.38 Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket") Signed-off-by: Richard Gobert Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- include/net/gro.h | 9 +++++++++ net/8021q/vlan_core.c | 2 ++ net/core/gro.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/udp.c | 3 ++- net/ipv4/udp_offload.c | 3 ++- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp.c | 3 ++- net/ipv6/udp_offload.c | 3 ++- 9 files changed, 22 insertions(+), 4 deletions(-) diff --git a/include/net/gro.h b/include/net/gro.h index 50f1e403dbbb38..c1d4ca0463a178 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -87,6 +87,15 @@ struct napi_gro_cb { /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; + + /* L3 offsets */ + union { + struct { + u16 network_offset; + u16 inner_network_offset; + }; + u16 network_offsets[2]; + }; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index f001582345052f..9404dd551dfd28 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -478,6 +478,8 @@ static struct sk_buff *vlan_gro_receive(struct list_head *head, if (unlikely(!vhdr)) goto out; + NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen; + type = vhdr->h_vlan_encapsulated_proto; ptype = gro_find_receive_by_type(type); diff --git a/net/core/gro.c b/net/core/gro.c index 83f35d99a682c2..c7901253a1a8fc 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -371,6 +371,7 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) const skb_frag_t *frag0; unsigned int headlen; + NAPI_GRO_CB(skb)->network_offset = 0; NAPI_GRO_CB(skb)->data_offset = 0; headlen = skb_headlen(skb); NAPI_GRO_CB(skb)->frag0 = skb->data; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 55bd72997b3106..fafb123f798be3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1572,6 +1572,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) /* The above will be needed by the transport layer if there is one * immediately following this IP hdr. */ + NAPI_GRO_CB(skb)->inner_network_offset = off; /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 420905be5f30c9..b32cf2eeeb41d1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -532,7 +532,8 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { - const struct iphdr *iph = ip_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); int iif, sdif; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 3498dd1d0694dc..fd29d21d579c18 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -718,7 +718,8 @@ EXPORT_SYMBOL(udp_gro_complete); INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) { - const struct iphdr *iph = ip_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); /* do fraglist only if there is no outer UDP encap (or we already processed it) */ diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index b41e35af69ea28..c8b909a9904f32 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -237,6 +237,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, goto out; skb_set_network_header(skb, off); + NAPI_GRO_CB(skb)->inner_network_offset = off; flush += ntohs(iph->payload_len) != skb->len - hlen; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1a4cccdd40c9ca..8f7aa8bac1e7b1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -272,7 +272,8 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); int iif, sdif; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index bbd347de00b450..b41152dd424697 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -164,7 +164,8 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) { - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); /* do fraglist only if there is no outer UDP encap (or we already processed it) */ From 5babae777c61aa8a8679d59d3cdc54165ad96d42 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 30 Apr 2024 16:35:55 +0200 Subject: [PATCH 43/45] net: gro: add flush check in udp_gro_receive_segment GRO-GSO path is supposed to be transparent and as such L3 flush checks are relevant to all UDP flows merging in GRO. This patch uses the same logic and code from tcp_gro_receive, terminating merge if flush is non zero. Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- net/ipv4/udp_offload.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index fd29d21d579c18..8721fe5beca2be 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -471,6 +471,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *p; unsigned int ulen; int ret = 0; + int flush; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -504,13 +505,22 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return p; } + flush = NAPI_GRO_CB(p)->flush; + + if (NAPI_GRO_CB(p)->flush_id != 1 || + NAPI_GRO_CB(p)->count != 1 || + !NAPI_GRO_CB(p)->is_atomic) + flush |= NAPI_GRO_CB(p)->flush_id; + else + NAPI_GRO_CB(p)->is_atomic = false; + /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ - if (ulen > ntohs(uh2->len)) { + if (ulen > ntohs(uh2->len) || flush) { pp = p; } else { if (NAPI_GRO_CB(skb)->is_flist) { From c9ccbcd9f1995e6aa1578220f86c96f57be529d7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 30 Apr 2024 16:33:05 -0700 Subject: [PATCH 44/45] MAINTAINERS: remove Ariel Elior aelior@marvell.com bounces, we haven't seen Ariel on lore since March 2022. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240430233305.1356105-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- MAINTAINERS | 3 --- 1 file changed, 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4a2e864fce5168..0261a807727423 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4193,7 +4193,6 @@ S: Supported F: drivers/scsi/bnx2i/ BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER -M: Ariel Elior M: Sudarsana Kalluru M: Manish Chopra L: netdev@vger.kernel.org @@ -17998,7 +17997,6 @@ S: Supported F: drivers/scsi/qedi/ QLOGIC QL4xxx ETHERNET DRIVER -M: Ariel Elior M: Manish Chopra L: netdev@vger.kernel.org S: Supported @@ -18008,7 +18006,6 @@ F: include/linux/qed/ QLOGIC QL4xxx RDMA DRIVER M: Michal Kalderon -M: Ariel Elior L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/qedr/ From 78cfe547607a83de60cd25304fa2422777634712 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 30 Apr 2024 16:35:32 -0700 Subject: [PATCH 45/45] MAINTAINERS: mark MYRICOM MYRI-10G as Orphan Chris's email address bounces and lore hasn't seen an email from anyone with his name for almost a decade. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240430233532.1356982-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0261a807727423..157135b470fa45 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15161,9 +15161,8 @@ F: drivers/scsi/myrb.* F: drivers/scsi/myrs.* MYRICOM MYRI-10G 10GbE DRIVER (MYRI10GE) -M: Chris Lee L: netdev@vger.kernel.org -S: Supported +S: Orphan W: https://www.cspi.com/ethernet-products/support/downloads/ F: drivers/net/ethernet/myricom/myri10ge/