From e11ce03b58743bf1e096c48fcaa7e6f08eb75dfa Mon Sep 17 00:00:00 2001 From: Roi Martin Date: Wed, 9 Oct 2024 10:08:33 +0200 Subject: [PATCH 001/125] btrfs: fix uninitialized pointer free in add_inode_ref() commit 66691c6e2f18d2aa4b22ffb624b9bdc97e9979e4 upstream. The add_inode_ref() function does not initialize the "name" struct when it is declared. If any of the following calls to "read_one_inode() returns NULL, dir = read_one_inode(root, parent_objectid); if (!dir) { ret = -ENOENT; goto out; } inode = read_one_inode(root, inode_objectid); if (!inode) { ret = -EIO; goto out; } then "name.name" would be freed on "out" before being initialized. out: ... kfree(name.name); This issue was reported by Coverity with CID 1526744. Fixes: e43eec81c516 ("btrfs: use struct qstr instead of name and namelen pairs") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Filipe Manana Signed-off-by: Roi Martin Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e0037665aa92..9ca55ad589e5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1374,7 +1374,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; int ret; int log_ref_ver = 0; u64 parent_objectid; From 7fc7c47b9ba0cf2d192f2117a64b24881b0b577f Mon Sep 17 00:00:00 2001 From: Roi Martin Date: Thu, 10 Oct 2024 21:47:17 +0200 Subject: [PATCH 002/125] btrfs: fix uninitialized pointer free on read_alloc_one_name() error commit 2ab5e243c2266c841e0f6904fad1514b18eaf510 upstream. The function read_alloc_one_name() does not initialize the name field of the passed fscrypt_str struct if kmalloc fails to allocate the corresponding buffer. Thus, it is not guaranteed that fscrypt_str.name is initialized when freeing it. This is a follow-up to the linked patch that fixes the remaining instances of the bug introduced by commit e43eec81c516 ("btrfs: use struct qstr instead of name and namelen pairs"). Link: https://lore.kernel.org/linux-btrfs/20241009080833.1355894-1-jroi.martin@gmail.com/ Fixes: e43eec81c516 ("btrfs: use struct qstr instead of name and namelen pairs") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Anand Jain Signed-off-by: Roi Martin Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9ca55ad589e5..cc9a2f8a4ae3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1846,7 +1846,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2126,7 +2126,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct fscrypt_str name; + struct fscrypt_str name = { 0 }; struct inode *inode = NULL; struct btrfs_key location; From 5511999e9615e4318e9142d23b29bd1597befc08 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 8 Oct 2024 22:42:57 +0900 Subject: [PATCH 003/125] ksmbd: fix user-after-free from session log off commit 7aa8804c0b67b3cb263a472d17f2cb50d7f1a930 upstream. There is racy issue between smb2 session log off and smb2 session setup. It will cause user-after-free from session log off. This add session_lock when setting SMB2_SESSION_EXPIRED and referece count to session struct not to free session while it is being used. Cc: stable@vger.kernel.org # v5.15+ Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-25282 Signed-off-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/server/mgmt/user_session.c | 26 +++++++++++++++++++++----- fs/smb/server/mgmt/user_session.h | 4 ++++ fs/smb/server/server.c | 2 ++ fs/smb/server/smb2pdu.c | 8 +++++++- 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index dac5f984f175..9f40b9c473ba 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -176,9 +176,10 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (sess->state != SMB2_SESSION_VALID || - time_after(jiffies, - sess->last_active + SMB2_SESSION_TIMEOUT)) { + if (atomic_read(&sess->refcnt) == 0 && + (sess->state != SMB2_SESSION_VALID || + time_after(jiffies, + sess->last_active + SMB2_SESSION_TIMEOUT))) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); ksmbd_session_destroy(sess); @@ -268,8 +269,6 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); - if (sess) - sess->last_active = jiffies; up_read(&sessions_table_lock); return sess; @@ -288,6 +287,22 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, return sess; } +void ksmbd_user_session_get(struct ksmbd_session *sess) +{ + atomic_inc(&sess->refcnt); +} + +void ksmbd_user_session_put(struct ksmbd_session *sess) +{ + if (!sess) + return; + + if (atomic_read(&sess->refcnt) <= 0) + WARN_ON(1); + else + atomic_dec(&sess->refcnt); +} + struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, u64 sess_id) { @@ -390,6 +405,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); + atomic_set(&sess->refcnt, 1); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index dc9fded2cd43..c1c4b20bd5c6 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -61,6 +61,8 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; rwlock_t tree_conns_lock; + + atomic_t refcnt; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) @@ -104,4 +106,6 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name); void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id); int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id); +void ksmbd_user_session_get(struct ksmbd_session *sess); +void ksmbd_user_session_put(struct ksmbd_session *sess); #endif /* __USER_SESSION_MANAGEMENT_H__ */ diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 2bbc3c3316f0..d5d85300560d 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -238,6 +238,8 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } while (is_chained == true); send: + if (work->sess) + ksmbd_user_session_put(work->sess); if (work->tcon) ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index c6473b08b1f3..cac9eb4663aa 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -605,8 +605,10 @@ int smb2_check_user_session(struct ksmbd_work *work) /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); - if (work->sess) + if (work->sess) { + ksmbd_user_session_get(work->sess); return 1; + } ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); return -ENOENT; } @@ -1743,6 +1745,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = true; + ksmbd_user_session_get(sess); } else if ((conn->dialect < SMB30_PROT_ID || server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && (req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { @@ -1769,6 +1772,7 @@ int smb2_sess_setup(struct ksmbd_work *work) } conn->binding = false; + ksmbd_user_session_get(sess); } work->sess = sess; @@ -2229,7 +2233,9 @@ int smb2_session_logoff(struct ksmbd_work *work) } ksmbd_destroy_file_table(&sess->file_table); + down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; + up_write(&conn->session_lock); ksmbd_free_user(sess->user); sess->user = NULL; From 3c088dba8a4ed4e631c7e941fb31ef2b247d906a Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Wed, 9 Oct 2024 16:42:48 +0300 Subject: [PATCH 004/125] ALSA: hda/conexant - Fix audio routing for HP EliteOne 1000 G2 commit 9988844c457f6f17fb2e75aa000b6c3b1b673bb9 upstream. There is a problem with simultaneous audio output to headphones and speakers, and when headphones are turned off, the speakers also turn off and do not turn them on. However, it was found that if you boot linux immediately after windows, there are no such problems. When comparing alsa-info, the only difference is the different configuration of Node 0x1d: working conf. (windows): Pin-ctls: 0x80: HP not working (linux): Pin-ctls: 0xc0: OUT HP This patch disable the AC_PINCTL_OUT_EN bit of Node 0x1d and fixes the described problem. Signed-off-by: Vasiliy Kovalev Cc: Link: https://patch.msgid.link/20241009134248.662175-1-kovalev@altlinux.org Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_conexant.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 8a3abd4babba..01a2c22fef94 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -307,6 +307,7 @@ enum { CXT_FIXUP_HP_SPECTRE, CXT_FIXUP_HP_GATE_MIC, CXT_FIXUP_MUTE_LED_GPIO, + CXT_FIXUP_HP_ELITEONE_OUT_DIS, CXT_FIXUP_HP_ZBOOK_MUTE_LED, CXT_FIXUP_HEADSET_MIC, CXT_FIXUP_HP_MIC_NO_PRESENCE, @@ -324,6 +325,19 @@ static void cxt_fixup_stereo_dmic(struct hda_codec *codec, spec->gen.inv_dmic_split = 1; } +/* fix widget control pin settings */ +static void cxt_fixup_update_pinctl(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + if (action == HDA_FIXUP_ACT_PROBE) { + /* Unset OUT_EN for this Node pin, leaving only HP_EN. + * This is the value stored in the codec register after + * the correct initialization of the previous windows boot. + */ + snd_hda_set_pin_ctl(codec, 0x1d, AC_PINCTL_HP_EN); + } +} + static void cxt5066_increase_mic_boost(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -975,6 +989,10 @@ static const struct hda_fixup cxt_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = cxt_fixup_mute_led_gpio, }, + [CXT_FIXUP_HP_ELITEONE_OUT_DIS] = { + .type = HDA_FIXUP_FUNC, + .v.func = cxt_fixup_update_pinctl, + }, [CXT_FIXUP_HP_ZBOOK_MUTE_LED] = { .type = HDA_FIXUP_FUNC, .v.func = cxt_fixup_hp_zbook_mute_led, @@ -1065,6 +1083,7 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x103c, 0x83b2, "HP EliteBook 840 G5", CXT_FIXUP_HP_DOCK), SND_PCI_QUIRK(0x103c, 0x83b3, "HP EliteBook 830 G5", CXT_FIXUP_HP_DOCK), SND_PCI_QUIRK(0x103c, 0x83d3, "HP ProBook 640 G4", CXT_FIXUP_HP_DOCK), + SND_PCI_QUIRK(0x103c, 0x83e5, "HP EliteOne 1000 G2", CXT_FIXUP_HP_ELITEONE_OUT_DIS), SND_PCI_QUIRK(0x103c, 0x8402, "HP ProBook 645 G4", CXT_FIXUP_MUTE_LED_GPIO), SND_PCI_QUIRK(0x103c, 0x8427, "HP ZBook Studio G5", CXT_FIXUP_HP_ZBOOK_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x844f, "HP ZBook Studio G5", CXT_FIXUP_HP_ZBOOK_MUTE_LED), From 7b2e478abab0b3a33515433a6af563aebba773c1 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 15 Oct 2024 10:38:47 +0200 Subject: [PATCH 005/125] mptcp: pm: fix UaF read in mptcp_pm_nl_rm_addr_or_subflow commit 7decd1f5904a489d3ccdcf131972f94645681689 upstream. Syzkaller reported this splat: ================================================================== BUG: KASAN: slab-use-after-free in mptcp_pm_nl_rm_addr_or_subflow+0xb44/0xcc0 net/mptcp/pm_netlink.c:881 Read of size 4 at addr ffff8880569ac858 by task syz.1.2799/14662 CPU: 0 UID: 0 PID: 14662 Comm: syz.1.2799 Not tainted 6.12.0-rc2-syzkaller-00307-g36c254515dc6 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:377 [inline] print_report+0xc3/0x620 mm/kasan/report.c:488 kasan_report+0xd9/0x110 mm/kasan/report.c:601 mptcp_pm_nl_rm_addr_or_subflow+0xb44/0xcc0 net/mptcp/pm_netlink.c:881 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:914 [inline] mptcp_nl_remove_id_zero_address+0x305/0x4a0 net/mptcp/pm_netlink.c:1572 mptcp_pm_nl_del_addr_doit+0x5c9/0x770 net/mptcp/pm_netlink.c:1603 genl_family_rcv_msg_doit+0x202/0x2f0 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x565/0x800 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x165/0x410 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x53c/0x7f0 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8b8/0xd70 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg net/socket.c:744 [inline] ____sys_sendmsg+0x9ae/0xb40 net/socket.c:2607 ___sys_sendmsg+0x135/0x1e0 net/socket.c:2661 __sys_sendmsg+0x117/0x1f0 net/socket.c:2690 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e RIP: 0023:0xf7fe4579 Code: b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00 RSP: 002b:00000000f574556c EFLAGS: 00000296 ORIG_RAX: 0000000000000172 RAX: ffffffffffffffda RBX: 000000000000000b RCX: 0000000020000140 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000296 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 5387: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:394 kmalloc_noprof include/linux/slab.h:878 [inline] kzalloc_noprof include/linux/slab.h:1014 [inline] subflow_create_ctx+0x87/0x2a0 net/mptcp/subflow.c:1803 subflow_ulp_init+0xc3/0x4d0 net/mptcp/subflow.c:1956 __tcp_set_ulp net/ipv4/tcp_ulp.c:146 [inline] tcp_set_ulp+0x326/0x7f0 net/ipv4/tcp_ulp.c:167 mptcp_subflow_create_socket+0x4ae/0x10a0 net/mptcp/subflow.c:1764 __mptcp_subflow_connect+0x3cc/0x1490 net/mptcp/subflow.c:1592 mptcp_pm_create_subflow_or_signal_addr+0xbda/0x23a0 net/mptcp/pm_netlink.c:642 mptcp_pm_nl_fully_established net/mptcp/pm_netlink.c:650 [inline] mptcp_pm_nl_work+0x3a1/0x4f0 net/mptcp/pm_netlink.c:943 mptcp_worker+0x15a/0x1240 net/mptcp/protocol.c:2777 process_one_work+0x958/0x1b30 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x6c8/0xf00 kernel/workqueue.c:3391 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Freed by task 113: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 kasan_save_free_info+0x3b/0x60 mm/kasan/generic.c:579 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x51/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:230 [inline] slab_free_hook mm/slub.c:2342 [inline] slab_free mm/slub.c:4579 [inline] kfree+0x14f/0x4b0 mm/slub.c:4727 kvfree+0x47/0x50 mm/util.c:701 kvfree_rcu_list+0xf5/0x2c0 kernel/rcu/tree.c:3423 kvfree_rcu_drain_ready kernel/rcu/tree.c:3563 [inline] kfree_rcu_monitor+0x503/0x8b0 kernel/rcu/tree.c:3632 kfree_rcu_shrink_scan+0x245/0x3a0 kernel/rcu/tree.c:3966 do_shrink_slab+0x44f/0x11c0 mm/shrinker.c:435 shrink_slab+0x32b/0x12a0 mm/shrinker.c:662 shrink_one+0x47e/0x7b0 mm/vmscan.c:4818 shrink_many mm/vmscan.c:4879 [inline] lru_gen_shrink_node mm/vmscan.c:4957 [inline] shrink_node+0x2452/0x39d0 mm/vmscan.c:5937 kswapd_shrink_node mm/vmscan.c:6765 [inline] balance_pgdat+0xc19/0x18f0 mm/vmscan.c:6957 kswapd+0x5ea/0xbf0 mm/vmscan.c:7226 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Last potentially related work creation: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 __kasan_record_aux_stack+0xba/0xd0 mm/kasan/generic.c:541 kvfree_call_rcu+0x74/0xbe0 kernel/rcu/tree.c:3810 subflow_ulp_release+0x2ae/0x350 net/mptcp/subflow.c:2009 tcp_cleanup_ulp+0x7c/0x130 net/ipv4/tcp_ulp.c:124 tcp_v4_destroy_sock+0x1c5/0x6a0 net/ipv4/tcp_ipv4.c:2541 inet_csk_destroy_sock+0x1a3/0x440 net/ipv4/inet_connection_sock.c:1293 tcp_done+0x252/0x350 net/ipv4/tcp.c:4870 tcp_rcv_state_process+0x379b/0x4f30 net/ipv4/tcp_input.c:6933 tcp_v4_do_rcv+0x1ad/0xa90 net/ipv4/tcp_ipv4.c:1938 sk_backlog_rcv include/net/sock.h:1115 [inline] __release_sock+0x31b/0x400 net/core/sock.c:3072 __tcp_close+0x4f3/0xff0 net/ipv4/tcp.c:3142 __mptcp_close_ssk+0x331/0x14d0 net/mptcp/protocol.c:2489 mptcp_close_ssk net/mptcp/protocol.c:2543 [inline] mptcp_close_ssk+0x150/0x220 net/mptcp/protocol.c:2526 mptcp_pm_nl_rm_addr_or_subflow+0x2be/0xcc0 net/mptcp/pm_netlink.c:878 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:914 [inline] mptcp_nl_remove_id_zero_address+0x305/0x4a0 net/mptcp/pm_netlink.c:1572 mptcp_pm_nl_del_addr_doit+0x5c9/0x770 net/mptcp/pm_netlink.c:1603 genl_family_rcv_msg_doit+0x202/0x2f0 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x565/0x800 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x165/0x410 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x53c/0x7f0 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8b8/0xd70 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg net/socket.c:744 [inline] ____sys_sendmsg+0x9ae/0xb40 net/socket.c:2607 ___sys_sendmsg+0x135/0x1e0 net/socket.c:2661 __sys_sendmsg+0x117/0x1f0 net/socket.c:2690 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e The buggy address belongs to the object at ffff8880569ac800 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 88 bytes inside of freed 512-byte region [ffff8880569ac800, ffff8880569aca00) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x569ac head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x4fff00000000040(head|node=1|zone=1|lastcpupid=0x7ff) page_type: f5(slab) raw: 04fff00000000040 ffff88801ac42c80 dead000000000100 dead000000000122 raw: 0000000000000000 0000000080100010 00000001f5000000 0000000000000000 head: 04fff00000000040 ffff88801ac42c80 dead000000000100 dead000000000122 head: 0000000000000000 0000000080100010 00000001f5000000 0000000000000000 head: 04fff00000000002 ffffea00015a6b01 ffffffffffffffff 0000000000000000 head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 10238, tgid 10238 (kworker/u32:6), ts 597403252405, free_ts 597177952947 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x2d1/0x350 mm/page_alloc.c:1537 prep_new_page mm/page_alloc.c:1545 [inline] get_page_from_freelist+0x101e/0x3070 mm/page_alloc.c:3457 __alloc_pages_noprof+0x223/0x25a0 mm/page_alloc.c:4733 alloc_pages_mpol_noprof+0x2c9/0x610 mm/mempolicy.c:2265 alloc_slab_page mm/slub.c:2412 [inline] allocate_slab mm/slub.c:2578 [inline] new_slab+0x2ba/0x3f0 mm/slub.c:2631 ___slab_alloc+0xd1d/0x16f0 mm/slub.c:3818 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] __kmalloc_cache_noprof+0x2c5/0x310 mm/slub.c:4290 kmalloc_noprof include/linux/slab.h:878 [inline] kzalloc_noprof include/linux/slab.h:1014 [inline] mld_add_delrec net/ipv6/mcast.c:743 [inline] igmp6_leave_group net/ipv6/mcast.c:2625 [inline] igmp6_group_dropped+0x4ab/0xe40 net/ipv6/mcast.c:723 __ipv6_dev_mc_dec+0x281/0x360 net/ipv6/mcast.c:979 addrconf_leave_solict net/ipv6/addrconf.c:2253 [inline] __ipv6_ifa_notify+0x3f6/0xc30 net/ipv6/addrconf.c:6283 addrconf_ifdown.isra.0+0xef9/0x1a20 net/ipv6/addrconf.c:3982 addrconf_notify+0x220/0x19c0 net/ipv6/addrconf.c:3781 notifier_call_chain+0xb9/0x410 kernel/notifier.c:93 call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:1996 call_netdevice_notifiers_extack net/core/dev.c:2034 [inline] call_netdevice_notifiers net/core/dev.c:2048 [inline] dev_close_many+0x333/0x6a0 net/core/dev.c:1589 page last free pid 13136 tgid 13136 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1108 [inline] free_unref_page+0x5f4/0xdc0 mm/page_alloc.c:2638 stack_depot_save_flags+0x2da/0x900 lib/stackdepot.c:666 kasan_save_stack+0x42/0x60 mm/kasan/common.c:48 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:319 [inline] __kasan_slab_alloc+0x89/0x90 mm/kasan/common.c:345 kasan_slab_alloc include/linux/kasan.h:247 [inline] slab_post_alloc_hook mm/slub.c:4085 [inline] slab_alloc_node mm/slub.c:4134 [inline] kmem_cache_alloc_noprof+0x121/0x2f0 mm/slub.c:4141 skb_clone+0x190/0x3f0 net/core/skbuff.c:2084 do_one_broadcast net/netlink/af_netlink.c:1462 [inline] netlink_broadcast_filtered+0xb11/0xef0 net/netlink/af_netlink.c:1540 netlink_broadcast+0x39/0x50 net/netlink/af_netlink.c:1564 uevent_net_broadcast_untagged lib/kobject_uevent.c:331 [inline] kobject_uevent_net_broadcast lib/kobject_uevent.c:410 [inline] kobject_uevent_env+0xacd/0x1670 lib/kobject_uevent.c:608 device_del+0x623/0x9f0 drivers/base/core.c:3882 snd_card_disconnect.part.0+0x58a/0x7c0 sound/core/init.c:546 snd_card_disconnect+0x1f/0x30 sound/core/init.c:495 snd_usx2y_disconnect+0xe9/0x1f0 sound/usb/usx2y/usbusx2y.c:417 usb_unbind_interface+0x1e8/0x970 drivers/usb/core/driver.c:461 device_remove drivers/base/dd.c:569 [inline] device_remove+0x122/0x170 drivers/base/dd.c:561 That's because 'subflow' is used just after 'mptcp_close_ssk(subflow)', which will initiate the release of its memory. Even if it is very likely the release and the re-utilisation will be done later on, it is of course better to avoid any issues and read the content of 'subflow' before closing it. Fixes: 1c1f72137598 ("mptcp: pm: only decrement add_addr_accepted for MPJ req") Cc: stable@vger.kernel.org Reported-by: syzbot+3c8b7a8e7df6a2a226ca@syzkaller.appspotmail.com Closes: https://lore.kernel.org/670d7337.050a0220.4cbc0.004f.GAE@google.com Signed-off-by: Matthieu Baerts (NGI0) Acked-by: Paolo Abeni Link: https://patch.msgid.link/20241015-net-mptcp-uaf-pm-rm-v1-1-c4ee5d987a64@kernel.org Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 131af6be4b67..54b49fc6efd9 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -877,12 +877,12 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); + removed |= subflow->request_join; /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); - removed |= subflow->request_join; if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } From 1c95443e44e1f468b6635fb9ce9809f210a2d66e Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:53 +0800 Subject: [PATCH 006/125] net: enetc: remove xdp_drops statistic from enetc_xdp_drop() commit 412950d5746f7aa139e14fe95338694c1f09b595 upstream. The xdp_drops statistic indicates the number of XDP frames dropped in the Rx direction. However, enetc_xdp_drop() is also used in XDP_TX and XDP_REDIRECT actions. If frame loss occurs in these two actions, the frames loss count should not be included in xdp_drops, because there are already xdp_tx_drops and xdp_redirect_failures to count the frame loss of these two actions, so it's better to remove xdp_drops statistic from enetc_xdp_drop() and increase xdp_drops in XDP_DROP action. Fixes: 7ed2bc80074e ("net: enetc: add support for XDP_TX") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Maciej Fijalkowski Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 18e3a9cd4fc0..2ec324d715d9 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1524,7 +1524,6 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, &rx_ring->rx_swbd[rx_ring_first]); enetc_bdr_idx_inc(rx_ring, &rx_ring_first); } - rx_ring->stats.xdp_drops++; } static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, @@ -1589,6 +1588,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, fallthrough; case XDP_DROP: enetc_xdp_drop(rx_ring, orig_i, i); + rx_ring->stats.xdp_drops++; break; case XDP_PASS: rxbd = orig_rxbd; From 37184349468a0919267179acbf7324a2de767f6b Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:54 +0800 Subject: [PATCH 007/125] net: enetc: block concurrent XDP transmissions during ring reconfiguration commit c728a95ccf2a8ba544facfc30a4418d4c68c39f0 upstream. When testing the XDP_REDIRECT function on the LS1028A platform, we found a very reproducible issue that the Tx frames can no longer be sent out even if XDP_REDIRECT is turned off. Specifically, if there is a lot of traffic on Rx direction, when XDP_REDIRECT is turned on, the console may display some warnings like "timeout for tx ring #6 clear", and all redirected frames will be dropped, the detailed log is as follows. root@ls1028ardb:~# ./xdp-bench redirect eno0 eno2 Redirecting from eno0 (ifindex 3; driver fsl_enetc) to eno2 (ifindex 4; driver fsl_enetc) [203.849809] fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #5 clear [204.006051] fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #6 clear [204.161944] fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #7 clear eno0->eno2 1420505 rx/s 1420590 err,drop/s 0 xmit/s xmit eno0->eno2 0 xmit/s 1420590 drop/s 0 drv_err/s 15.71 bulk-avg eno0->eno2 1420484 rx/s 1420485 err,drop/s 0 xmit/s xmit eno0->eno2 0 xmit/s 1420485 drop/s 0 drv_err/s 15.71 bulk-avg By analyzing the XDP_REDIRECT implementation of enetc driver, the driver will reconfigure Tx and Rx BD rings when a bpf program is installed or uninstalled, but there is no mechanisms to block the redirected frames when enetc driver reconfigures rings. Similarly, XDP_TX verdicts on received frames can also lead to frames being enqueued in the Tx rings. Because XDP ignores the state set by the netif_tx_wake_queue() API, so introduce the ENETC_TX_DOWN flag to suppress transmission of XDP frames. Fixes: c33bfaf91c4c ("net: enetc: set up XDP program under enetc_reconfigure()") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-3-wei.fang@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/enetc/enetc.c | 14 ++++++++++++++ drivers/net/ethernet/freescale/enetc/enetc.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 2ec324d715d9..4ea2c9e89ad8 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -902,6 +902,7 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) if (unlikely(tx_frm_cnt && netif_carrier_ok(ndev) && __netif_subqueue_stopped(ndev, tx_ring->index) && + !test_bit(ENETC_TX_DOWN, &priv->flags) && (enetc_bd_unused(tx_ring) >= ENETC_TXBDS_MAX_NEEDED))) { netif_wake_subqueue(ndev, tx_ring->index); } @@ -1380,6 +1381,9 @@ int enetc_xdp_xmit(struct net_device *ndev, int num_frames, int xdp_tx_bd_cnt, i, k; int xdp_tx_frm_cnt = 0; + if (unlikely(test_bit(ENETC_TX_DOWN, &priv->flags))) + return -ENETDOWN; + enetc_lock_mdio(); tx_ring = priv->xdp_tx_ring[smp_processor_id()]; @@ -1605,6 +1609,12 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, break; case XDP_TX: tx_ring = priv->xdp_tx_ring[rx_ring->index]; + if (unlikely(test_bit(ENETC_TX_DOWN, &priv->flags))) { + enetc_xdp_drop(rx_ring, orig_i, i); + tx_ring->stats.xdp_tx_drops++; + break; + } + xdp_tx_bd_cnt = enetc_rx_swbd_to_xdp_tx_swbd(xdp_tx_arr, rx_ring, orig_i, i); @@ -2466,6 +2476,8 @@ void enetc_start(struct net_device *ndev) enetc_enable_bdrs(priv); netif_tx_start_all_queues(ndev); + + clear_bit(ENETC_TX_DOWN, &priv->flags); } EXPORT_SYMBOL_GPL(enetc_start); @@ -2523,6 +2535,8 @@ void enetc_stop(struct net_device *ndev) struct enetc_ndev_priv *priv = netdev_priv(ndev); int i; + set_bit(ENETC_TX_DOWN, &priv->flags); + netif_tx_stop_all_queues(ndev); enetc_disable_bdrs(priv); diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 7439739cd81a..fcadb0848d25 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -328,6 +328,7 @@ enum enetc_active_offloads { enum enetc_flags_bit { ENETC_TX_ONESTEP_TSTAMP_IN_PROGRESS = 0, + ENETC_TX_DOWN, }; /* interrupt coalescing modes */ From a419f478b92733253e27ac2ea7b15780ed1ecfa7 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:55 +0800 Subject: [PATCH 008/125] net: enetc: disable Tx BD rings after they are empty commit 0a93f2ca4be6c4616d371f18a3fabad2df7f8d55 upstream. The Tx BD rings are disabled first in enetc_stop() and the driver waits for them to become empty. This operation is not safe while the ring is actively transmitting frames, and will cause the ring to not be empty and hardware exception. As described in the NETC block guide, software should only disable an active Tx ring after all pending ring entries have been consumed (i.e. when PI = CI). Disabling a transmit ring that is actively processing BDs risks a HW-SW race hazard whereby a hardware resource becomes assigned to work on one or more ring entries only to have those entries be removed due to the ring becoming disabled. When testing XDP_REDIRECT feautre, although all frames were blocked from being put into Tx rings during ring reconfiguration, the similar warning log was still encountered: fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #6 clear fsl_enetc 0000:00:00.2 eno2: timeout for tx ring #7 clear The reason is that when there are still unsent frames in the Tx ring, disabling the Tx ring causes the remaining frames to be unable to be sent out. And the Tx ring cannot be restored, which means that even if the xdp program is uninstalled, the Tx frames cannot be sent out anymore. Therefore, correct the operation order in enect_start() and enect_stop(). Fixes: ff58fda09096 ("net: enetc: prioritize ability to go down over packet processing") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-4-wei.fang@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/enetc/enetc.c | 36 ++++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 4ea2c9e89ad8..6f20e17af6cd 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2236,16 +2236,22 @@ static void enetc_enable_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) enetc_rxbdr_wr(hw, idx, ENETC_RBMR, rbmr); } -static void enetc_enable_bdrs(struct enetc_ndev_priv *priv) +static void enetc_enable_rx_bdrs(struct enetc_ndev_priv *priv) +{ + struct enetc_hw *hw = &priv->si->hw; + int i; + + for (i = 0; i < priv->num_rx_rings; i++) + enetc_enable_rxbdr(hw, priv->rx_ring[i]); +} + +static void enetc_enable_tx_bdrs(struct enetc_ndev_priv *priv) { struct enetc_hw *hw = &priv->si->hw; int i; for (i = 0; i < priv->num_tx_rings; i++) enetc_enable_txbdr(hw, priv->tx_ring[i]); - - for (i = 0; i < priv->num_rx_rings; i++) - enetc_enable_rxbdr(hw, priv->rx_ring[i]); } static void enetc_disable_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) @@ -2264,16 +2270,22 @@ static void enetc_disable_txbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) enetc_txbdr_wr(hw, idx, ENETC_TBMR, 0); } -static void enetc_disable_bdrs(struct enetc_ndev_priv *priv) +static void enetc_disable_rx_bdrs(struct enetc_ndev_priv *priv) +{ + struct enetc_hw *hw = &priv->si->hw; + int i; + + for (i = 0; i < priv->num_rx_rings; i++) + enetc_disable_rxbdr(hw, priv->rx_ring[i]); +} + +static void enetc_disable_tx_bdrs(struct enetc_ndev_priv *priv) { struct enetc_hw *hw = &priv->si->hw; int i; for (i = 0; i < priv->num_tx_rings; i++) enetc_disable_txbdr(hw, priv->tx_ring[i]); - - for (i = 0; i < priv->num_rx_rings; i++) - enetc_disable_rxbdr(hw, priv->rx_ring[i]); } static void enetc_wait_txbdr(struct enetc_hw *hw, struct enetc_bdr *tx_ring) @@ -2465,6 +2477,8 @@ void enetc_start(struct net_device *ndev) enetc_setup_interrupts(priv); + enetc_enable_tx_bdrs(priv); + for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, ENETC_BDR_INT_BASE_IDX + i); @@ -2473,7 +2487,7 @@ void enetc_start(struct net_device *ndev) enable_irq(irq); } - enetc_enable_bdrs(priv); + enetc_enable_rx_bdrs(priv); netif_tx_start_all_queues(ndev); @@ -2539,7 +2553,7 @@ void enetc_stop(struct net_device *ndev) netif_tx_stop_all_queues(ndev); - enetc_disable_bdrs(priv); + enetc_disable_rx_bdrs(priv); for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, @@ -2552,6 +2566,8 @@ void enetc_stop(struct net_device *ndev) enetc_wait_bdrs(priv); + enetc_disable_tx_bdrs(priv); + enetc_clear_interrupts(priv); } EXPORT_SYMBOL_GPL(enetc_stop); From 760a7c9695c08661c176b7b729882375a23f9e3d Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 10 Oct 2024 17:20:56 +0800 Subject: [PATCH 009/125] net: enetc: disable NAPI after all rings are disabled commit 6b58fadd44aafbbd6af5f0b965063e1fd2063992 upstream. When running "xdp-bench tx eno0" to test the XDP_TX feature of ENETC on LS1028A, it was found that if the command was re-run multiple times, Rx could not receive the frames, and the result of xdp-bench showed that the rx rate was 0. root@ls1028ardb:~# ./xdp-bench tx eno0 Hairpinning (XDP_TX) packets on eno0 (ifindex 3; driver fsl_enetc) Summary 2046 rx/s 0 err,drop/s Summary 0 rx/s 0 err,drop/s Summary 0 rx/s 0 err,drop/s Summary 0 rx/s 0 err,drop/s By observing the Rx PIR and CIR registers, CIR is always 0x7FF and PIR is always 0x7FE, which means that the Rx ring is full and can no longer accommodate other Rx frames. Therefore, the problem is caused by the Rx BD ring not being cleaned up. Further analysis of the code revealed that the Rx BD ring will only be cleaned if the "cleaned_cnt > xdp_tx_in_flight" condition is met. Therefore, some debug logs were added to the driver and the current values of cleaned_cnt and xdp_tx_in_flight were printed when the Rx BD ring was full. The logs are as follows. [ 178.762419] [XDP TX] >> cleaned_cnt:1728, xdp_tx_in_flight:2140 [ 178.771387] [XDP TX] >> cleaned_cnt:1941, xdp_tx_in_flight:2110 [ 178.776058] [XDP TX] >> cleaned_cnt:1792, xdp_tx_in_flight:2110 From the results, the max value of xdp_tx_in_flight has reached 2140. However, the size of the Rx BD ring is only 2048. So xdp_tx_in_flight did not drop to 0 after enetc_stop() is called and the driver does not clear it. The root cause is that NAPI is disabled too aggressively, without having waited for the pending XDP_TX frames to be transmitted, and their buffers recycled, so that xdp_tx_in_flight cannot naturally drop to 0. Later, enetc_free_tx_ring() does free those stale, unsent XDP_TX packets, but it is not coded up to also reset xdp_tx_in_flight, hence the manifestation of the bug. One option would be to cover this extra condition in enetc_free_tx_ring(), but now that the ENETC_TX_DOWN exists, we have created a window at the beginning of enetc_stop() where NAPI can still be scheduled, but any concurrent enqueue will be blocked. Therefore, enetc_wait_bdrs() and enetc_disable_tx_bdrs() can be called with NAPI still scheduled, and it is guaranteed that this will not wait indefinitely, but instead give us an indication that the pending TX frames have orderly dropped to zero. Only then should we call napi_disable(). This way, enetc_free_tx_ring() becomes entirely redundant and can be dropped as part of subsequent cleanup. The change also refactors enetc_start() so that it looks like the mirror opposite procedure of enetc_stop(). Fixes: ff58fda09096 ("net: enetc: prioritize ability to go down over packet processing") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20241010092056.298128-5-wei.fang@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/enetc/enetc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 6f20e17af6cd..c17b9e338516 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2477,8 +2477,6 @@ void enetc_start(struct net_device *ndev) enetc_setup_interrupts(priv); - enetc_enable_tx_bdrs(priv); - for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, ENETC_BDR_INT_BASE_IDX + i); @@ -2487,6 +2485,8 @@ void enetc_start(struct net_device *ndev) enable_irq(irq); } + enetc_enable_tx_bdrs(priv); + enetc_enable_rx_bdrs(priv); netif_tx_start_all_queues(ndev); @@ -2555,6 +2555,10 @@ void enetc_stop(struct net_device *ndev) enetc_disable_rx_bdrs(priv); + enetc_wait_bdrs(priv); + + enetc_disable_tx_bdrs(priv); + for (i = 0; i < priv->bdr_int_num; i++) { int irq = pci_irq_vector(priv->si->pdev, ENETC_BDR_INT_BASE_IDX + i); @@ -2564,10 +2568,6 @@ void enetc_stop(struct net_device *ndev) napi_disable(&priv->int_vector[i]->napi); } - enetc_wait_bdrs(priv); - - enetc_disable_tx_bdrs(priv); - enetc_clear_interrupts(priv); } EXPORT_SYMBOL_GPL(enetc_stop); From 75150ba93ddaccceff80ea3fd9efa79e07aa1fb1 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 11 Oct 2024 11:01:03 +0800 Subject: [PATCH 010/125] net: enetc: add missing static descriptor and inline keyword commit 1d7b2ce43d2c22a21dadaf689cb36a69570346a6 upstream. Fix the build warnings when CONFIG_FSL_ENETC_MDIO is not enabled. The detailed warnings are shown as follows. include/linux/fsl/enetc_mdio.h:62:18: warning: no previous prototype for function 'enetc_hw_alloc' [-Wmissing-prototypes] 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ include/linux/fsl/enetc_mdio.h:62:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 62 | struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) | ^ | static 8 warnings generated. Fixes: 6517798dd343 ("enetc: Make MDIO accessors more generic and export to include/linux/fsl") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410102136.jQHZOcS4-lkp@intel.com/ Signed-off-by: Wei Fang Reviewed-by: Claudiu Manoil Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241011030103.392362-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/linux/fsl/enetc_mdio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/fsl/enetc_mdio.h b/include/linux/fsl/enetc_mdio.h index df25fffdc0ae..623ccfcbf39c 100644 --- a/include/linux/fsl/enetc_mdio.h +++ b/include/linux/fsl/enetc_mdio.h @@ -59,7 +59,8 @@ static inline int enetc_mdio_read_c45(struct mii_bus *bus, int phy_id, static inline int enetc_mdio_write_c45(struct mii_bus *bus, int phy_id, int devad, int regnum, u16 value) { return -EINVAL; } -struct enetc_hw *enetc_hw_alloc(struct device *dev, void __iomem *port_regs) +static inline struct enetc_hw *enetc_hw_alloc(struct device *dev, + void __iomem *port_regs) { return ERR_PTR(-EINVAL); } #endif From a3f169e398215e71361774d13bf91a0101283ac2 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 9 Oct 2024 15:23:01 +0800 Subject: [PATCH 011/125] posix-clock: Fix missing timespec64 check in pc_clock_settime() commit d8794ac20a299b647ba9958f6d657051fc51a540 upstream. As Andrew pointed out, it will make sense that the PTP core checked timespec64 struct's tv_sec and tv_nsec range before calling ptp->info->settime64(). As the man manual of clock_settime() said, if tp.tv_sec is negative or tp.tv_nsec is outside the range [0..999,999,999], it should return EINVAL, which include dynamic clocks which handles PTP clock, and the condition is consistent with timespec64_valid(). As Thomas suggested, timespec64_valid() only check the timespec is valid, but not ensure that the time is in a valid range, so check it ahead using timespec64_valid_strict() in pc_clock_settime() and return -EINVAL if not valid. There are some drivers that use tp->tv_sec and tp->tv_nsec directly to write registers without validity checks and assume that the higher layer has checked it, which is dangerous and will benefit from this, such as hclge_ptp_settime(), igb_ptp_settime_i210(), _rcar_gen4_ptp_settime(), and some drivers can remove the checks of itself. Cc: stable@vger.kernel.org Fixes: 0606f422b453 ("posix clocks: Introduce dynamic clocks") Acked-by: Richard Cochran Suggested-by: Andrew Lunn Suggested-by: Thomas Gleixner Signed-off-by: Jinjie Ruan Link: https://patch.msgid.link/20241009072302.1754567-2-ruanjinjie@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- kernel/time/posix-clock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 77c0c2370b6d..8127673bfc45 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -299,6 +299,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) goto out; } + if (!timespec64_valid_strict(ts)) + return -EINVAL; + if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else From 9f1e7735474e7457a4d919a517900e46868ae5f6 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:46 +0100 Subject: [PATCH 012/125] arm64: probes: Remove broken LDR (literal) uprobe support commit acc450aa07099d071b18174c22a1119c57da8227 upstream. The simulate_ldr_literal() and simulate_ldrsw_literal() functions are unsafe to use for uprobes. Both functions were originally written for use with kprobes, and access memory with plain C accesses. When uprobes was added, these were reused unmodified even though they cannot safely access user memory. There are three key problems: 1) The plain C accesses do not have corresponding extable entries, and thus if they encounter a fault the kernel will treat these as unintentional accesses to user memory, resulting in a BUG() which will kill the kernel thread, and likely lead to further issues (e.g. lockup or panic()). 2) The plain C accesses are subject to HW PAN and SW PAN, and so when either is in use, any attempt to simulate an access to user memory will fault. Thus neither simulate_ldr_literal() nor simulate_ldrsw_literal() can do anything useful when simulating a user instruction on any system with HW PAN or SW PAN. 3) The plain C accesses are privileged, as they run in kernel context, and in practice can access a small range of kernel virtual addresses. The instructions they simulate have a range of +/-1MiB, and since the simulated instructions must itself be a user instructions in the TTBR0 address range, these can address the final 1MiB of the TTBR1 acddress range by wrapping downwards from an address in the first 1MiB of the TTBR0 address range. In contemporary kernels the last 8MiB of TTBR1 address range is reserved, and accesses to this will always fault, meaning this is no worse than (1). Historically, it was theoretically possible for the linear map or vmemmap to spill into the final 8MiB of the TTBR1 address range, but in practice this is extremely unlikely to occur as this would require either: * Having enough physical memory to fill the entire linear map all the way to the final 1MiB of the TTBR1 address range. * Getting unlucky with KASLR randomization of the linear map such that the populated region happens to overlap with the last 1MiB of the TTBR address range. ... and in either case if we were to spill into the final page there would be larger problems as the final page would alias with error pointers. Practically speaking, (1) and (2) are the big issues. Given there have been no reports of problems since the broken code was introduced, it appears that no-one is relying on probing these instructions with uprobes. Avoid these issues by not allowing uprobes on LDR (literal) and LDRSW (literal), limiting the use of simulate_ldr_literal() and simulate_ldrsw_literal() to kprobes. Attempts to place uprobes on LDR (literal) and LDRSW (literal) will be rejected as arm_probe_decode_insn() will return INSN_REJECTED. In future we can consider introducing working uprobes support for these instructions, but this will require more significant work. Fixes: 9842ceae9fa8 ("arm64: Add uprobe support") Cc: stable@vger.kernel.org Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-2-mark.rutland@arm.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/probes/decode-insn.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 968d5fffe233..3496d6169e59 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -99,10 +99,6 @@ arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) aarch64_insn_is_blr(insn) || aarch64_insn_is_ret(insn)) { api->handler = simulate_br_blr_ret; - } else if (aarch64_insn_is_ldr_lit(insn)) { - api->handler = simulate_ldr_literal; - } else if (aarch64_insn_is_ldrsw_lit(insn)) { - api->handler = simulate_ldrsw_literal; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -140,6 +136,17 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) probe_opcode_t insn = le32_to_cpu(*addr); probe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; + struct arch_probe_insn *api = &asi->api; + + if (aarch64_insn_is_ldr_lit(insn)) { + api->handler = simulate_ldr_literal; + decoded = INSN_GOOD_NO_SLOT; + } else if (aarch64_insn_is_ldrsw_lit(insn)) { + api->handler = simulate_ldrsw_literal; + decoded = INSN_GOOD_NO_SLOT; + } else { + decoded = arm_probe_decode_insn(insn, &asi->api); + } /* * If there's a symbol defined in front of and near enough to @@ -157,7 +164,6 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) From 173c13e38799c362c5acb6731a695abac11daeab Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:47 +0100 Subject: [PATCH 013/125] arm64: probes: Fix simulate_ldr*_literal() commit 50f813e57601c22b6f26ced3193b9b94d70a2640 upstream. The simulate_ldr_literal() code always loads a 64-bit quantity, and when simulating a 32-bit load into a 'W' register, it discards the most significant 32 bits. For big-endian kernels this means that the relevant bits are discarded, and the value returned is the the subsequent 32 bits in memory (i.e. the value at addr + 4). Additionally, simulate_ldr_literal() and simulate_ldrsw_literal() use a plain C load, which the compiler may tear or elide (e.g. if the target is the zero register). Today this doesn't happen to matter, but it may matter in future if trampoline code uses a LDR (literal) or LDRSW (literal). Update simulate_ldr_literal() and simulate_ldrsw_literal() to use an appropriately-sized READ_ONCE() to perform the access, which avoids these problems. Fixes: 39a67d49ba35 ("arm64: kprobes instruction simulation support") Cc: stable@vger.kernel.org Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-3-mark.rutland@arm.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/probes/simulate-insn.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 22d0b3252476..b65334ab79d2 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -171,17 +171,15 @@ simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) { - u64 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (u64 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); if (opcode & (1 << 30)) /* x0-x30 */ - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(u64 *)load_addr)); else /* w0-w30 */ - set_w_reg(regs, xn, *load_addr); + set_w_reg(regs, xn, READ_ONCE(*(u32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } @@ -189,14 +187,12 @@ simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs) void __kprobes simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) { - s32 *load_addr; + unsigned long load_addr; int xn = opcode & 0x1f; - int disp; - disp = ldr_displacement(opcode); - load_addr = (s32 *) (addr + disp); + load_addr = addr + ldr_displacement(opcode); - set_x_reg(regs, xn, *load_addr); + set_x_reg(regs, xn, READ_ONCE(*(s32 *)load_addr)); instruction_pointer_set(regs, instruction_pointer(regs) + 4); } From 8165bf83b8a64be801d59cd2532b0d1ffed74d00 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:48 +0100 Subject: [PATCH 014/125] arm64: probes: Fix uprobes for big-endian kernels commit 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7 upstream. The arm64 uprobes code is broken for big-endian kernels as it doesn't convert the in-memory instruction encoding (which is always little-endian) into the kernel's native endianness before analyzing and simulating instructions. This may result in a few distinct problems: * The kernel may may erroneously reject probing an instruction which can safely be probed. * The kernel may erroneously erroneously permit stepping an instruction out-of-line when that instruction cannot be stepped out-of-line safely. * The kernel may erroneously simulate instruction incorrectly dur to interpretting the byte-swapped encoding. The endianness mismatch isn't caught by the compiler or sparse because: * The arch_uprobe::{insn,ixol} fields are encoded as arrays of u8, so the compiler and sparse have no idea these contain a little-endian 32-bit value. The core uprobes code populates these with a memcpy() which similarly does not handle endianness. * While the uprobe_opcode_t type is an alias for __le32, both arch_uprobe_analyze_insn() and arch_uprobe_skip_sstep() cast from u8[] to the similarly-named probe_opcode_t, which is an alias for u32. Hence there is no endianness conversion warning. Fix this by changing the arch_uprobe::{insn,ixol} fields to __le32 and adding the appropriate __le32_to_cpu() conversions prior to consuming the instruction encoding. The core uprobes copies these fields as opaque ranges of bytes, and so is unaffected by this change. At the same time, remove MAX_UINSN_BYTES and consistently use AARCH64_INSN_SIZE for clarity. Tested with the following: | #include | #include | | #define noinline __attribute__((noinline)) | | static noinline void *adrp_self(void) | { | void *addr; | | asm volatile( | " adrp %x0, adrp_self\n" | " add %x0, %x0, :lo12:adrp_self\n" | : "=r" (addr)); | } | | | int main(int argc, char *argv) | { | void *ptr = adrp_self(); | bool equal = (ptr == adrp_self); | | printf("adrp_self => %p\n" | "adrp_self() => %p\n" | "%s\n", | adrp_self, ptr, equal ? "EQUAL" : "NOT EQUAL"); | | return 0; | } .... where the adrp_self() function was compiled to: | 00000000004007e0 : | 4007e0: 90000000 adrp x0, 400000 <__ehdr_start> | 4007e4: 911f8000 add x0, x0, #0x7e0 | 4007e8: d65f03c0 ret Before this patch, the ADRP is not recognized, and is assumed to be steppable, resulting in corruption of the result: | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0x4007e0 | EQUAL | # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events | # echo 1 > /sys/kernel/tracing/events/uprobes/enable | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0xffffffffff7e0 | NOT EQUAL After this patch, the ADRP is correctly recognized and simulated: | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0x4007e0 | EQUAL | # | # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events | # echo 1 > /sys/kernel/tracing/events/uprobes/enable | # ./adrp-self | adrp_self => 0x4007e0 | adrp_self() => 0x4007e0 | EQUAL Fixes: 9842ceae9fa8 ("arm64: Add uprobe support") Cc: stable@vger.kernel.org Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-4-mark.rutland@arm.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/uprobes.h | 8 +++----- arch/arm64/kernel/probes/uprobes.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h index 2b09495499c6..014b02897f8e 100644 --- a/arch/arm64/include/asm/uprobes.h +++ b/arch/arm64/include/asm/uprobes.h @@ -10,11 +10,9 @@ #include #include -#define MAX_UINSN_BYTES AARCH64_INSN_SIZE - #define UPROBE_SWBP_INSN cpu_to_le32(BRK64_OPCODE_UPROBES) #define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE -#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES +#define UPROBE_XOL_SLOT_BYTES AARCH64_INSN_SIZE typedef __le32 uprobe_opcode_t; @@ -23,8 +21,8 @@ struct arch_uprobe_task { struct arch_uprobe { union { - u8 insn[MAX_UINSN_BYTES]; - u8 ixol[MAX_UINSN_BYTES]; + __le32 insn; + __le32 ixol; }; struct arch_probe_insn api; bool simulate; diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index d49aef2657cd..a2f137a595fc 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -42,7 +42,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); switch (arm_probe_decode_insn(insn, &auprobe->api)) { case INSN_REJECTED: @@ -108,7 +108,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) if (!auprobe->simulate) return false; - insn = *(probe_opcode_t *)(&auprobe->insn[0]); + insn = le32_to_cpu(auprobe->insn); addr = instruction_pointer(regs); if (auprobe->api.handler) From 81db1e52848694761a1aa162ce76198af9964ed8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 13 Oct 2024 07:29:16 +0200 Subject: [PATCH 015/125] net: macb: Avoid 20s boot delay by skipping MDIO bus registration for fixed-link PHY commit d0c3601f2c4e12e7689b0f46ebc17525250ea8c3 upstream. A boot delay was introduced by commit 79540d133ed6 ("net: macb: Fix handling of fixed-link node"). This delay was caused by the call to `mdiobus_register()` in cases where a fixed-link PHY was present. The MDIO bus registration triggered unnecessary PHY address scans, leading to a 20-second delay due to attempts to detect Clause 45 (C45) compatible PHYs, despite no MDIO bus being attached. The commit 79540d133ed6 ("net: macb: Fix handling of fixed-link node") was originally introduced to fix a regression caused by commit 7897b071ac3b4 ("net: macb: convert to phylink"), which caused the driver to misinterpret fixed-link nodes as PHY nodes. This resulted in warnings like: mdio_bus f0028000.ethernet-ffffffff: fixed-link has invalid PHY address mdio_bus f0028000.ethernet-ffffffff: scan phy fixed-link at address 0 ... mdio_bus f0028000.ethernet-ffffffff: scan phy fixed-link at address 31 This patch reworks the logic to avoid registering and allocation of the MDIO bus when: - The device tree contains a fixed-link node. - There is no "mdio" child node in the device tree. If a child node named "mdio" exists, the MDIO bus will be registered to support PHYs attached to the MACB's MDIO bus. Otherwise, with only a fixed-link, the MDIO bus is skipped. Tested on a sama5d35 based system with a ksz8863 switch attached to macb0. Fixes: 79540d133ed6 ("net: macb: Fix handling of fixed-link node") Signed-off-by: Oleksij Rempel Cc: stable@vger.kernel.org Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241013052916.3115142-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cadence/macb_main.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index b940dcd3ace6..8f61731e4554 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -930,9 +930,6 @@ static int macb_mdiobus_register(struct macb *bp) return ret; } - if (of_phy_is_fixed_link(np)) - return mdiobus_register(bp->mii_bus); - /* Only create the PHY from the device tree if at least one PHY is * described. Otherwise scan the entire MDIO bus. We do this to support * old device tree that did not follow the best practices and did not @@ -953,8 +950,19 @@ static int macb_mdiobus_register(struct macb *bp) static int macb_mii_init(struct macb *bp) { + struct device_node *child, *np = bp->pdev->dev.of_node; int err = -ENXIO; + /* With fixed-link, we don't need to register the MDIO bus, + * except if we have a child named "mdio" in the device tree. + * In that case, some devices may be attached to the MACB's MDIO bus. + */ + child = of_get_child_by_name(np, "mdio"); + if (child) + of_node_put(child); + else if (of_phy_is_fixed_link(np)) + return macb_mii_probe(bp->dev); + /* Enable management port */ macb_writel(bp, NCR, MACB_BIT(MPE)); From 20b5342de51bda794791e013b90754774003a515 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 14 Oct 2024 20:19:22 +0800 Subject: [PATCH 016/125] net: microchip: vcap api: Fix memory leaks in vcap_api_encode_rule_test() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 217a3d98d1e9891a8b1438a27dfbc64ddf01f691 upstream. Commit a3c1e45156ad ("net: microchip: vcap: Fix use-after-free error in kunit test") fixed the use-after-free error, but introduced below memory leaks by removing necessary vcap_free_rule(), add it to fix it. unreferenced object 0xffffff80ca58b700 (size 192): comm "kunit_try_catch", pid 1215, jiffies 4294898264 hex dump (first 32 bytes): 00 12 7a 00 05 00 00 00 0a 00 00 00 64 00 00 00 ..z.........d... 00 00 00 00 00 00 00 00 00 04 0b cc 80 ff ff ff ................ backtrace (crc 9c09c3fe): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<0000000040a01b8d>] vcap_alloc_rule+0x3cc/0x9c4 [<000000003fe86110>] vcap_api_encode_rule_test+0x1ac/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0400 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898265 hex dump (first 32 bytes): 80 04 0b cc 80 ff ff ff 18 b7 58 ca 80 ff ff ff ..........X..... 39 00 00 00 02 00 00 00 06 05 04 03 02 01 ff ff 9............... backtrace (crc daf014e9): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000000ff63fd4>] vcap_rule_add_key+0x2cc/0x528 [<00000000dfdb1e81>] vcap_api_encode_rule_test+0x224/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0700 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898265 hex dump (first 32 bytes): 80 07 0b cc 80 ff ff ff 28 b7 58 ca 80 ff ff ff ........(.X..... 3c 00 00 00 00 00 00 00 01 2f 03 b3 ec ff ff ff <......../...... backtrace (crc 8d877792): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000006eadfab7>] vcap_rule_add_action+0x2d0/0x52c [<00000000323475d1>] vcap_api_encode_rule_test+0x4d4/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0900 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898266 hex dump (first 32 bytes): 80 09 0b cc 80 ff ff ff 80 06 0b cc 80 ff ff ff ................ 7d 00 00 00 01 00 00 00 00 00 00 00 ff 00 00 00 }............... backtrace (crc 34181e56): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000000ff63fd4>] vcap_rule_add_key+0x2cc/0x528 [<00000000991e3564>] vcap_val_rule+0xcf0/0x13e8 [<00000000fc9868e5>] vcap_api_encode_rule_test+0x678/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 unreferenced object 0xffffff80cc0b0980 (size 64): comm "kunit_try_catch", pid 1215, jiffies 4294898266 hex dump (first 32 bytes): 18 b7 58 ca 80 ff ff ff 00 09 0b cc 80 ff ff ff ..X............. 67 00 00 00 00 00 00 00 01 01 74 88 c0 ff ff ff g.........t..... backtrace (crc 275fd9be): [<0000000052a0be73>] kmemleak_alloc+0x34/0x40 [<0000000043605459>] __kmalloc_cache_noprof+0x26c/0x2f4 [<000000000ff63fd4>] vcap_rule_add_key+0x2cc/0x528 [<000000001396a1a2>] test_add_def_fields+0xb0/0x100 [<000000006e7621f0>] vcap_val_rule+0xa98/0x13e8 [<00000000fc9868e5>] vcap_api_encode_rule_test+0x678/0x16b0 [<00000000b3595fc4>] kunit_try_run_case+0x13c/0x3ac [<0000000010f5d2bf>] kunit_generic_run_threadfn_adapter+0x80/0xec [<00000000c5d82c9a>] kthread+0x2e8/0x374 [<00000000f4287308>] ret_from_fork+0x10/0x20 ...... Cc: stable@vger.kernel.org Fixes: a3c1e45156ad ("net: microchip: vcap: Fix use-after-free error in kunit test") Reviewed-by: Simon Horman Reviewed-by: Jens Emil Schulz Østergaard Signed-off-by: Jinjie Ruan Link: https://patch.msgid.link/20241014121922.1280583-1-ruanjinjie@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c b/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c index 79276bc3d495..66ef14d95bf6 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c +++ b/drivers/net/ethernet/microchip/vcap/vcap_api_kunit.c @@ -1444,6 +1444,8 @@ static void vcap_api_encode_rule_test(struct kunit *test) ret = vcap_del_rule(&test_vctrl, &test_netdev, id); KUNIT_EXPECT_EQ(test, 0, ret); + + vcap_free_rule(rule); } static void vcap_api_set_rule_counter_test(struct kunit *test) From 8e29f3235181e89a6dd9f77b10e73dc876b69033 Mon Sep 17 00:00:00 2001 From: Nianyao Tang Date: Sat, 6 Apr 2024 02:27:37 +0000 Subject: [PATCH 017/125] irqchip/gic-v3-its: Fix VSYNC referencing an unmapped VPE on GIC v4.1 commit 80e9963fb3b5509dfcabe9652d56bf4b35542055 upstream. As per the GICv4.1 spec (Arm IHI 0069H, 5.3.19): "A VMAPP with {V, Alloc}=={0, x} is self-synchronizing, This means the ITS command queue does not show the command as consumed until all of its effects are completed." Furthermore, VSYNC is allowed to deliver an SError when referencing a non existent VPE. By these definitions, a VMAPP followed by a VSYNC is a bug, as the later references a VPE that has been unmapped by the former. Fix it by eliding the VSYNC in this scenario. Fixes: 64edfaa9a234 ("irqchip/gic-v4.1: Implement the v4.1 flavour of VMAPP") Signed-off-by: Nianyao Tang Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20240406022737.3898763-1-tangnianyao@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-gic-v3-its.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 350abbb36e04..f04a2c3a45e1 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -786,6 +786,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, struct its_cmd_block *cmd, struct its_cmd_desc *desc) { + struct its_vpe *vpe = valid_vpe(its, desc->its_vmapp_cmd.vpe); unsigned long vpt_addr, vconf_addr; u64 target; bool alloc; @@ -798,6 +799,11 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, if (is_v4_1(its)) { alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); its_encode_alloc(cmd, alloc); + /* + * Unmapping a VPE is self-synchronizing on GICv4.1, + * no need to issue a VSYNC. + */ + vpe = NULL; } goto out; @@ -832,7 +838,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, out: its_fixup_cmd(cmd); - return valid_vpe(its, desc->its_vmapp_cmd.vpe); + return vpe; } static struct its_vpe *its_build_vmapti_cmd(struct its_node *its, From a9af9d5fb01bb180a386d18ff6381f54a735eac3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 4 Oct 2024 15:03:49 +0900 Subject: [PATCH 018/125] fat: fix uninitialized variable commit 963a7f4d3b90ee195b895ca06b95757fcba02d1a upstream. syszbot produced this with a corrupted fs image. In theory, however an IO error would trigger this also. This affects just an error report, so should not be a serious error. Link: https://lkml.kernel.org/r/87r08wjsnh.fsf@mail.parknet.co.jp Link: https://lkml.kernel.org/r/66ff2c95.050a0220.49194.03e9.GAE@google.com Signed-off-by: OGAWA Hirofumi Reported-by: syzbot+ef0d7bc412553291aa86@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/fat/namei_vfat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c4d00999a433..3cf22a6727f1 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1037,7 +1037,7 @@ error_inode: if (corrupt < 0) { fat_fs_error(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", - __func__, sinfo.i_pos); + __func__, new_i_pos); } goto out; } From 8f5fa1c677df3c6ef05673eaadd51650b644be7a Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Thu, 3 Oct 2024 21:17:10 +0000 Subject: [PATCH 019/125] selftests/mm: replace atomic_bool with pthread_barrier_t commit e61ef21e27e8deed8c474e9f47f4aa7bc37e138c upstream. Patch series "selftests/mm: fix deadlock after pthread_create". On Android arm, pthread_create followed by a fork caused a deadlock in the case where the fork required work to be completed by the created thread. Update the synchronization primitive to use pthread_barrier instead of atomic_bool. Apply the same fix to the wp-fork-with-event test. This patch (of 2): Swap synchronization primitive with pthread_barrier, so that stdatomic.h does not need to be included. The synchronization is needed on Android ARM64; we see a deadlock with pthread_create when the parent thread races forward before the child has a chance to start doing work. Link: https://lkml.kernel.org/r/20241003211716.371786-1-edliaw@google.com Link: https://lkml.kernel.org/r/20241003211716.371786-2-edliaw@google.com Fixes: cff294582798 ("selftests/mm: extend and rename uffd pagemap test") Signed-off-by: Edward Liaw Cc: Lokesh Gidra Cc: Peter Xu Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/mm/uffd-common.c | 5 +++-- tools/testing/selftests/mm/uffd-common.h | 3 +-- tools/testing/selftests/mm/uffd-unit-tests.c | 14 ++++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index ba6777cdf423..3bdae35e0add 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -17,7 +17,7 @@ bool map_shared; bool test_uffdio_wp = true; unsigned long long *count_verify; uffd_test_ops_t *uffd_test_ops; -atomic_bool ready_for_fork; +pthread_barrier_t ready_for_fork; static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) { @@ -508,7 +508,8 @@ void *uffd_poll_thread(void *arg) pollfd[1].fd = pipefd[cpu*2]; pollfd[1].events = POLLIN; - ready_for_fork = true; + /* Ready for parent thread to fork */ + pthread_barrier_wait(&ready_for_fork); for (;;) { ret = poll(pollfd, 2, -1); diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 2832669bd9fd..2d78ae0daf06 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -33,7 +33,6 @@ #include #include #include -#include #include "../kselftest.h" #include "vm_util.h" @@ -99,7 +98,7 @@ extern bool map_shared; extern bool test_uffdio_wp; extern unsigned long long *count_verify; extern volatile bool test_uffdio_copy_eexist; -extern atomic_bool ready_for_fork; +extern pthread_barrier_t ready_for_fork; extern uffd_test_ops_t anon_uffd_test_ops; extern uffd_test_ops_t shmem_uffd_test_ops; diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 92d51768b7be..d7bc740245ac 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -770,7 +770,7 @@ static void uffd_sigbus_test_common(bool wp) char c; struct uffd_args args = { 0 }; - ready_for_fork = false; + pthread_barrier_init(&ready_for_fork, NULL, 2); fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); @@ -787,8 +787,9 @@ static void uffd_sigbus_test_common(bool wp) if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); - while (!ready_for_fork) - ; /* Wait for the poll_thread to start executing before forking */ + /* Wait for child thread to start before forking */ + pthread_barrier_wait(&ready_for_fork); + pthread_barrier_destroy(&ready_for_fork); pid = fork(); if (pid < 0) @@ -829,7 +830,7 @@ static void uffd_events_test_common(bool wp) char c; struct uffd_args args = { 0 }; - ready_for_fork = false; + pthread_barrier_init(&ready_for_fork, NULL, 2); fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); if (uffd_register(uffd, area_dst, nr_pages * page_size, @@ -840,8 +841,9 @@ static void uffd_events_test_common(bool wp) if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); - while (!ready_for_fork) - ; /* Wait for the poll_thread to start executing before forking */ + /* Wait for child thread to start before forking */ + pthread_barrier_wait(&ready_for_fork); + pthread_barrier_destroy(&ready_for_fork); pid = fork(); if (pid < 0) From 6b91fd65a117b6e9f2384e139a9bab894f37ae06 Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Thu, 3 Oct 2024 21:17:11 +0000 Subject: [PATCH 020/125] selftests/mm: fix deadlock for fork after pthread_create on ARM commit e142cc87ac4ec618f2ccf5f68aedcd6e28a59d9d upstream. On Android with arm, there is some synchronization needed to avoid a deadlock when forking after pthread_create. Link: https://lkml.kernel.org/r/20241003211716.371786-3-edliaw@google.com Fixes: cff294582798 ("selftests/mm: extend and rename uffd pagemap test") Signed-off-by: Edward Liaw Cc: Lokesh Gidra Cc: Peter Xu Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/mm/uffd-unit-tests.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index d7bc740245ac..42cdba544f81 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -237,6 +237,9 @@ static void *fork_event_consumer(void *data) fork_event_args *args = data; struct uffd_msg msg = { 0 }; + /* Ready for parent thread to fork */ + pthread_barrier_wait(&ready_for_fork); + /* Read until a full msg received */ while (uffd_read_msg(args->parent_uffd, &msg)); @@ -304,8 +307,12 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) /* Prepare a thread to resolve EVENT_FORK */ if (with_event) { + pthread_barrier_init(&ready_for_fork, NULL, 2); if (pthread_create(&thread, NULL, fork_event_consumer, &args)) err("pthread_create()"); + /* Wait for child thread to start before forking */ + pthread_barrier_wait(&ready_for_fork); + pthread_barrier_destroy(&ready_for_fork); } child = fork(); From 17396e32f975130b3e6251f024c8807d192e4c3e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 7 Oct 2024 23:42:04 +0200 Subject: [PATCH 021/125] mm/mremap: fix move_normal_pmd/retract_page_tables race commit 6fa1066fc5d00cb9f1b0e83b7ff6ef98d26ba2aa upstream. In mremap(), move_page_tables() looks at the type of the PMD entry and the specified address range to figure out by which method the next chunk of page table entries should be moved. At that point, the mmap_lock is held in write mode, but no rmap locks are held yet. For PMD entries that point to page tables and are fully covered by the source address range, move_pgt_entry(NORMAL_PMD, ...) is called, which first takes rmap locks, then does move_normal_pmd(). move_normal_pmd() takes the necessary page table locks at source and destination, then moves an entire page table from the source to the destination. The problem is: The rmap locks, which protect against concurrent page table removal by retract_page_tables() in the THP code, are only taken after the PMD entry has been read and it has been decided how to move it. So we can race as follows (with two processes that have mappings of the same tmpfs file that is stored on a tmpfs mount with huge=advise); note that process A accesses page tables through the MM while process B does it through the file rmap: process A process B ========= ========= mremap mremap_to move_vma move_page_tables get_old_pmd alloc_new_pmd *** PREEMPT *** madvise(MADV_COLLAPSE) do_madvise madvise_walk_vmas madvise_vma_behavior madvise_collapse hpage_collapse_scan_file collapse_file retract_page_tables i_mmap_lock_read(mapping) pmdp_collapse_flush i_mmap_unlock_read(mapping) move_pgt_entry(NORMAL_PMD, ...) take_rmap_locks move_normal_pmd drop_rmap_locks When this happens, move_normal_pmd() can end up creating bogus PMD entries in the line `pmd_populate(mm, new_pmd, pmd_pgtable(pmd))`. The effect depends on arch-specific and machine-specific details; on x86, you can end up with physical page 0 mapped as a page table, which is likely exploitable for user->kernel privilege escalation. Fix the race by letting process B recheck that the PMD still points to a page table after the rmap locks have been taken. Otherwise, we bail and let the caller fall back to the PTE-level copying path, which will then bail immediately at the pmd_none() check. Bug reachability: Reaching this bug requires that you can create shmem/file THP mappings - anonymous THP uses different code that doesn't zap stuff under rmap locks. File THP is gated on an experimental config flag (CONFIG_READ_ONLY_THP_FOR_FS), so on normal distro kernels you need shmem THP to hit this bug. As far as I know, getting shmem THP normally requires that you can mount your own tmpfs with the right mount flags, which would require creating your own user+mount namespace; though I don't know if some distros maybe enable shmem THP by default or something like that. Bug impact: This issue can likely be used for user->kernel privilege escalation when it is reachable. Link: https://lkml.kernel.org/r/20241007-move_normal_pmd-vs-collapse-fix-2-v1-1-5ead9631f2ea@google.com Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock") Signed-off-by: Jann Horn Signed-off-by: David Hildenbrand Co-developed-by: David Hildenbrand Closes: https://project-zero.issues.chromium.org/371047675 Acked-by: Qi Zheng Reviewed-by: Lorenzo Stoakes Cc: Hugh Dickins Cc: Joel Fernandes Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/mremap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 382e81c33fc4..df71010baabe 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, { spinlock_t *old_ptl, *new_ptl; struct mm_struct *mm = vma->vm_mm; + bool res = false; pmd_t pmd; if (!arch_supports_page_table_move()) @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); - /* Clear the pmd */ pmd = *old_pmd; + + /* Racing with collapse? */ + if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd))) + goto out_unlock; + /* Clear the pmd */ pmd_clear(old_pmd); + res = true; VM_BUG_ON(!pmd_none(*new_pmd)); pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); +out_unlock: if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); - return true; + return res; } #else static inline bool move_normal_pmd(struct vm_area_struct *vma, From a0035fc55554773b31dfa59b44b96c6588c4b41b Mon Sep 17 00:00:00 2001 From: Wei Xu Date: Mon, 14 Oct 2024 22:12:11 +0000 Subject: [PATCH 022/125] mm/mglru: only clear kswapd_failures if reclaimable commit b130ba4a6259f6b64d8af15e9e7ab1e912bcb7ad upstream. lru_gen_shrink_node() unconditionally clears kswapd_failures, which can prevent kswapd from sleeping and cause 100% kswapd cpu usage even when kswapd repeatedly fails to make progress in reclaim. Only clear kswap_failures in lru_gen_shrink_node() if reclaim makes some progress, similar to shrink_node(). I happened to run into this problem in one of my tests recently. It requires a combination of several conditions: The allocator needs to allocate a right amount of pages such that it can wake up kswapd without itself being OOM killed; there is no memory for kswapd to reclaim (My test disables swap and cleans page cache first); no other process frees enough memory at the same time. Link: https://lkml.kernel.org/r/20241014221211.832591-1-weixugc@google.com Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") Signed-off-by: Wei Xu Cc: Axel Rasmussen Cc: Brian Geffon Cc: Jan Alexander Steffens Cc: Suleiman Souhlal Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 81533bed0b46..3c91b86d59e9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5603,8 +5603,8 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: - /* kswapd should never fail */ - pgdat->kswapd_failures = 0; + if (sc->nr_reclaimed > reclaimed) + pgdat->kswapd_failures = 0; } /****************************************************************************** From bed2b9037806c62166a0ef9a559a1e7e3e1275b8 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 15 Oct 2024 09:45:21 +0800 Subject: [PATCH 023/125] mm/swapfile: skip HugeTLB pages for unuse_vma commit 7528c4fb1237512ee18049f852f014eba80bbe8d upstream. I got a bad pud error and lost a 1GB HugeTLB when calling swapoff. The problem can be reproduced by the following steps: 1. Allocate an anonymous 1GB HugeTLB and some other anonymous memory. 2. Swapout the above anonymous memory. 3. run swapoff and we will get a bad pud error in kernel message: mm/pgtable-generic.c:42: bad pud 00000000743d215d(84000001400000e7) We can tell that pud_clear_bad is called by pud_none_or_clear_bad in unuse_pud_range() by ftrace. And therefore the HugeTLB pages will never be freed because we lost it from page table. We can skip HugeTLB pages for unuse_vma to fix it. Link: https://lkml.kernel.org/r/20241015014521.570237-1-liushixin2@huawei.com Fixes: 0fe6e20b9c4c ("hugetlb, rmap: add reverse mapping for hugepage") Signed-off-by: Liu Shixin Acked-by: Muchun Song Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index eada1351753e..c856d6bb2daf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2003,7 +2003,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) mmap_read_lock(mm); for_each_vma(vmi, vma) { - if (vma->anon_vma) { + if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); if (ret) break; From f43bd357fde02ef5ed2b06d387a66d0b1fa9df8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Oct 2024 17:11:06 -0700 Subject: [PATCH 024/125] xfs: fix error returns from xfs_bmapi_write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6773da870ab89123d1b513da63ed59e32a29cb77 upstream. [backport: resolve conflicts due to missing quota_repair.c, rtbitmap_repair.c, xfs_bmap_mark_sick()] xfs_bmapi_write can return 0 without actually returning a mapping in mval in two different cases: 1) when there is absolutely no space available to do an allocation 2) when converting delalloc space, and the allocation is so small that it only covers parts of the delalloc extent before the range requested by the caller Callers at best can handle one of these cases, but in many cases can't cope with either one. Switch xfs_bmapi_write to always return a mapping or return an error code instead. For case 1) above ENOSPC is the obvious choice which is very much what the callers expect anyway. For case 2) there is no really good error code, so pick a funky one from the SysV streams portfolio. This fixes the reproducer here: https://lore.kernel.org/linux-xfs/CAEJPjCvT3Uag-pMTYuigEjWZHn1sGMZ0GCjVVCv29tNHK76Cgg@mail.gmail.com0/ which uses reserved blocks to create file systems that are gravely out of space and thus cause at least xfs_file_alloc_space to hang and trigger the lack of ENOSPC handling in xfs_dquot_disk_alloc. Note that this patch does not actually make any caller but xfs_alloc_file_space deal intelligently with case 2) above. Signed-off-by: Christoph Hellwig Reported-by: 刘通 Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_attr_remote.c | 1 - fs/xfs/libxfs/xfs_bmap.c | 46 ++++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_da_btree.c | 20 ++++---------- fs/xfs/xfs_bmap_util.c | 35 ++++++++++++------------- fs/xfs/xfs_dquot.c | 1 - fs/xfs/xfs_iomap.c | 8 ------ fs/xfs/xfs_reflink.c | 14 ---------- fs/xfs/xfs_rtalloc.c | 2 -- 8 files changed, 59 insertions(+), 68 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d440393b40eb..54de405cbab5 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -619,7 +619,6 @@ xfs_attr_rmtval_set_blk( if (error) return error; - ASSERT(nmap == 1); ASSERT((map->br_startblock != DELAYSTARTBLOCK) && (map->br_startblock != HOLESTARTBLOCK)); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 48f0d0698ec4..97f575e21f86 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4128,8 +4128,10 @@ xfs_bmapi_allocate( } else { error = xfs_bmap_alloc_userdata(bma); } - if (error || bma->blkno == NULLFSBLOCK) + if (error) return error; + if (bma->blkno == NULLFSBLOCK) + return -ENOSPC; if (bma->flags & XFS_BMAPI_ZERO) { error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); @@ -4309,6 +4311,15 @@ xfs_bmapi_finish( * extent state if necessary. Details behaviour is controlled by the flags * parameter. Only allocates blocks from a single allocation group, to avoid * locking problems. + * + * Returns 0 on success and places the extent mappings in mval. nmaps is used + * as an input/output parameter where the caller specifies the maximum number + * of mappings that may be returned and xfs_bmapi_write passes back the number + * of mappings (including existing mappings) it found. + * + * Returns a negative error code on failure, including -ENOSPC when it could not + * allocate any blocks and -ENOSR when it did allocate blocks to convert a + * delalloc range, but those blocks were before the passed in range. */ int xfs_bmapi_write( @@ -4436,10 +4447,16 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(bma.length > 0); error = xfs_bmapi_allocate(&bma); - if (error) + if (error) { + /* + * If we already allocated space in a previous + * iteration return what we go so far when + * running out of space. + */ + if (error == -ENOSPC && bma.nallocs) + break; goto error0; - if (bma.blkno == NULLFSBLOCK) - break; + } /* * If this is a CoW allocation, record the data in @@ -4477,7 +4494,6 @@ xfs_bmapi_write( if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) eof = true; } - *nmap = n; error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, whichfork); @@ -4488,7 +4504,22 @@ xfs_bmapi_write( ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + orig_nmap, n); + + /* + * When converting delayed allocations, xfs_bmapi_allocate ignores + * the passed in bno and always converts from the start of the found + * delalloc extent. + * + * To avoid a successful return with *nmap set to 0, return the magic + * -ENOSR error code for this particular case so that the caller can + * handle it. + */ + if (!n) { + ASSERT(bma.nallocs >= *nmap); + return -ENOSR; + } + *nmap = n; return 0; error0: xfs_bmapi_finish(&bma, whichfork, error); @@ -4595,9 +4626,6 @@ xfs_bmapi_convert_delalloc( if (error) goto out_finish; - error = -ENOSPC; - if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) - goto out_finish; error = -EFSCORRUPTED; if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) goto out_finish; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 282c7cf032f4..12e3cca804b7 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2158,8 +2158,8 @@ xfs_da_grow_inode_int( struct xfs_inode *dp = args->dp; int w = args->whichfork; xfs_rfsblock_t nblks = dp->i_nblocks; - struct xfs_bmbt_irec map, *mapp; - int nmap, error, got, i, mapi; + struct xfs_bmbt_irec map, *mapp = ↦ + int nmap, error, got, i, mapi = 1; /* * Find a spot in the file space to put the new block. @@ -2175,14 +2175,7 @@ xfs_da_grow_inode_int( error = xfs_bmapi_write(tp, dp, *bno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->total, &map, &nmap); - if (error) - return error; - - ASSERT(nmap <= 1); - if (nmap == 1) { - mapp = ↦ - mapi = 1; - } else if (nmap == 0 && count > 1) { + if (error == -ENOSPC && count > 1) { xfs_fileoff_t b; int c; @@ -2199,16 +2192,13 @@ xfs_da_grow_inode_int( args->total, &mapp[mapi], &nmap); if (error) goto out_free_map; - if (nmap < 1) - break; mapi += nmap; b = mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount; } - } else { - mapi = 0; - mapp = NULL; } + if (error) + goto out_free_map; /* * Count the blocks we got, make sure it matches the total. diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ad4aba5002c1..4a7d1a1b67a3 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -868,33 +868,32 @@ xfs_alloc_file_space( if (error) goto error; + /* + * If the allocator cannot find a single free extent large + * enough to cover the start block of the requested range, + * xfs_bmapi_write will return -ENOSR. + * + * In that case we simply need to keep looping with the same + * startoffset_fsb so that one of the following allocations + * will eventually reach the requested range. + */ error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, &nimaps); - if (error) - goto error; + if (error) { + if (error != -ENOSR) + goto error; + error = 0; + } else { + startoffset_fsb += imapp->br_blockcount; + allocatesize_fsb -= imapp->br_blockcount; + } ip->i_diflags |= XFS_DIFLAG_PREALLOC; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - break; - - /* - * If the allocator cannot find a single free extent large - * enough to cover the start block of the requested range, - * xfs_bmapi_write will return 0 but leave *nimaps set to 0. - * - * In that case we simply need to keep looping with the same - * startoffset_fsb so that one of the following allocations - * will eventually reach the requested range. - */ - if (nimaps) { - startoffset_fsb += imapp->br_blockcount; - allocatesize_fsb -= imapp->br_blockcount; - } } return error; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a013b87ab8d5..9b67f05d92a1 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -333,7 +333,6 @@ xfs_dquot_disk_alloc( goto err_cancel; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 055cdec2e9ad..6e5ace7c9bc9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -317,14 +317,6 @@ xfs_iomap_write_direct( if (error) goto out_unlock; - /* - * Copy any maps to caller's array and return any error. - */ - if (nimaps == 0) { - error = -ENOSPC; - goto out_unlock; - } - if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) error = xfs_alert_fsblock_zero(ip, imap); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e5b62dc28466..b8416762bb60 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -429,13 +429,6 @@ xfs_reflink_fill_cow_hole( if (error) return error; - /* - * Allocation succeeded but the requested range was not even partially - * satisfied? Bail out! - */ - if (nimaps == 0) - return -ENOSPC; - convert: return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); @@ -498,13 +491,6 @@ xfs_reflink_fill_delalloc( error = xfs_trans_commit(tp); if (error) return error; - - /* - * Allocation succeeded but the requested range was not even - * partially satisfied? Bail out! - */ - if (nimaps == 0) - return -ENOSPC; } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 4bec890d93d2..608db1ab88a4 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -840,8 +840,6 @@ xfs_growfs_rt_alloc( nmap = 1; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, 0, &map, &nmap); - if (!error && nmap < 1) - error = -ENOSPC; if (error) goto out_trans_cancel; /* From 4bcef72d96b50757df22b05b89bb4a7535919d88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Oct 2024 17:11:07 -0700 Subject: [PATCH 025/125] xfs: fix xfs_bmap_add_extent_delay_real for partial conversions commit d69bee6a35d3c5e4873b9e164dd1a9711351a97c upstream. [backport: resolve conflict due to xfs_mod_freecounter refactor] xfs_bmap_add_extent_delay_real takes parts or all of a delalloc extent and converts them to a real extent. It is written to deal with any potential overlap of the to be converted range with the delalloc extent, but it turns out that currently only converting the entire extents, or a part starting at the beginning is actually exercised, as the only caller always tries to convert the entire delalloc extent, and either succeeds or at least progresses partially from the start. If it only converts a tiny part of a delalloc extent, the indirect block calculation for the new delalloc extent (da_new) might be equivalent to that of the existing delalloc extent (da_old). If this extent conversion now requires allocating an indirect block that gets accounted into da_new, leading to the assert that da_new must be smaller or equal to da_new unless we split the extent to trigger. Except for the assert that case is actually handled by just trying to allocate more space, as that already handled for the split case (which currently can't be reached at all), so just reusing it should be fine. Except that without dipping into the reserved block pool that would make it a bit too easy to trigger a fs shutdown due to ENOSPC. So in addition to adjusting the assert, also dip into the reserved block pool. Note that I could only reproduce the assert with a change to only convert the actually asked range instead of the full delalloc extent from xfs_bmapi_write. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 97f575e21f86..18429b7f7811 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1549,6 +1549,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -1578,6 +1579,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1611,6 +1613,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -1643,6 +1646,7 @@ xfs_bmap_add_extent_delay_real( goto done; } } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -1680,6 +1684,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; } + ASSERT(da_new <= da_old); break; case BMAP_LEFT_FILLING: @@ -1767,6 +1772,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); xfs_iext_next(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); + ASSERT(da_new <= da_old); break; case BMAP_RIGHT_FILLING: @@ -1814,6 +1820,7 @@ xfs_bmap_add_extent_delay_real( PREV.br_blockcount = temp; xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); xfs_iext_next(ifp, &bma->icur); + ASSERT(da_new <= da_old); break; case 0: @@ -1934,11 +1941,9 @@ xfs_bmap_add_extent_delay_real( } /* adjust for changes in reserved delayed indirect blocks */ - if (da_new != da_old) { - ASSERT(state == 0 || da_new < da_old); + if (da_new != da_old) error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), - false); - } + true); xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: From c299188b443ac5b4dc5a8da0574881eb14bb63cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Oct 2024 17:11:08 -0700 Subject: [PATCH 026/125] xfs: remove a racy if_bytes check in xfs_reflink_end_cow_extent commit 86de848403abda05bf9c16dcdb6bef65a8d88c41 upstream. Accessing if_bytes without the ilock is racy. Remove the initial if_bytes == 0 check in xfs_reflink_end_cow_extent and let ext_iext_lookup_extent fail for this case after we've taken the ilock. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_reflink.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index b8416762bb60..3431d0d8b6f3 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -716,12 +716,6 @@ xfs_reflink_end_cow_extent( int nmaps; int error; - /* No COW extents? That's easy! */ - if (ifp->if_bytes == 0) { - *offset_fsb = end_fsb; - return 0; - } - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, XFS_TRANS_RESERVE, &tp); From c13c21f77824026f8bb5ceb14c5dea27c2678ed6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:09 -0700 Subject: [PATCH 027/125] xfs: require XFS_SB_FEAT_INCOMPAT_LOG_XATTRS for attr log intent item recovery commit 8ef1d96a985e4dc07ffbd71bd7fc5604a80cc644 upstream. The XFS_SB_FEAT_INCOMPAT_LOG_XATTRS feature bit protects a filesystem from old kernels that do not know how to recover extended attribute log intent items. Make this check mandatory instead of a debugging assert. Fixes: fd920008784ea ("xfs: Set up infrastructure for log attribute replay") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_attr_item.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 82775e9537df..ebf656aaf301 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -510,6 +510,9 @@ xfs_attri_validate( unsigned int op = attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; + if (!xfs_sb_version_haslogxattrs(&mp->m_sb)) + return false; + if (attrp->__pad != 0) return false; @@ -602,8 +605,6 @@ xfs_attri_item_recover( args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | XFS_DA_OP_LOGGED; - ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); - switch (attr->xattri_op_flags) { case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: From 0934046e33929df014caa494e422b3c33c517acd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:10 -0700 Subject: [PATCH 028/125] xfs: check opcode and iovec count match in xlog_recover_attri_commit_pass2 commit ad206ae50eca62836c5460ab5bbf2a6c59a268e7 upstream. Check that the number of recovered log iovecs is what is expected for the xattri opcode is expecting. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_attr_item.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index ebf656aaf301..064cb4fe5df4 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -719,6 +719,7 @@ xlog_recover_attri_commit_pass2( const void *attr_value = NULL; const void *attr_name; size_t len; + unsigned int op; attri_formatp = item->ri_buf[0].i_addr; attr_name = item->ri_buf[1].i_addr; @@ -737,6 +738,32 @@ xlog_recover_attri_commit_pass2( return -EFSCORRUPTED; } + /* Check the number of log iovecs makes sense for the op code. */ + op = attri_formatp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; + switch (op) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + /* Log item, attr name, attr value */ + if (item->ri_total != 3) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + /* Log item, attr name */ + if (item->ri_total != 2) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + break; + default: + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + /* Validate the attr name */ if (item->ri_buf[1].i_len != xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { From cad051826d8308a89e3468cef3f2d75f2150959c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:11 -0700 Subject: [PATCH 029/125] xfs: fix missing check for invalid attr flags commit f660ec8eaeb50d0317c29601aacabdb15e5f2203 upstream. [backport: fix build errors in xchk_xattr_listent] The xattr scrubber doesn't check for undefined flags in shortform attr entries. Therefore, define a mask XFS_ATTR_ONDISK_MASK that has all possible XFS_ATTR_* flags in it, and use that to check for unknown bits in xchk_xattr_actor. Refactor the check in the dabtree scanner function to use the new mask as well. The redundant checks need to be in place because the dabtree check examines the hash mappings and therefore needs to decode the attr leaf entries to compute the namehash. This happens before the walk of the xattr entries themselves. Fixes: ae0506eba78fd ("xfs: check used space of shortform xattr structures") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_da_format.h | 5 +++++ fs/xfs/scrub/attr.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index f9015f88eca7..ebcb9066398f 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -703,8 +703,13 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) #define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) + #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \ + XFS_ATTR_LOCAL | \ + XFS_ATTR_INCOMPLETE) + /* * Alignment for namelist and valuelist entries (since they are mixed * there can be only one alignment value) diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 6c16d9530cca..990f4bf1c197 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -182,6 +182,11 @@ xchk_xattr_listent( return; } + if (flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + goto fail_xref; + } + if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ xchk_ino_set_preen(sx->sc, context->dp->i_ino); @@ -463,7 +468,6 @@ xchk_xattr_rec( xfs_dahash_t hash; int nameidx; int hdrsize; - unsigned int badflags; int error; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); @@ -493,10 +497,11 @@ xchk_xattr_rec( /* Retrieve the entry and check it. */ hash = be32_to_cpu(ent->hashval); - badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | - XFS_ATTR_INCOMPLETE); - if ((ent->flags & badflags) != 0) + if (ent->flags & ~XFS_ATTR_ONDISK_MASK) { xchk_da_set_corrupt(ds, level); + return 0; + } + if (ent->flags & XFS_ATTR_LOCAL) { lentry = (struct xfs_attr_leaf_name_local *) (((char *)bp->b_addr) + nameidx); From db460c26f0b032274d9d7b19b9f6b50109cf237c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:12 -0700 Subject: [PATCH 030/125] xfs: check shortform attr entry flags specifically commit 309dc9cbbb4379241bcc9b5a6a42c04279a0e5a7 upstream. While reviewing flag checking in the attr scrub functions, we noticed that the shortform attr scanner didn't catch entries that have the LOCAL or INCOMPLETE bits set. Neither of these flags can ever be set on a shortform attr, so we need to check this narrower set of valid flags. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/scrub/attr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 990f4bf1c197..419968d5f5cb 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -566,6 +566,15 @@ xchk_xattr_check_sf( break; } + /* + * Shortform entries do not set LOCAL or INCOMPLETE, so the + * only valid flag bits here are for namespaces. + */ + if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + if (!xchk_xattr_set_map(sc, ab->usedmap, (char *)sfe - (char *)sf, sizeof(struct xfs_attr_sf_entry))) { From 9716cdcc2f9eefdd39991c2cee6cafad06d1ce7e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:13 -0700 Subject: [PATCH 031/125] xfs: validate recovered name buffers when recovering xattr items commit 1c7f09d210aba2f2bb206e2e8c97c9f11a3fd880 upstream. Strengthen the xattri log item recovery code by checking that we actually have the required name and newname buffers for whatever operation we're replaying. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_attr_item.c | 58 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 064cb4fe5df4..141631b0d64c 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -719,22 +719,20 @@ xlog_recover_attri_commit_pass2( const void *attr_value = NULL; const void *attr_name; size_t len; - unsigned int op; - - attri_formatp = item->ri_buf[0].i_addr; - attr_name = item->ri_buf[1].i_addr; + unsigned int op, i = 0; /* Validate xfs_attri_log_format before the large memory allocation */ len = sizeof(struct xfs_attri_log_format); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[i].i_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } + attri_formatp = item->ri_buf[i].i_addr; if (!xfs_attri_validate(mp, attri_formatp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } @@ -763,31 +761,69 @@ xlog_recover_attri_commit_pass2( attri_formatp, len); return -EFSCORRUPTED; } + i++; /* Validate the attr name */ - if (item->ri_buf[1].i_len != + if (item->ri_buf[i].i_len != xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + attri_formatp, len); return -EFSCORRUPTED; } + attr_name = item->ri_buf[i].i_addr; if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[1].i_addr, item->ri_buf[1].i_len); + attri_formatp, len); return -EFSCORRUPTED; } + i++; /* Validate the attr value, if present */ if (attri_formatp->alfi_value_len != 0) { - if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + if (item->ri_buf[i].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } - attr_value = item->ri_buf[2].i_addr; + attr_value = item->ri_buf[i].i_addr; + i++; + } + + /* + * Make sure we got the correct number of buffers for the operation + * that we just loaded. + */ + if (i != item->ri_total) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + + switch (op) { + case XFS_ATTRI_OP_FLAGS_REMOVE: + /* Regular remove operations operate only on names. */ + if (attr_value != NULL || attri_formatp->alfi_value_len != 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + fallthrough; + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + /* + * Regular xattr set/remove/replace operations require a name + * and do not take a newname. Values are optional for set and + * replace. + */ + if (attr_name == NULL || attri_formatp->alfi_name_len == 0) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; + } + break; } /* From 5689d2345a01d7a03bfaa385b7b3d79ef08d0a76 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:14 -0700 Subject: [PATCH 032/125] xfs: enforce one namespace per attribute commit ea0b3e814741fb64e7785b564ea619578058e0b0 upstream. [backport: fix conflicts due to various xattr refactoring] Create a standardized helper function to enforce one namespace bit per extended attribute, and refactor all the open-coded hweight logic. This function is not a static inline to avoid porting hassles in userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_attr.c | 11 +++++++++++ fs/xfs/libxfs/xfs_attr.h | 4 +++- fs/xfs/libxfs/xfs_attr_leaf.c | 6 +++++- fs/xfs/scrub/attr.c | 12 +++++------- fs/xfs/xfs_attr_item.c | 10 ++++++++-- fs/xfs/xfs_attr_list.c | 11 +++++++---- 6 files changed, 39 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 32d350e97e0f..33edf047e0ad 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1565,12 +1565,23 @@ out_release: return error; } +/* Enforce that there is at most one namespace bit per attr. */ +inline bool xfs_attr_check_namespace(unsigned int attr_flags) +{ + return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2; +} + /* Returns true if the attribute entry name is valid. */ bool xfs_attr_namecheck( + unsigned int attr_flags, const void *name, size_t length) { + /* Only one namespace bit allowed. */ + if (!xfs_attr_check_namespace(attr_flags)) + return false; + /* * MAXNAMELEN includes the trailing null, but (name/length) leave it * out, so use >= for the length check. diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 81be9b3e4004..c877f05e3cd1 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -547,7 +547,9 @@ int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_iter(struct xfs_attr_intent *attr); int xfs_attr_remove_iter(struct xfs_attr_intent *attr); -bool xfs_attr_namecheck(const void *name, size_t length); +bool xfs_attr_check_namespace(unsigned int attr_flags); +bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, + size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, unsigned int *total); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2580ae47209a..51ff44068675 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -984,6 +984,10 @@ xfs_attr_shortform_to_leaf( nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; + if (!xfs_attr_check_namespace(sfe->flags)) { + error = -EFSCORRUPTED; + goto out; + } error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); @@ -1105,7 +1109,7 @@ xfs_attr_shortform_verify( * one namespace flag per xattr, so we can just count the * bits (i.e. hweight) here. */ - if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) + if (!xfs_attr_check_namespace(sfep->flags)) return __this_address; sfep = next_sfep; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 419968d5f5cb..7cb0af5e34b1 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -193,14 +193,8 @@ xchk_xattr_listent( return; } - /* Only one namespace bit allowed. */ - if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { - xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - goto fail_xref; - } - /* Does this name make sense? */ - if (!xfs_attr_namecheck(name, namelen)) { + if (!xfs_attr_namecheck(flags, name, namelen)) { xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); goto fail_xref; } @@ -501,6 +495,10 @@ xchk_xattr_rec( xchk_da_set_corrupt(ds, level); return 0; } + if (!xfs_attr_check_namespace(ent->flags)) { + xchk_da_set_corrupt(ds, level); + return 0; + } if (ent->flags & XFS_ATTR_LOCAL) { lentry = (struct xfs_attr_leaf_name_local *) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 141631b0d64c..df86c9c09720 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -522,6 +522,10 @@ xfs_attri_validate( if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) return false; + if (!xfs_attr_check_namespace(attrp->alfi_attr_filter & + XFS_ATTR_NSP_ONDISK_MASK)) + return false; + /* alfi_op_flags should be either a set or remove */ switch (op) { case XFS_ATTRI_OP_FLAGS_SET: @@ -572,7 +576,8 @@ xfs_attri_item_recover( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr, + nv->name.i_len)) return -EFSCORRUPTED; error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); @@ -772,7 +777,8 @@ xlog_recover_attri_commit_pass2( } attr_name = item->ri_buf[i].i_addr; - if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { + if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, attr_name, + attri_formatp->alfi_name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 99bbbe1a0e44..9ee1d7d2ba76 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -82,7 +82,8 @@ xfs_attr_shortform_list( (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(sfe->nameval, + !xfs_attr_namecheck(sfe->flags, + sfe->nameval, sfe->namelen))) return -EFSCORRUPTED; context->put_listent(context, @@ -120,7 +121,8 @@ xfs_attr_shortform_list( for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { + ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)) || + !xfs_attr_check_namespace(sfe->flags))) { XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, @@ -174,7 +176,7 @@ xfs_attr_shortform_list( cursor->offset = 0; } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(sbp->name, + !xfs_attr_namecheck(sbp->flags, sbp->name, sbp->namelen))) { error = -EFSCORRUPTED; goto out; @@ -465,7 +467,8 @@ xfs_attr3_leaf_list_int( } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(name, namelen))) + !xfs_attr_namecheck(entry->flags, name, + namelen))) return -EFSCORRUPTED; context->put_listent(context, entry->flags, name, namelen, valuelen); From 20adb1e2f069280507967b18a17a5f04c5766b82 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:15 -0700 Subject: [PATCH 033/125] xfs: revert commit 44af6c7e59b12 commit 2a009397eb5ae178670cbd7101e9635cf6412b35 upstream. [backport: resolve conflicts due to new xattr walk helper] In my haste to fix what I thought was a performance problem in the attr scrub code, I neglected to notice that the xfs_attr_get_ilocked also had the effect of checking that attributes can actually be looked up through the attr dabtree. Fix this. Fixes: 44af6c7e59b12 ("xfs: don't load local xattr values during scrub") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/scrub/attr.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 7cb0af5e34b1..147babe738d2 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -199,14 +199,6 @@ xchk_xattr_listent( goto fail_xref; } - /* - * Local xattr values are stored in the attr leaf block, so we don't - * need to retrieve the value from a remote block to detect corruption - * problems. - */ - if (flags & XFS_ATTR_LOCAL) - goto fail_xref; - /* * Try to allocate enough memory to extrat the attr value. If that * doesn't work, we overload the seen_enough variable to convey @@ -222,6 +214,11 @@ xchk_xattr_listent( args.value = ab->value; + /* + * Get the attr value to ensure that lookup can find this attribute + * through the dabtree indexing and that remote value retrieval also + * works correctly. + */ error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ if (error == -ENODATA) From 7c03b124353a73451a7bfbdf347db1fcb4117d6a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:16 -0700 Subject: [PATCH 034/125] xfs: use dontcache for grabbing inodes during scrub commit b27ce0da60a523fc32e3795f96b2de5490642235 upstream. [backport: resolve conflict due to missing iscan.c] Back when I wrote commit a03297a0ca9f2, I had thought that we'd be doing users a favor by only marking inodes dontcache at the end of a scrub operation, and only if there's only one reference to that inode. This was more or less true back when I_DONTCACHE was an XFS iflag and the only thing it did was change the outcome of xfs_fs_drop_inode to 1. Note: If there are dentries pointing to the inode when scrub finishes, the inode will have positive i_count and stay around in cache until dentry reclaim. But now we have d_mark_dontcache, which cause the inode *and* the dentries attached to it all to be marked I_DONTCACHE, which means that we drop the dentries ASAP, which drops the inode ASAP. This is bad if scrub found problems with the inode, because now they can be scheduled for inactivation, which can cause inodegc to trip on it and shut down the filesystem. Even if the inode isn't bad, this is still suboptimal because phases 3-7 each initiate inode scans. Dropping the inode immediately during phase 3 is silly because phase 5 will reload it and drop it immediately, etc. It's fine to mark the inodes dontcache, but if there have been accesses to the file that set up dentries, we should keep them. I validated this by setting up ftrace to capture xfs_iget_recycle* tracepoints and ran xfs/285 for 30 seconds. With current djwong-wtf I saw ~30,000 recycle events. I then dropped the d_mark_dontcache calls and set XFS_IGET_DONTCACHE, and the recycle events dropped to ~5,000 per 30 seconds. Therefore, grab the inode with XFS_IGET_DONTCACHE, which only has the effect of setting I_DONTCACHE for cache misses. Remove the d_mark_dontcache call that can happen in xchk_irele. Fixes: a03297a0ca9f2 ("xfs: manage inode DONTCACHE status at irele time") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/scrub/common.c | 12 +++--------- fs/xfs/scrub/scrub.h | 7 +++++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 08e292485268..f10cd4fb0abd 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -735,7 +735,7 @@ xchk_iget( { ASSERT(sc->tp != NULL); - return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); + return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp); } /* @@ -786,8 +786,8 @@ again: if (error) return error; - error = xfs_iget(mp, tp, inum, - XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0, + ipp); if (error == -EAGAIN) { /* * The inode may be in core but temporarily unavailable and may @@ -994,12 +994,6 @@ xchk_irele( spin_lock(&VFS_I(ip)->i_lock); VFS_I(ip)->i_state &= ~I_DONTCACHE; spin_unlock(&VFS_I(ip)->i_lock); - } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { - /* - * If this is the last reference to the inode and the caller - * permits it, set DONTCACHE to avoid thrashing. - */ - d_mark_dontcache(VFS_I(ip)); } xfs_irele(ip); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1ef9c6b4842a..869a10fe9d7d 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -17,6 +17,13 @@ struct xfs_scrub; #define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ __GFP_RETRY_MAYFAIL)) +/* + * For opening files by handle for fsck operations, we don't trust the inumber + * or the allocation state; therefore, perform an untrusted lookup. We don't + * want these inodes to pollute the cache, so mark them for immediate removal. + */ +#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE) + /* Type info and names for the scrub types. */ enum xchk_type { ST_NONE = 1, /* disabled */ From f24ba218314874000a48182c3de70d6988ece1e6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 15 Oct 2024 17:11:17 -0700 Subject: [PATCH 035/125] xfs: match lock mode in xfs_buffered_write_iomap_begin() commit bb712842a85d595525e72f0e378c143e620b3ea2 upstream. Commit 1aa91d9c9933 ("xfs: Add async buffered write support") replace xfs_ilock(XFS_ILOCK_EXCL) with xfs_ilock_for_iomap() when locking the writing inode, and a new variable lockmode is used to indicate the lock mode. Although the lockmode should always be XFS_ILOCK_EXCL, it's still better to use this variable instead of useing XFS_ILOCK_EXCL directly when unlocking the inode. Fixes: 1aa91d9c9933 ("xfs: Add async buffered write support") Signed-off-by: Zhang Yi Reviewed-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_iomap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6e5ace7c9bc9..359aa4fc09b6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1141,13 +1141,13 @@ retry: * them out if the write happens to fail. */ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: seq = xfs_iomap_inode_sequence(ip, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: @@ -1157,17 +1157,17 @@ found_cow: if (error) goto out_unlock; seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; } From 0f726c17dfd89f8e292dbd62b678971e7009ef29 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 15 Oct 2024 17:11:18 -0700 Subject: [PATCH 036/125] xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional commit fc8d0ba0ff5fe4700fa02008b7751ec6b84b7677 upstream. Allow callers to pass a NULLL seq argument if they don't care about the fork sequence number. Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 18429b7f7811..6ef2c2681248 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4595,7 +4595,8 @@ xfs_bmapi_convert_delalloc( if (!isnullstartblock(bma.got.br_startblock)) { xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4641,7 +4642,8 @@ xfs_bmapi_convert_delalloc( ASSERT(!isnullstartblock(bma.got.br_startblock)); xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, xfs_iomap_inode_sequence(ip, flags)); - *seq = READ_ONCE(ifp->if_seq); + if (seq) + *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); From 36081fd0ee37bd066603e85245bd9b5ef2e90a79 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 15 Oct 2024 17:11:19 -0700 Subject: [PATCH 037/125] xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset commit 2e08371a83f1c06fd85eea8cd37c87a224cc4cc4 upstream. Since xfs_bmapi_convert_delalloc() only attempts to allocate the entire delalloc extent and require multiple invocations to allocate the target offset. So xfs_convert_blocks() add a loop to do this job and we call it in the write back path, but xfs_convert_blocks() isn't a common helper. Let's do it in xfs_bmapi_convert_delalloc() and drop xfs_convert_blocks(), preparing for the post EOF delalloc blocks converting in the buffered write begin path. Signed-off-by: Zhang Yi Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 34 +++++++++++++++++++++++-- fs/xfs/xfs_aops.c | 54 +++++++++++----------------------------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6ef2c2681248..05e36a745920 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4537,8 +4537,8 @@ error0: * invocations to allocate the target offset if a large enough physical extent * is not available. */ -int -xfs_bmapi_convert_delalloc( +static int +xfs_bmapi_convert_one_delalloc( struct xfs_inode *ip, int whichfork, xfs_off_t offset, @@ -4666,6 +4666,36 @@ out_trans_cancel: return error; } +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in iomap. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + loff_t offset, + struct iomap *iomap, + unsigned int *seq) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into iomap. Allocate in a loop because it may + * take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset, + iomap, seq); + if (error) + return error; + } while (iomap->offset + iomap->length <= offset); + + return 0; +} + int xfs_bmapi_remap( struct xfs_trans *tp, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e74097e58097..688ac031d3a1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -233,45 +233,6 @@ xfs_imap_valid( return true; } -/* - * Pass in a dellalloc extent and convert it to real extents, return the real - * extent that maps offset_fsb in wpc->iomap. - * - * The current page is held locked so nothing could have removed the block - * backing offset_fsb, although it could have moved from the COW to the data - * fork by another thread. - */ -static int -xfs_convert_blocks( - struct iomap_writepage_ctx *wpc, - struct xfs_inode *ip, - int whichfork, - loff_t offset) -{ - int error; - unsigned *seq; - - if (whichfork == XFS_COW_FORK) - seq = &XFS_WPC(wpc)->cow_seq; - else - seq = &XFS_WPC(wpc)->data_seq; - - /* - * Attempt to allocate whatever delalloc extent currently backs offset - * and put the result into wpc->iomap. Allocate in a loop because it - * may take several attempts to allocate real blocks for a contiguous - * delalloc extent if free space is sufficiently fragmented. - */ - do { - error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, - &wpc->iomap, seq); - if (error) - return error; - } while (wpc->iomap.offset + wpc->iomap.length <= offset); - - return 0; -} - static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, @@ -289,6 +250,7 @@ xfs_map_blocks( struct xfs_iext_cursor icur; int retries = 0; int error = 0; + unsigned int *seq; if (xfs_is_shutdown(mp)) return -EIO; @@ -386,7 +348,19 @@ retry: trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, whichfork, offset); + /* + * Convert a dellalloc extent to a real one. The current page is held + * locked so nothing could have removed the block backing offset_fsb, + * although it could have moved from the COW to the data fork by another + * thread. + */ + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; + + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); if (error) { /* * If we failed to find the extent in the COW fork we might have From 4c99f3026cf2065c224207a5c9a878959cfd5365 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 15 Oct 2024 17:11:20 -0700 Subject: [PATCH 038/125] xfs: convert delayed extents to unwritten when zeroing post eof blocks commit 5ce5674187c345dc31534d2024c09ad8ef29b7ba upstream. Current clone operation could be non-atomic if the destination of a file is beyond EOF, user could get a file with corrupted (zeroed) data on crash. The problem is about preallocations. If you write some data into a file: [A...B) and XFS decides to preallocate some post-eof blocks, then it can create a delayed allocation reservation: [A.........D) The writeback path tries to convert delayed extents to real ones by allocating blocks. If there aren't enough contiguous free space, we can end up with two extents, the first real and the second still delalloc: [A....C)[C.D) After that, both the in-memory and the on-disk file sizes are still B. If we clone into the range [E...F) from another file: [A....C)[C.D) [E...F) then xfs_reflink_zero_posteof() calls iomap_zero_range() to zero out the range [B, E) beyond EOF and flush it. Since [C, D) is still a delalloc extent, its pagecache will be zeroed and both the in-memory and on-disk size will be updated to D after flushing but before cloning. This is wrong, because the user can see the size change and read the zeroes while the clone operation is ongoing. We need to keep the in-memory and on-disk size before the clone operation starts, so instead of writing zeroes through the page cache for delayed ranges beyond EOF, we convert these ranges to unwritten and invalidate any cached data over that range beyond EOF. Suggested-by: Dave Chinner Signed-off-by: Zhang Yi Reviewed-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_iomap.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 359aa4fc09b6..1a150ecbd2b7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1005,6 +1005,24 @@ xfs_buffered_write_iomap_begin( goto out_unlock; } + /* + * For zeroing, trim a delalloc extent that extends beyond the EOF + * block. If it starts beyond the EOF block, convert it to an + * unwritten extent. + */ + if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && + isnullstartblock(imap.br_startblock)) { + xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + + if (offset_fsb >= eof_fsb) + goto convert_delay; + if (end_fsb > eof_fsb) { + end_fsb = eof_fsb; + xfs_trim_extent(&imap, offset_fsb, + end_fsb - offset_fsb); + } + } + /* * Search the COW fork extent list even if we did not find a data fork * extent. This serves two purposes: first this implements the @@ -1150,6 +1168,17 @@ found_imap: xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); +convert_delay: + xfs_iunlock(ip, lockmode); + truncate_pagecache(inode, offset); + error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset, + iomap, NULL); + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap); + return 0; + found_cow: seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { From 0aca73915dc1369e96b0261847f5d9516011f16e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:21 -0700 Subject: [PATCH 039/125] xfs: allow symlinks with short remote targets commit 38de567906d95c397d87f292b892686b7ec6fbc3 upstream. An internal user complained about log recovery failing on a symlink ("Bad dinode after recovery") with the following (excerpted) format: core.magic = 0x494e core.mode = 0120777 core.version = 3 core.format = 2 (extents) core.nlinkv2 = 1 core.nextents = 1 core.size = 297 core.nblocks = 1 core.naextents = 0 core.forkoff = 0 core.aformat = 2 (extents) u3.bmx[0] = [startoff,startblock,blockcount,extentflag] 0:[0,12,1,0] This is a symbolic link with a 297-byte target stored in a disk block, which is to say this is a symlink with a remote target. The forkoff is 0, which is to say that there's 512 - 176 == 336 bytes in the inode core to store the data fork. Eventually, testing of generic/388 failed with the same inode corruption message during inode recovery. In writing a debugging patch to call xfs_dinode_verify on dirty inode log items when we're committing transactions, I observed that xfs/298 can reproduce the problem quite quickly. xfs/298 creates a symbolic link, adds some extended attributes, then deletes them all. The test failure occurs when the final removexattr also deletes the attr fork because that does not convert the remote symlink back into a shortform symlink. That is how we trip this test. The only reason why xfs/298 only triggers with the debug patch added is that it deletes the symlink, so the final iflush shows the inode as free. I wrote a quick fstest to emulate the behavior of xfs/298, except that it leaves the symlinks on the filesystem after inducing the "corrupt" state. Kernels going back at least as far as 4.18 have written out symlink inodes in this manner and prior to 1eb70f54c445f they did not object to reading them back in. Because we've been writing out inodes this way for quite some time, the only way to fix this is to relax the check for symbolic links. Directories don't have this problem because di_size is bumped to blocksize during the sf->data conversion. Fixes: 1eb70f54c445f ("xfs: validate inode fork size against fork format") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_inode_buf.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 0f970a0b3382..51fdd29c4ddc 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -366,17 +366,37 @@ xfs_dinode_verify_fork( /* * For fork types that can contain local data, check that the fork * format matches the size of local data contained within the fork. - * - * For all types, check that when the size says the should be in extent - * or btree format, the inode isn't claiming it is in local format. */ if (whichfork == XFS_DATA_FORK) { - if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * A directory small enough to fit in the inode must be stored + * in local format. The directory sf <-> extents conversion + * code updates the directory size accordingly. + */ + if (S_ISDIR(mode)) { if (be64_to_cpu(dip->di_size) <= fork_size && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } + /* + * A symlink with a target small enough to fit in the inode can + * be stored in extents format if xattrs were added (thus + * converting the data fork from shortform to remote format) + * and then removed. + */ + if (S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_EXTENTS && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + /* + * For all types, check that when the size says the fork should + * be in extent or btree format, the inode isn't claiming to be + * in local format. + */ if (be64_to_cpu(dip->di_size) > fork_size && fork_format == XFS_DINODE_FMT_LOCAL) return __this_address; From 0e52b98bf041d5359b12c4fc8248e627556b659c Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 15 Oct 2024 17:11:22 -0700 Subject: [PATCH 040/125] xfs: make sure sb_fdblocks is non-negative commit 58f880711f2ba53fd5e959875aff5b3bf6d5c32e upstream. A user with a completely full filesystem experienced an unexpected shutdown when the filesystem tried to write the superblock during runtime. kernel shows the following dmesg: [ 8.176281] XFS (dm-4): Metadata corruption detected at xfs_sb_write_verify+0x60/0x120 [xfs], xfs_sb block 0x0 [ 8.177417] XFS (dm-4): Unmount and run xfs_repair [ 8.178016] XFS (dm-4): First 128 bytes of corrupted metadata buffer: [ 8.178703] 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 01 90 00 00 XFSB............ [ 8.179487] 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [ 8.180312] 00000020: cf 12 dc 89 ca 26 45 29 92 e6 e3 8d 3b b8 a2 c3 .....&E)....;... [ 8.181150] 00000030: 00 00 00 00 01 00 00 06 00 00 00 00 00 00 00 80 ................ [ 8.182003] 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ [ 8.182004] 00000050: 00 00 00 01 00 64 00 00 00 00 00 04 00 00 00 00 .....d.......... [ 8.182004] 00000060: 00 00 64 00 b4 a5 02 00 02 00 00 08 00 00 00 00 ..d............. [ 8.182005] 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 17 00 00 19 ................ [ 8.182008] XFS (dm-4): Corruption of in-memory data detected. Shutting down filesystem [ 8.182010] XFS (dm-4): Please unmount the filesystem and rectify the problem(s) When xfs_log_sb writes super block to disk, b_fdblocks is fetched from m_fdblocks without any lock. As m_fdblocks can experience a positive -> negative -> positive changing when the FS reaches fullness (see xfs_mod_fdblocks). So there is a chance that sb_fdblocks is negative, and because sb_fdblocks is type of unsigned long long, it reads super big. And sb_fdblocks being bigger than sb_dblocks is a problem during log recovery, xfs_validate_sb_write() complains. Fix: As sb_fdblocks will be re-calculated during mount when lazysbcount is enabled, We just need to make xfs_validate_sb_write() happy -- make sure sb_fdblocks is not nenative. This patch also takes care of other percpu counters in xfs_log_sb. Signed-off-by: Wengang Wang Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_sb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 59c4804e4d79..424acdd4b0fc 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1031,11 +1031,12 @@ xfs_log_sb( * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, - percpu_counter_sum(&mp->m_ifree), + percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = + percpu_counter_sum_positive(&mp->m_fdblocks); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); From 740a427e8f45720bc43e2c380a19a5259d7abc06 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 15 Oct 2024 17:11:23 -0700 Subject: [PATCH 041/125] xfs: fix unlink vs cluster buffer instantiation race commit 348a1983cf4cf5099fc398438a968443af4c9f65 upstream. Luis has been reporting an assert failure when freeing an inode cluster during inode inactivation for a while. The assert looks like: XFS: Assertion failed: bp->b_flags & XBF_DONE, file: fs/xfs/xfs_trans_buf.c, line: 241 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:102! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 4 PID: 73 Comm: kworker/4:1 Not tainted 6.10.0-rc1 #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Workqueue: xfs-inodegc/loop5 xfs_inodegc_worker [xfs] RIP: 0010:assfail (fs/xfs/xfs_message.c:102) xfs RSP: 0018:ffff88810188f7f0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88816e748250 RCX: 1ffffffff844b0e7 RDX: 0000000000000004 RSI: ffff88810188f558 RDI: ffffffffc2431fa0 RBP: 1ffff11020311f01 R08: 0000000042431f9f R09: ffffed1020311e9b R10: ffff88810188f4df R11: ffffffffac725d70 R12: ffff88817a3f4000 R13: ffff88812182f000 R14: ffff88810188f998 R15: ffffffffc2423f80 FS: 0000000000000000(0000) GS:ffff8881c8400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fe9d0f109c CR3: 000000014426c002 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:241 (discriminator 1)) xfs xfs_imap_to_bp (fs/xfs/xfs_trans.h:210 fs/xfs/libxfs/xfs_inode_buf.c:138) xfs xfs_inode_item_precommit (fs/xfs/xfs_inode_item.c:145) xfs xfs_trans_run_precommits (fs/xfs/xfs_trans.c:931) xfs __xfs_trans_commit (fs/xfs/xfs_trans.c:966) xfs xfs_inactive_ifree (fs/xfs/xfs_inode.c:1811) xfs xfs_inactive (fs/xfs/xfs_inode.c:2013) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1841 fs/xfs/xfs_icache.c:1886) xfs process_one_work (kernel/workqueue.c:3231) worker_thread (kernel/workqueue.c:3306 (discriminator 2) kernel/workqueue.c:3393 (discriminator 2)) kthread (kernel/kthread.c:389) ret_from_fork (arch/x86/kernel/process.c:147) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) And occurs when the the inode precommit handlers is attempt to look up the inode cluster buffer to attach the inode for writeback. The trail of logic that I can reconstruct is as follows. 1. the inode is clean when inodegc runs, so it is not attached to a cluster buffer when precommit runs. 2. #1 implies the inode cluster buffer may be clean and not pinned by dirty inodes when inodegc runs. 3. #2 implies that the inode cluster buffer can be reclaimed by memory pressure at any time. 4. The assert failure implies that the cluster buffer was attached to the transaction, but not marked done. It had been accessed earlier in the transaction, but not marked done. 5. #4 implies the cluster buffer has been invalidated (i.e. marked stale). 6. #5 implies that the inode cluster buffer was instantiated uninitialised in the transaction in xfs_ifree_cluster(), which only instantiates the buffers to invalidate them and never marks them as done. Given factors 1-3, this issue is highly dependent on timing and environmental factors. Hence the issue can be very difficult to reproduce in some situations, but highly reliable in others. Luis has an environment where it can be reproduced easily by g/531 but, OTOH, I've reproduced it only once in ~2000 cycles of g/531. I think the fix is to have xfs_ifree_cluster() set the XBF_DONE flag on the cluster buffers, even though they may not be initialised. The reasons why I think this is safe are: 1. A buffer cache lookup hit on a XBF_STALE buffer will clear the XBF_DONE flag. Hence all future users of the buffer know they have to re-initialise the contents before use and mark it done themselves. 2. xfs_trans_binval() sets the XFS_BLI_STALE flag, which means the buffer remains locked until the journal commit completes and the buffer is unpinned. Hence once marked XBF_STALE/XFS_BLI_STALE by xfs_ifree_cluster(), the only context that can access the freed buffer is the currently running transaction. 3. #2 implies that future buffer lookups in the currently running transaction will hit the transaction match code and not the buffer cache. Hence XBF_STALE and XFS_BLI_STALE will not be cleared unless the transaction initialises and logs the buffer with valid contents again. At which point, the buffer will be marked marked XBF_DONE again, so having XBF_DONE already set on the stale buffer is a moot point. 4. #2 also implies that any concurrent access to that cluster buffer will block waiting on the buffer lock until the inode cluster has been fully freed and is no longer an active inode cluster buffer. 5. #4 + #1 means that any future user of the disk range of that buffer will always see the range of disk blocks covered by the cluster buffer as not done, and hence must initialise the contents themselves. 6. Setting XBF_DONE in xfs_ifree_cluster() then means the unlinked inode precommit code will see a XBF_DONE buffer from the transaction match as it expects. It can then attach the stale but newly dirtied inode to the stale but newly dirtied cluster buffer without unexpected failures. The stale buffer will then sail through the journal and do the right thing with the attached stale inode during unpin. Hence the fix is just one line of extra code. The explanation of why we have to set XBF_DONE in xfs_ifree_cluster, OTOH, is long and complex.... Fixes: 82842fee6e59 ("xfs: fix AGF vs inode cluster buffer deadlock") Signed-off-by: Dave Chinner Tested-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_inode.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index efb6b8f35617..8bfde8fce6e2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2329,11 +2329,26 @@ xfs_ifree_cluster( * This buffer may not have been correctly initialised as we * didn't read it from disk. That's not important because we are * only using to mark the buffer as stale in the log, and to - * attach stale cached inodes on it. That means it will never be - * dispatched for IO. If it is, we want to know about it, and we - * want it to fail. We can acheive this by adding a write - * verifier to the buffer. + * attach stale cached inodes on it. + * + * For the inode that triggered the cluster freeing, this + * attachment may occur in xfs_inode_item_precommit() after we + * have marked this buffer stale. If this buffer was not in + * memory before xfs_ifree_cluster() started, it will not be + * marked XBF_DONE and this will cause problems later in + * xfs_inode_item_precommit() when we trip over a (stale, !done) + * buffer to attached to the transaction. + * + * Hence we have to mark the buffer as XFS_DONE here. This is + * safe because we are also marking the buffer as XBF_STALE and + * XFS_BLI_STALE. That means it will never be dispatched for + * IO and it won't be unlocked until the cluster freeing has + * been committed to the journal and the buffer unpinned. If it + * is written, we want to know about it, and we want it to + * fail. We can acheive this by adding a write verifier to the + * buffer. */ + bp->b_flags |= XBF_DONE; bp->b_ops = &xfs_inode_buf_ops; /* From 2bc2d49c36c25d7fcf202f4cb465c69f8e694bf7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Oct 2024 17:11:24 -0700 Subject: [PATCH 042/125] xfs: fix freeing speculative preallocations for preallocated files commit 610b29161b0aa9feb59b78dc867553274f17fb01 upstream. xfs_can_free_eofblocks returns false for files that have persistent preallocations unless the force flag is passed and there are delayed blocks. This means it won't free delalloc reservations for files with persistent preallocations unless the force flag is set, and it will also free the persistent preallocations if the force flag is set and the file happens to have delayed allocations. Both of these are bad, so do away with the force flag and always free only post-EOF delayed allocations for files with the XFS_DIFLAG_PREALLOC or APPEND flags set. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_bmap_util.c | 30 ++++++++++++++++++++++-------- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 14 ++++---------- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4a7d1a1b67a3..f9d72d8e3c35 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -636,13 +636,11 @@ out_unlock: /* * Test whether it is appropriate to check an inode for and free post EOF - * blocks. The 'force' parameter determines whether we should also consider - * regular files that are marked preallocated or append-only. + * blocks. */ bool xfs_can_free_eofblocks( - struct xfs_inode *ip, - bool force) + struct xfs_inode *ip) { struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; @@ -676,11 +674,11 @@ xfs_can_free_eofblocks( return false; /* - * Do not free real preallocated or append-only files unless the file - * has delalloc blocks and we are forced to remove them. + * Only free real extents for inodes with persistent preallocations or + * the append-only flag. */ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) - if (!force || ip->i_delayed_blks == 0) + if (ip->i_delayed_blks == 0) return false; /* @@ -734,6 +732,22 @@ xfs_free_eofblocks( /* Wait on dio to ensure i_size has settled. */ inode_dio_wait(VFS_I(ip)); + /* + * For preallocated files only free delayed allocations. + * + * Note that this means we also leave speculative preallocations in + * place for preallocated files. + */ + if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { + if (ip->i_delayed_blks) { + xfs_bmap_punch_delalloc_range(ip, + round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), + LLONG_MAX); + } + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(xfs_is_shutdown(mp)); @@ -1048,7 +1062,7 @@ xfs_prepare_shift( * Trim eofblocks to avoid shifting uninitialized post-eof preallocation * into the accessible region of the file. */ - if (xfs_can_free_eofblocks(ip, true)) { + if (xfs_can_free_eofblocks(ip)) { error = xfs_free_eofblocks(ip); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 6888078f5c31..1383019ccdb7 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); /* EOF block manipulation functions */ -bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +bool xfs_can_free_eofblocks(struct xfs_inode *ip); int xfs_free_eofblocks(struct xfs_inode *ip); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index db88f41c94c6..57a9f2317525 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1149,7 +1149,7 @@ xfs_inode_free_eofblocks( } *lockflags |= XFS_IOLOCK_EXCL; - if (xfs_can_free_eofblocks(ip, false)) + if (xfs_can_free_eofblocks(ip)) return xfs_free_eofblocks(ip); /* inode could be preallocated or append-only */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8bfde8fce6e2..7aa73855fab6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1469,7 +1469,7 @@ xfs_release( if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) return 0; - if (xfs_can_free_eofblocks(ip, false)) { + if (xfs_can_free_eofblocks(ip)) { /* * Check if the inode is being opened, written and closed * frequently and we have delayed allocation blocks outstanding @@ -1685,15 +1685,13 @@ xfs_inode_needs_inactive( /* * This file isn't being freed, so check if there are post-eof blocks - * to free. @force is true because we are evicting an inode from the - * cache. Post-eof blocks must be freed, lest we end up with broken - * free space accounting. + * to free. * * Note: don't bother with iolock here since lockdep complains about * acquiring it in reclaim context. We have the only reference to the * inode at this point anyways. */ - return xfs_can_free_eofblocks(ip, true); + return xfs_can_free_eofblocks(ip); } /* @@ -1741,15 +1739,11 @@ xfs_inactive( if (VFS_I(ip)->i_nlink != 0) { /* - * force is true because we are evicting an inode from the - * cache. Post-eof blocks must be freed, lest we end up with - * broken free space accounting. - * * Note: don't bother with iolock here since lockdep complains * about acquiring it in reclaim context. We have the only * reference to the inode at this point anyways. */ - if (xfs_can_free_eofblocks(ip, true)) + if (xfs_can_free_eofblocks(ip)) error = xfs_free_eofblocks(ip); goto out; From 3eeac331168300f4f1c4af6b9ca79168c2870ac2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:26 -0700 Subject: [PATCH 043/125] xfs: allow unlinked symlinks and dirs with zero size commit 1ec9307fc066dd8a140d5430f8a7576aa9d78cd3 upstream. For a very very long time, inode inactivation has set the inode size to zero before unmapping the extents associated with the data fork. Unfortunately, commit 3c6f46eacd876 changed the inode verifier to prohibit zero-length symlinks and directories. If an inode happens to get logged in this state and the system crashes before freeing the inode, log recovery will also fail on the broken inode. Therefore, allow zero-size symlinks and directories as long as the link count is zero; nobody will be able to open these files by handle so there isn't any risk of data exposure. Fixes: 3c6f46eacd876 ("xfs: sanity check directory inode di_size") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_inode_buf.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 51fdd29c4ddc..423d39b6b917 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -371,10 +371,13 @@ xfs_dinode_verify_fork( /* * A directory small enough to fit in the inode must be stored * in local format. The directory sf <-> extents conversion - * code updates the directory size accordingly. + * code updates the directory size accordingly. Directories + * being truncated have zero size and are not subject to this + * check. */ if (S_ISDIR(mode)) { - if (be64_to_cpu(dip->di_size) <= fork_size && + if (dip->di_size && + be64_to_cpu(dip->di_size) <= fork_size && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } @@ -512,9 +515,19 @@ xfs_dinode_verify( if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) return __this_address; - /* No zero-length symlinks/dirs. */ - if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) - return __this_address; + /* + * No zero-length symlinks/dirs unless they're unlinked and hence being + * inactivated. + */ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { + if (dip->di_version > 1) { + if (dip->di_nlink) + return __this_address; + } else { + if (dip->di_onlink) + return __this_address; + } + } fa = xfs_dinode_verify_nrext64(mp, dip); if (fa) From 9a0ab4fc28ed95d417c3b1bc287b3df8eca60332 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 15 Oct 2024 17:11:25 -0700 Subject: [PATCH 044/125] xfs: restrict when we try to align cow fork delalloc to cowextsz hints commit 288e1f693f04e66be99f27e7cbe4a45936a66745 upstream. xfs/205 produces the following failure when always_cow is enabled: # --- a/tests/xfs/205.out 2024-02-28 16:20:24.437887970 -0800 # +++ b/tests/xfs/205.out.bad 2024-06-03 21:13:40.584000000 -0700 # @@ -1,4 +1,5 @@ # QA output created by 205 # *** one file # + !!! disk full (expected) # *** one file, a few bytes at a time # *** done This is the result of overly aggressive attempts to align cow fork delalloc reservations to the CoW extent size hint. Looking at the trace data, we're trying to append a single fsblock to the "fred" file. Trying to create a speculative post-eof reservation fails because there's not enough space. We then set @prealloc_blocks to zero and try again, but the cowextsz alignment code triggers, which expands our request for a 1-fsblock reservation into a 39-block reservation. There's not enough space for that, so the whole write fails with ENOSPC even though there's sufficient space in the filesystem to allocate the single block that we need to land the write. There are two things wrong here -- first, we shouldn't be attempting speculative preallocations beyond what was requested when we're low on space. Second, if we've already computed a posteof preallocation, we shouldn't bother trying to align that to the cowextsize hint. Fix both of these problems by adding a flag that only enables the expansion of the delalloc reservation to the cowextsize if we're doing a non-extending write, and only if we're not doing an ENOSPC retry. This requires us to move the ENOSPC retry logic to xfs_bmapi_reserve_delalloc. I probably should have caught this six years ago when 6ca30729c206d was being reviewed, but oh well. Update the comments to reflect what the code does now. Fixes: 6ca30729c206d ("xfs: bmap code cleanup") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Catherine Hoang Acked-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 31 +++++++++++++++++++++++++++---- fs/xfs/xfs_iomap.c | 34 ++++++++++++---------------------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 05e36a745920..e6ea35098e07 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3974,20 +3974,32 @@ xfs_bmapi_reserve_delalloc( xfs_extlen_t alen; xfs_extlen_t indlen; int error; - xfs_fileoff_t aoff = off; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; +retry: /* * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ + aoff = off; alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) prealloc = alen - len; - /* Figure out the extent size, adjust alen */ - if (whichfork == XFS_COW_FORK) { + /* + * If we're targetting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { struct xfs_bmbt_irec prev; xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); @@ -4006,7 +4018,7 @@ xfs_bmapi_reserve_delalloc( */ error = xfs_quota_reserve_blkres(ip, alen); if (error) - return error; + goto out; /* * Split changing sb for alen and indlen since they could be coming @@ -4051,6 +4063,17 @@ out_unreserve_blocks: out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1a150ecbd2b7..9ce2f48b4ebc 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1127,33 +1127,23 @@ xfs_buffered_write_iomap_begin( } } -retry: - error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, - allocfork == XFS_DATA_FORK ? &imap : &cmap, - allocfork == XFS_DATA_FORK ? &icur : &ccur, - allocfork == XFS_DATA_FORK ? eof : cow_eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ - trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc_blocks) { - prealloc_blocks = 0; - goto retry; - } - fallthrough; - default: - goto out_unlock; - } - if (allocfork == XFS_COW_FORK) { + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, &cmap, + &ccur, cow_eof); + if (error) + goto out_unlock; + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); goto found_cow; } + error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, &imap, &icur, + eof); + if (error) + goto out_unlock; + /* * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. From 677f1df179cb68c12ddf7707ec325eb50e99c7d9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 7 Oct 2024 16:28:32 +0100 Subject: [PATCH 045/125] maple_tree: correct tree corruption on spanning store commit bea07fd63192b61209d48cbb81ef474cc3ee4c62 upstream. Patch series "maple_tree: correct tree corruption on spanning store", v3. There has been a nasty yet subtle maple tree corruption bug that appears to have been in existence since the inception of the algorithm. This bug seems far more likely to happen since commit f8d112a4e657 ("mm/mmap: avoid zeroing vma tree in mmap_region()"), which is the point at which reports started to be submitted concerning this bug. We were made definitely aware of the bug thanks to the kind efforts of Bert Karwatzki who helped enormously in my being able to track this down and identify the cause of it. The bug arises when an attempt is made to perform a spanning store across two leaf nodes, where the right leaf node is the rightmost child of the shared parent, AND the store completely consumes the right-mode node. This results in mas_wr_spanning_store() mitakenly duplicating the new and existing entries at the maximum pivot within the range, and thus maple tree corruption. The fix patch corrects this by detecting this scenario and disallowing the mistaken duplicate copy. The fix patch commit message goes into great detail as to how this occurs. This series also includes a test which reliably reproduces the issue, and asserts that the fix works correctly. Bert has kindly tested the fix and confirmed it resolved his issues. Also Mikhail Gavrilov kindly reported what appears to be precisely the same bug, which this fix should also resolve. This patch (of 2): There has been a subtle bug present in the maple tree implementation from its inception. This arises from how stores are performed - when a store occurs, it will overwrite overlapping ranges and adjust the tree as necessary to accommodate this. A range may always ultimately span two leaf nodes. In this instance we walk the two leaf nodes, determine which elements are not overwritten to the left and to the right of the start and end of the ranges respectively and then rebalance the tree to contain these entries and the newly inserted one. This kind of store is dubbed a 'spanning store' and is implemented by mas_wr_spanning_store(). In order to reach this stage, mas_store_gfp() invokes mas_wr_preallocate(), mas_wr_store_type() and mas_wr_walk() in turn to walk the tree and update the object (mas) to traverse to the location where the write should be performed, determining its store type. When a spanning store is required, this function returns false stopping at the parent node which contains the target range, and mas_wr_store_type() marks the mas->store_type as wr_spanning_store to denote this fact. When we go to perform the store in mas_wr_spanning_store(), we first determine the elements AFTER the END of the range we wish to store (that is, to the right of the entry to be inserted) - we do this by walking to the NEXT pivot in the tree (i.e. r_mas.last + 1), starting at the node we have just determined contains the range over which we intend to write. We then turn our attention to the entries to the left of the entry we are inserting, whose state is represented by l_mas, and copy these into a 'big node', which is a special node which contains enough slots to contain two leaf node's worth of data. We then copy the entry we wish to store immediately after this - the copy and the insertion of the new entry is performed by mas_store_b_node(). After this we copy the elements to the right of the end of the range which we are inserting, if we have not exceeded the length of the node (i.e. r_mas.offset <= r_mas.end). Herein lies the bug - under very specific circumstances, this logic can break and corrupt the maple tree. Consider the following tree: Height 0 Root Node / \ pivot = 0xffff / \ pivot = ULONG_MAX / \ 1 A [-----] ... / \ pivot = 0x4fff / \ pivot = 0xffff / \ 2 (LEAVES) B [-----] [-----] C ^--- Last pivot 0xffff. Now imagine we wish to store an entry in the range [0x4000, 0xffff] (note that all ranges expressed in maple tree code are inclusive): 1. mas_store_gfp() descends the tree, finds node A at <=0xffff, then determines that this is a spanning store across nodes B and C. The mas state is set such that the current node from which we traverse further is node A. 2. In mas_wr_spanning_store() we try to find elements to the right of pivot 0xffff by searching for an index of 0x10000: - mas_wr_walk_index() invokes mas_wr_walk_descend() and mas_wr_node_walk() in turn. - mas_wr_node_walk() loops over entries in node A until EITHER it finds an entry whose pivot equals or exceeds 0x10000 OR it reaches the final entry. - Since no entry has a pivot equal to or exceeding 0x10000, pivot 0xffff is selected, leading to node C. - mas_wr_walk_traverse() resets the mas state to traverse node C. We loop around and invoke mas_wr_walk_descend() and mas_wr_node_walk() in turn once again. - Again, we reach the last entry in node C, which has a pivot of 0xffff. 3. We then copy the elements to the left of 0x4000 in node B to the big node via mas_store_b_node(), and insert the new [0x4000, 0xffff] entry too. 4. We determine whether we have any entries to copy from the right of the end of the range via - and with r_mas set up at the entry at pivot 0xffff, r_mas.offset <= r_mas.end, and then we DUPLICATE the entry at pivot 0xffff. 5. BUG! The maple tree is corrupted with a duplicate entry. This requires a very specific set of circumstances - we must be spanning the last element in a leaf node, which is the last element in the parent node. spanning store across two leaf nodes with a range that ends at that shared pivot. A potential solution to this problem would simply be to reset the walk each time we traverse r_mas, however given the rarity of this situation it seems that would be rather inefficient. Instead, this patch detects if the right hand node is populated, i.e. has anything we need to copy. We do so by only copying elements from the right of the entry being inserted when the maximum value present exceeds the last, rather than basing this on offset position. The patch also updates some comments and eliminates the unused bool return value in mas_wr_walk_index(). The work performed in commit f8d112a4e657 ("mm/mmap: avoid zeroing vma tree in mmap_region()") seems to have made the probability of this event much more likely, which is the point at which reports started to be submitted concerning this bug. The motivation for this change arose from Bert Karwatzki's report of encountering mm instability after the release of kernel v6.12-rc1 which, after the use of CONFIG_DEBUG_VM_MAPLE_TREE and similar configuration options, was identified as maple tree corruption. After Bert very generously provided his time and ability to reproduce this event consistently, I was able to finally identify that the issue discussed in this commit message was occurring for him. Link: https://lkml.kernel.org/r/cover.1728314402.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/48b349a2a0f7c76e18772712d0997a5e12ab0a3b.1728314403.git.lorenzo.stoakes@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Lorenzo Stoakes Reported-by: Bert Karwatzki Closes: https://lore.kernel.org/all/20241001023402.3374-1-spasswolf@web.de/ Tested-by: Bert Karwatzki Reported-by: Mikhail Gavrilov Closes: https://lore.kernel.org/all/CABXGCsOPwuoNOqSMmAvWO2Fz4TEmPnjFj-b7iF+XFRu1h7-+Dg@mail.gmail.com/ Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Mikhail Gavrilov Reviewed-by: Wei Yang Cc: Matthew Wilcox Cc: Sidhartha Kumar Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- lib/maple_tree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 41ef91590761..4e05511c8d1e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2228,6 +2228,8 @@ static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. + * If @mas->index cannot be found within the containing + * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. @@ -3643,7 +3645,7 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas) return true; } -static bool mas_wr_walk_index(struct ma_wr_state *wr_mas) +static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; @@ -3652,11 +3654,9 @@ static bool mas_wr_walk_index(struct ma_wr_state *wr_mas) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) - return true; + return; mas_wr_walk_traverse(wr_mas); - } - return true; } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. @@ -3892,8 +3892,8 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_wr_mas.node_end); - /* Copy r_mas into b_node. */ - if (r_mas.offset <= r_wr_mas.node_end) + /* Copy r_mas into b_node if there is anything to copy. */ + if (r_mas.max > r_mas.last) mas_mab_cp(&r_mas, r_mas.offset, r_wr_mas.node_end, &b_node, b_node.b_end + 1); else From 1e1eb62c40e1a89df73ebb5b472b7726392fbca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cs=C3=B3k=C3=A1s=2C=20Bence?= Date: Mon, 12 Aug 2024 11:47:13 +0200 Subject: [PATCH 046/125] net: fec: Move `fec_ptp_read()` to the top of the file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4374a1fe580a14f6152752390c678d90311df247 upstream. This function is used in `fec_ptp_enable_pps()` through struct cyclecounter read(). Moving the declaration makes it clearer, what's happening. Suggested-by: Frank Li Link: https://lore.kernel.org/netdev/20240805144754.2384663-1-csokas.bence@prolan.hu/T/#ma6c21ad264016c24612048b1483769eaff8cdf20 Signed-off-by: Csókás, Bence Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240812094713.2883476-1-csokas.bence@prolan.hu Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/fec_ptp.c | 50 ++++++++++++------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index 5e8fac50f945..d56d37767506 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -90,6 +90,30 @@ #define FEC_PTP_MAX_NSEC_PERIOD 4000000000ULL #define FEC_PTP_MAX_NSEC_COUNTER 0x80000000ULL +/** + * fec_ptp_read - read raw cycle counter (to be used by time counter) + * @cc: the cyclecounter structure + * + * this function reads the cyclecounter registers and is called by the + * cyclecounter structure used to construct a ns counter from the + * arbitrary fixed point registers + */ +static u64 fec_ptp_read(const struct cyclecounter *cc) +{ + struct fec_enet_private *fep = + container_of(cc, struct fec_enet_private, cc); + u32 tempval; + + tempval = readl(fep->hwp + FEC_ATIME_CTRL); + tempval |= FEC_T_CTRL_CAPTURE; + writel(tempval, fep->hwp + FEC_ATIME_CTRL); + + if (fep->quirks & FEC_QUIRK_BUG_CAPTURE) + udelay(1); + + return readl(fep->hwp + FEC_ATIME); +} + /** * fec_ptp_enable_pps * @fep: the fec_enet_private structure handle @@ -136,7 +160,7 @@ static int fec_ptp_enable_pps(struct fec_enet_private *fep, uint enable) * NSEC_PER_SEC - ts.tv_nsec. Add the remaining nanoseconds * to current timer would be next second. */ - tempval = fep->cc.read(&fep->cc); + tempval = fec_ptp_read(&fep->cc); /* Convert the ptp local counter to 1588 timestamp */ ns = timecounter_cyc2time(&fep->tc, tempval); ts = ns_to_timespec64(ns); @@ -271,30 +295,6 @@ static enum hrtimer_restart fec_ptp_pps_perout_handler(struct hrtimer *timer) return HRTIMER_NORESTART; } -/** - * fec_ptp_read - read raw cycle counter (to be used by time counter) - * @cc: the cyclecounter structure - * - * this function reads the cyclecounter registers and is called by the - * cyclecounter structure used to construct a ns counter from the - * arbitrary fixed point registers - */ -static u64 fec_ptp_read(const struct cyclecounter *cc) -{ - struct fec_enet_private *fep = - container_of(cc, struct fec_enet_private, cc); - u32 tempval; - - tempval = readl(fep->hwp + FEC_ATIME_CTRL); - tempval |= FEC_T_CTRL_CAPTURE; - writel(tempval, fep->hwp + FEC_ATIME_CTRL); - - if (fep->quirks & FEC_QUIRK_BUG_CAPTURE) - udelay(1); - - return readl(fep->hwp + FEC_ATIME); -} - /** * fec_ptp_start_cyclecounter - create the cycle counter from hw * @ndev: network device From 6365900cb10e80e330ab5bd1084c9bc7a347979f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cs=C3=B3k=C3=A1s=2C=20Bence?= Date: Mon, 12 Aug 2024 11:47:15 +0200 Subject: [PATCH 047/125] net: fec: Remove duplicated code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 713ebaed68d88121cbaf5e74104e2290a9ea74bd upstream. `fec_ptp_pps_perout()` reimplements logic already in `fec_ptp_read()`. Replace with function call. Signed-off-by: Csókás, Bence Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240812094713.2883476-2-csokas.bence@prolan.hu Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/fec_ptp.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index d56d37767506..a4eb6edb850a 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -235,13 +235,7 @@ static int fec_ptp_pps_perout(struct fec_enet_private *fep) timecounter_read(&fep->tc); /* Get the current ptp hardware time counter */ - temp_val = readl(fep->hwp + FEC_ATIME_CTRL); - temp_val |= FEC_T_CTRL_CAPTURE; - writel(temp_val, fep->hwp + FEC_ATIME_CTRL); - if (fep->quirks & FEC_QUIRK_BUG_CAPTURE) - udelay(1); - - ptp_hc = readl(fep->hwp + FEC_ATIME); + ptp_hc = fec_ptp_read(&fep->cc); /* Convert the ptp local counter to 1588 timestamp */ curr_time = timecounter_cyc2time(&fep->tc, ptp_hc); From 37d9305caace99024085dbe532e21df00f6f2bd0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 14 Oct 2024 16:06:00 +0200 Subject: [PATCH 048/125] mptcp: prevent MPC handshake on port-based signal endpoints commit 3d041393ea8c815f773020fb4a995331a69c0139 upstream. Syzkaller reported a lockdep splat: ============================================ WARNING: possible recursive locking detected 6.11.0-rc6-syzkaller-00019-g67784a74e258 #0 Not tainted -------------------------------------------- syz-executor364/5113 is trying to acquire lock: ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 but task is already holding lock: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(k-slock-AF_INET); lock(k-slock-AF_INET); *** DEADLOCK *** May be due to missing lock nesting notation 7 locks held by syz-executor364/5113: #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline] #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg+0x153/0x1b10 net/mptcp/protocol.c:1806 #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline] #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg_fastopen+0x11f/0x530 net/mptcp/protocol.c:1727 #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5f/0x1b80 net/ipv4/ip_output.c:470 #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1390 net/ipv4/ip_output.c:228 #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: local_lock_acquire include/linux/local_lock_internal.h:29 [inline] #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: process_backlog+0x33b/0x15b0 net/core/dev.c:6104 #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x230/0x5f0 net/ipv4/ip_input.c:232 #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 stack backtrace: CPU: 0 UID: 0 PID: 5113 Comm: syz-executor364 Not tainted 6.11.0-rc6-syzkaller-00019-g67784a74e258 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 check_deadlock kernel/locking/lockdep.c:3061 [inline] validate_chain+0x15d3/0x5900 kernel/locking/lockdep.c:3855 __lock_acquire+0x137a/0x2040 kernel/locking/lockdep.c:5142 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5759 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 mptcp_sk_clone_init+0x32/0x13c0 net/mptcp/protocol.c:3279 subflow_syn_recv_sock+0x931/0x1920 net/mptcp/subflow.c:874 tcp_check_req+0xfe4/0x1a20 net/ipv4/tcp_minisocks.c:853 tcp_v4_rcv+0x1c3e/0x37f0 net/ipv4/tcp_ipv4.c:2267 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5661 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5775 process_backlog+0x662/0x15b0 net/core/dev.c:6108 __napi_poll+0xcb/0x490 net/core/dev.c:6772 napi_poll net/core/dev.c:6841 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:6963 handle_softirqs+0x2c4/0x970 kernel/softirq.c:554 do_softirq+0x11b/0x1e0 kernel/softirq.c:455 __local_bh_enable_ip+0x1bb/0x200 kernel/softirq.c:382 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:908 [inline] __dev_queue_xmit+0x1763/0x3e90 net/core/dev.c:4450 dev_queue_xmit include/linux/netdevice.h:3105 [inline] neigh_hh_output include/net/neighbour.h:526 [inline] neigh_output include/net/neighbour.h:540 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:235 ip_local_out net/ipv4/ip_output.c:129 [inline] __ip_queue_xmit+0x118c/0x1b80 net/ipv4/ip_output.c:535 __tcp_transmit_skb+0x2544/0x3b30 net/ipv4/tcp_output.c:1466 tcp_rcv_synsent_state_process net/ipv4/tcp_input.c:6542 [inline] tcp_rcv_state_process+0x2c32/0x4570 net/ipv4/tcp_input.c:6729 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1934 sk_backlog_rcv include/net/sock.h:1111 [inline] __release_sock+0x214/0x350 net/core/sock.c:3004 release_sock+0x61/0x1f0 net/core/sock.c:3558 mptcp_sendmsg_fastopen+0x1ad/0x530 net/mptcp/protocol.c:1733 mptcp_sendmsg+0x1884/0x1b10 net/mptcp/protocol.c:1812 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597 ___sys_sendmsg net/socket.c:2651 [inline] __sys_sendmmsg+0x3b2/0x740 net/socket.c:2737 __do_sys_sendmmsg net/socket.c:2766 [inline] __se_sys_sendmmsg net/socket.c:2763 [inline] __x64_sys_sendmmsg+0xa0/0xb0 net/socket.c:2763 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f04fb13a6b9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 01 1a 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd651f42d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f04fb13a6b9 RDX: 0000000000000001 RSI: 0000000020000d00 RDI: 0000000000000004 RBP: 00007ffd651f4310 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000020000080 R11: 0000000000000246 R12: 00000000000f4240 R13: 00007f04fb187449 R14: 00007ffd651f42f4 R15: 00007ffd651f4300 As noted by Cong Wang, the splat is false positive, but the code path leading to the report is an unexpected one: a client is attempting an MPC handshake towards the in-kernel listener created by the in-kernel PM for a port based signal endpoint. Such connection will be never accepted; many of them can make the listener queue full and preventing the creation of MPJ subflow via such listener - its intended role. Explicitly detect this scenario at initial-syn time and drop the incoming MPC request. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Cc: stable@vger.kernel.org Reported-by: syzbot+f4aacdfef2c6a6529c3e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f4aacdfef2c6a6529c3e Cc: Cong Wang Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-1-7faea8e6b6ae@kernel.org Signed-off-by: Jakub Kicinski [ Conflicts in mib.[ch], because commit 6982826fe5e5 ("mptcp: fallback to TCP after SYN+MPC drops"), and commit 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") are linked to new features, not available in this version. Resolving the conflicts is easy, simply adding the new lines declaring the new "endpoint attempt" MIB entry. ] Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/pm_netlink.c | 1 + net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 11 +++++++++++ 5 files changed, 15 insertions(+) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 79f2278434b9..3dc49c3169f2 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -15,6 +15,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), + SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index e6e570251d32..007d7a749472 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -8,6 +8,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ + MPTCP_MIB_MPCAPABLEENDPATTEMPT, /* Prohibited MPC to port-based endp */ MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 54b49fc6efd9..d8c47ca86de4 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1125,6 +1125,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, */ inet_sk_state_store(newsk, TCP_LISTEN); lock_sock(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); err = __inet_listen_sk(ssk, backlog); if (!err) mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b9a4f6364b78..89d1c299ff2b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -504,6 +504,7 @@ struct mptcp_subflow_context { __unused : 9; enum mptcp_data_avail data_avail; bool scheduled; + bool pm_listener; /* a listener managed by the kernel PM? */ u32 remote_nonce; u64 thmac; u32 local_nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 38f46502d76e..282ecc8bf75e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -132,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) } } +static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb) +{ + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + return -EPERM; +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -163,6 +170,8 @@ static int subflow_check_req(struct request_sock *req, if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); + if (unlikely(listener->pm_listener)) + return subflow_reset_req_endp(req, skb); if (opt_mp_join) return 0; } else if (opt_mp_join) { @@ -170,6 +179,8 @@ static int subflow_check_req(struct request_sock *req, if (mp_opt.backup) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); + } else if (unlikely(listener->pm_listener)) { + return subflow_reset_req_endp(req, skb); } if (opt_mp_capable && listener->request_mptcp) { From fe2e0b6cd00abea3efac66de1da22d844364c1b0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 14 Oct 2024 09:37:44 +0800 Subject: [PATCH 049/125] iommu/vt-d: Fix incorrect pci_for_each_dma_alias() for non-PCI devices commit 6e02a277f1db24fa039e23783c8921c7b0e5b1b3 upstream. Previously, the domain_context_clear() function incorrectly called pci_for_each_dma_alias() to set up context entries for non-PCI devices. This could lead to kernel hangs or other unexpected behavior. Add a check to only call pci_for_each_dma_alias() for PCI devices. For non-PCI devices, domain_context_clear_one() is called directly. Reported-by: Todd Brandt Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219363 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219349 Fixes: 9a16ab9d6402 ("iommu/vt-d: Make context clearing consistent with context mapping") Cc: stable@vger.kernel.org Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20241014013744.102197-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/intel/iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index b7317016834c..3a7c647d3aff 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3925,8 +3925,10 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op */ static void domain_context_clear(struct device_domain_info *info) { - if (!dev_is_pci(info->dev)) + if (!dev_is_pci(info->dev)) { domain_context_clear_one(info, info->bus, info->devfn); + return; + } pci_for_each_dma_alias(to_pci_dev(info->dev), &domain_context_clear_one_cb, info); From ccea29b1e072d311fc532716442ef43d73caecc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 14 Oct 2024 07:50:06 +0200 Subject: [PATCH 050/125] s390/sclp: Deactivate sclp after all its users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0d9dc27df22d9b5c8dc7185c8dddbc14f5468518 upstream. On reboot the SCLP interface is deactivated through a reboot notifier. This happens before other components using SCLP have the chance to run their own reboot notifiers. Two of those components are the SCLP console and tty drivers which try to flush the last outstanding messages. At that point the SCLP interface is already unusable and the messages are discarded. Execute sclp_deactivate() as late as possible to avoid this issue. Fixes: 4ae46db99cd8 ("s390/consoles: improve panic notifiers reliability") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Reviewed-by: Sven Schnelle Link: https://lore.kernel.org/r/20241014-s390-kunit-v1-1-941defa765a6@linutronix.de Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- drivers/s390/char/sclp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index ba10f9b8fac7..6fa0fb35e521 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -1195,7 +1195,8 @@ sclp_reboot_event(struct notifier_block *this, unsigned long event, void *ptr) } static struct notifier_block sclp_reboot_notifier = { - .notifier_call = sclp_reboot_event + .notifier_call = sclp_reboot_event, + .priority = INT_MIN, }; static ssize_t con_pages_show(struct device_driver *dev, char *buf) From 82b433fb9b13b755a979c5a3b723963e985c5977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 14 Oct 2024 07:50:07 +0200 Subject: [PATCH 051/125] s390/sclp_vt220: Convert newlines to CRLF instead of LFCR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit dee3df68ab4b00fff6bdf9fc39541729af37307c upstream. According to the VT220 specification the possible character combinations sent on RETURN are only CR or CRLF [0]. The Return key sends either a CR character (0/13) or a CR character (0/13) and an LF character (0/10), depending on the set/reset state of line feed/new line mode (LNM). The sclp/vt220 driver however uses LFCR. This can confuse tools, for example the kunit runner. Link: https://vt100.net/docs/vt220-rm/chapter3.html#S3.2 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Reviewed-by: Sven Schnelle Link: https://lore.kernel.org/r/20241014-s390-kunit-v1-2-941defa765a6@linutronix.de Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- drivers/s390/char/sclp_vt220.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 218ae604f737..33b9c968dbcb 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -319,7 +319,7 @@ sclp_vt220_add_msg(struct sclp_vt220_request *request, buffer = (void *) ((addr_t) sccb + sccb->header.length); if (convertlf) { - /* Perform Linefeed conversion (0x0a -> 0x0a 0x0d)*/ + /* Perform Linefeed conversion (0x0a -> 0x0d 0x0a)*/ for (from=0, to=0; (from < count) && (to < sclp_vt220_space_left(request)); from++) { @@ -328,8 +328,8 @@ sclp_vt220_add_msg(struct sclp_vt220_request *request, /* Perform conversion */ if (c == 0x0a) { if (to + 1 < sclp_vt220_space_left(request)) { - ((unsigned char *) buffer)[to++] = c; ((unsigned char *) buffer)[to++] = 0x0d; + ((unsigned char *) buffer)[to++] = c; } else break; From 88607ed93ee6361e0a969dc574c12654cc7d9399 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Tue, 17 Sep 2024 17:18:33 +0200 Subject: [PATCH 052/125] KVM: s390: gaccess: Check if guest address is in memslot commit e8061f06185be0a06a73760d6526b8b0feadfe52 upstream. Previously, access_guest_page() did not check whether the given guest address is inside of a memslot. This is not a problem, since kvm_write_guest_page/kvm_read_guest_page return -EFAULT in this case. However, -EFAULT is also returned when copy_to/from_user fails. When emulating a guest instruction, the address being outside a memslot usually means that an addressing exception should be injected into the guest. Failure in copy_to/from_user however indicates that something is wrong in userspace and hence should be handled there. To be able to distinguish these two cases, return PGM_ADDRESSING in access_guest_page() when the guest address is outside guest memory. In access_guest_real(), populate vcpu->arch.pgm.code such that kvm_s390_inject_prog_cond() can be used in the caller for injecting into the guest (if applicable). Since this adds a new return value to access_guest_page(), we need to make sure that other callers are not confused by the new positive return value. There are the following users of access_guest_page(): - access_guest_with_key() does the checking itself (in guest_range_to_gpas()), so this case should never happen. Even if, the handling is set up properly. - access_guest_real() just passes the return code to its callers, which are: - read_guest_real() - see below - write_guest_real() - see below There are the following users of read_guest_real(): - ar_translation() in gaccess.c which already returns PGM_* - setup_apcb10(), setup_apcb00(), setup_apcb11() in vsie.c which always return -EFAULT on read_guest_read() nonzero return - no change - shadow_crycb(), handle_stfle() always present this as validity, this could be handled better but doesn't change current behaviour - no change There are the following users of write_guest_real(): - kvm_s390_store_status_unloaded() always returns -EFAULT on write_guest_real() failure. Fixes: 2293897805c2 ("KVM: s390: add architecture compliant guest access functions") Cc: stable@vger.kernel.org Signed-off-by: Nico Boehr Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/r/20240917151904.74314-2-nrb@linux.ibm.com Acked-by: Janosch Frank Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- arch/s390/kvm/gaccess.c | 4 ++++ arch/s390/kvm/gaccess.h | 14 ++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index ff8349d17b33..090dc3833433 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1001,6 +1001,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, const gfn_t gfn = gpa_to_gfn(gpa); int rc; + if (!gfn_to_memslot(kvm, gfn)) + return PGM_ADDRESSING; if (mode == GACC_STORE) rc = kvm_write_guest_page(kvm, gfn, data, offset, len); else @@ -1158,6 +1160,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, gra += fragment_len; data += fragment_len; } + if (rc > 0) + vcpu->arch.pgm.code = rc; return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index b320d12aa049..3fde45a151f2 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data, * @len: number of bytes to copy * * Copy @len bytes from @data (kernel space) to @gra (guest real address). - * It is up to the caller to ensure that the entire guest memory range is - * valid memory before calling this function. * Guest low address and key protection are not checked. * - * Returns zero on success or -EFAULT on error. + * Returns zero on success, -EFAULT when copying from @data failed, or + * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info + * is also stored to allow injecting into the guest (if applicable) using + * kvm_s390_inject_prog_cond(). * * If an error occurs data may have been copied partially to guest memory. */ @@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, * @len: number of bytes to copy * * Copy @len bytes from @gra (guest real address) to @data (kernel space). - * It is up to the caller to ensure that the entire guest memory range is - * valid memory before calling this function. * Guest key protection is not checked. * - * Returns zero on success or -EFAULT on error. + * Returns zero on success, -EFAULT when copying to @data failed, or + * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info + * is also stored to allow injecting into the guest (if applicable) using + * kvm_s390_inject_prog_cond(). * * If an error occurs data may have been copied partially to kernel space. */ From 369535232d6acd67f360c7da474104efd0f50c82 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Tue, 17 Sep 2024 17:18:34 +0200 Subject: [PATCH 053/125] KVM: s390: Change virtual to physical address access in diag 0x258 handler commit cad4b3d4ab1f062708fff33f44d246853f51e966 upstream. The parameters for the diag 0x258 are real addresses, not virtual, but KVM was using them as virtual addresses. This only happened to work, since the Linux kernel as a guest used to have a 1:1 mapping for physical vs virtual addresses. Fix KVM so that it correctly uses the addresses as real addresses. Cc: stable@vger.kernel.org Fixes: 8ae04b8f500b ("KVM: s390: Guest's memory access functions get access registers") Suggested-by: Vasily Gorbik Signed-off-by: Michael Mueller Signed-off-by: Nico Boehr Reviewed-by: Christian Borntraeger Reviewed-by: Heiko Carstens Link: https://lore.kernel.org/r/20240917151904.74314-3-nrb@linux.ibm.com Acked-by: Janosch Frank Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- arch/s390/kvm/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3c65b8258ae6..2cc3ec034046 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -77,7 +77,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) vcpu->stat.instruction_diagnose_258++; if (vcpu->run->s.regs.gprs[rx] & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); + rc = read_guest_real(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm)); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258) From 6ccc10ed0a7d3e6bf74619e09ed09e9d6d09b238 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 13 Sep 2024 10:32:27 -0700 Subject: [PATCH 054/125] x86/cpufeatures: Define X86_FEATURE_AMD_IBPB_RET commit ff898623af2ed564300752bba83a680a1e4fec8d upstream. AMD's initial implementation of IBPB did not clear the return address predictor. Beginning with Zen4, AMD's IBPB *does* clear the return address predictor. This behavior is enumerated by CPUID.80000008H:EBX.IBPB_RET[30]. Define X86_FEATURE_AMD_IBPB_RET for use in KVM_GET_SUPPORTED_CPUID, when determining cross-vendor capabilities. Suggested-by: Venkatesh Srinivas Signed-off-by: Jim Mattson Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8c1593dd2c31..ff55d5cb5bee 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -216,7 +216,7 @@ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier without a guaranteed RSB flush */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ @@ -347,6 +347,7 @@ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ #define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */ #define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_AMD_IBPB_RET (13*32+30) /* "" IBPB clears return address predictor */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ From b28d3f44e20cf37e74d87fc453234865f26c41e1 Mon Sep 17 00:00:00 2001 From: Johannes Wikner Date: Mon, 23 Sep 2024 20:49:34 +0200 Subject: [PATCH 055/125] x86/cpufeatures: Add a IBPB_NO_RET BUG flag commit 3ea87dfa31a7b0bb0ff1675e67b9e54883013074 upstream. Set this flag if the CPU has an IBPB implementation that does not invalidate return target predictions. Zen generations < 4 do not flush the RSB when executing an IBPB and this bug flag denotes that. [ bp: Massage. ] Signed-off-by: Johannes Wikner Signed-off-by: Borislav Petkov (AMD) Cc: Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ff55d5cb5bee..55d18eef6775 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -517,4 +517,5 @@ #define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ +#define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 74d566263467..7a1e58fb43a0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1483,6 +1483,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); + if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) + setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; From 61211f2da0be9ae670afa4b6796bd15a385fd954 Mon Sep 17 00:00:00 2001 From: Johannes Wikner Date: Mon, 23 Sep 2024 20:49:36 +0200 Subject: [PATCH 056/125] x86/entry: Have entry_ibpb() invalidate return predictions commit 50e4b3b94090babe8d4bb85c95f0d3e6b07ea86e upstream. entry_ibpb() should invalidate all indirect predictions, including return target predictions. Not all IBPB implementations do this, in which case the fallback is RSB filling. Prevent SRSO-style hijacks of return predictions following IBPB, as the return target predictor can be corrupted before the IBPB completes. [ bp: Massage. ] Signed-off-by: Johannes Wikner Signed-off-by: Borislav Petkov (AMD) Cc: Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 718c00367f9a..34eca8015b64 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -9,6 +9,8 @@ #include #include #include +#include +#include .pushsection .noinstr.text, "ax" @@ -17,6 +19,9 @@ SYM_FUNC_START(entry_ibpb) movl $PRED_CMD_IBPB, %eax xorl %edx, %edx wrmsr + + /* Make sure IBPB clears return stack preductions too. */ + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET SYM_FUNC_END(entry_ibpb) /* For KVM */ From c42a343158ea44933b21a7960045f9ae942acf76 Mon Sep 17 00:00:00 2001 From: Johannes Wikner Date: Tue, 8 Oct 2024 12:36:30 +0200 Subject: [PATCH 057/125] x86/bugs: Skip RSB fill at VMEXIT commit 0fad2878642ec46225af2054564932745ac5c765 upstream. entry_ibpb() is designed to follow Intel's IBPB specification regardless of CPU. This includes invalidating RSB entries. Hence, if IBPB on VMEXIT has been selected, entry_ibpb() as part of the RET untraining in the VMEXIT path will take care of all BTB and RSB clearing so there's no need to explicitly fill the RSB anymore. [ bp: Massage commit message. ] Suggested-by: Borislav Petkov Signed-off-by: Johannes Wikner Cc: Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c2dc9b7426ac..1919f1b50895 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1115,6 +1115,14 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; case RETBLEED_MITIGATION_STUFF: @@ -2622,6 +2630,13 @@ static void __init srso_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); From c04670dffb3fd43fe3547d585ae40ce3424a41d8 Mon Sep 17 00:00:00 2001 From: Johannes Wikner Date: Tue, 8 Oct 2024 12:58:03 +0200 Subject: [PATCH 058/125] x86/bugs: Do not use UNTRAIN_RET with IBPB on entry commit c62fa117c32bd1abed9304c58e0da6940f8c7fc2 upstream. Since X86_FEATURE_ENTRY_IBPB will invalidate all harmful predictions with IBPB, no software-based untraining of returns is needed anymore. Currently, this change affects retbleed and SRSO mitigations so if either of the mitigations is doing IBPB and the other one does the software sequence, the latter is not needed anymore. [ bp: Massage commit message. ] Suggested-by: Borislav Petkov Signed-off-by: Johannes Wikner Cc: Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1919f1b50895..7b5ba5b8592a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1113,6 +1113,15 @@ do_cmd_auto: case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like SRSO has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; @@ -2618,6 +2627,14 @@ static void __init srso_select_mitigation(void) if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); } } else { pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); From 4c5b123ab289767afe940389dbb963c5c05e594e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 15 Oct 2024 10:59:46 -0700 Subject: [PATCH 059/125] blk-rq-qos: fix crash on rq_qos_wait vs. rq_qos_wake_function race commit e972b08b91ef48488bae9789f03cfedb148667fb upstream. We're seeing crashes from rq_qos_wake_function that look like this: BUG: unable to handle page fault for address: ffffafe180a40084 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 100000067 P4D 100000067 PUD 10027c067 PMD 10115d067 PTE 0 Oops: Oops: 0002 [#1] PREEMPT SMP PTI CPU: 17 UID: 0 PID: 0 Comm: swapper/17 Not tainted 6.12.0-rc3-00013-geca631b8fe80 #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:_raw_spin_lock_irqsave+0x1d/0x40 Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 41 54 9c 41 5c fa 65 ff 05 62 97 30 4c 31 c0 ba 01 00 00 00 0f b1 17 75 0a 4c 89 e0 41 5c c3 cc cc cc cc 89 c6 e8 2c 0b 00 RSP: 0018:ffffafe180580ca0 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffffafe180a3f7a8 RCX: 0000000000000011 RDX: 0000000000000001 RSI: 0000000000000003 RDI: ffffafe180a40084 RBP: 0000000000000000 R08: 00000000001e7240 R09: 0000000000000011 R10: 0000000000000028 R11: 0000000000000888 R12: 0000000000000002 R13: ffffafe180a40084 R14: 0000000000000000 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff9aaf1f280000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffafe180a40084 CR3: 000000010e428002 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: try_to_wake_up+0x5a/0x6a0 rq_qos_wake_function+0x71/0x80 __wake_up_common+0x75/0xa0 __wake_up+0x36/0x60 scale_up.part.0+0x50/0x110 wb_timer_fn+0x227/0x450 ... So rq_qos_wake_function() calls wake_up_process(data->task), which calls try_to_wake_up(), which faults in raw_spin_lock_irqsave(&p->pi_lock). p comes from data->task, and data comes from the waitqueue entry, which is stored on the waiter's stack in rq_qos_wait(). Analyzing the core dump with drgn, I found that the waiter had already woken up and moved on to a completely unrelated code path, clobbering what was previously data->task. Meanwhile, the waker was passing the clobbered garbage in data->task to wake_up_process(), leading to the crash. What's happening is that in between rq_qos_wake_function() deleting the waitqueue entry and calling wake_up_process(), rq_qos_wait() is finding that it already got a token and returning. The race looks like this: rq_qos_wait() rq_qos_wake_function() ============================================================== prepare_to_wait_exclusive() data->got_token = true; list_del_init(&curr->entry); if (data.got_token) break; finish_wait(&rqw->wait, &data.wq); ^- returns immediately because list_empty_careful(&wq_entry->entry) is true ... return, go do something else ... wake_up_process(data->task) (NO LONGER VALID!)-^ Normally, finish_wait() is supposed to synchronize against the waker. But, as noted above, it is returning immediately because the waitqueue entry has already been removed from the waitqueue. The bug is that rq_qos_wake_function() is accessing the waitqueue entry AFTER deleting it. Note that autoremove_wake_function() wakes the waiter and THEN deletes the waitqueue entry, which is the proper order. Fix it by swapping the order. We also need to use list_del_init_careful() to match the list_empty_careful() in finish_wait(). Fixes: 38cfb5a45ee0 ("blk-wbt: improve waking of tasks") Cc: stable@vger.kernel.org Signed-off-by: Omar Sandoval Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/d3bee2463a67b1ee597211823bf7ad3721c26e41.1729014591.git.osandov@fb.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-rq-qos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index dd7310c94713..dc510f493ba5 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -219,8 +219,8 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, data->got_token = true; smp_wmb(); - list_del_init(&curr->entry); wake_up_process(data->task); + list_del_init_careful(&curr->entry); return 1; } From 2762b3cc9094f7cd8d71577485375611cf736953 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Oct 2024 08:58:25 -0600 Subject: [PATCH 060/125] io_uring/sqpoll: close race on waiting for sqring entries commit 28aabffae6be54284869a91cd8bccd3720041129 upstream. When an application uses SQPOLL, it must wait for the SQPOLL thread to consume SQE entries, if it fails to get an sqe when calling io_uring_get_sqe(). It can do so by calling io_uring_enter(2) with the flag value of IORING_ENTER_SQ_WAIT. In liburing, this is generally done with io_uring_sqring_wait(). There's a natural expectation that once this call returns, a new SQE entry can be retrieved, filled out, and submitted. However, the kernel uses the cached sq head to determine if the SQRING is full or not. If the SQPOLL thread is currently in the process of submitting SQE entries, it may have updated the cached sq head, but not yet committed it to the SQ ring. Hence the kernel may find that there are SQE entries ready to be consumed, and return successfully to the application. If the SQPOLL thread hasn't yet committed the SQ ring entries by the time the application returns to userspace and attempts to get a new SQE, it will fail getting a new SQE. Fix this by having io_sqring_full() always use the user visible SQ ring head entry, rather than the internally cached one. Cc: stable@vger.kernel.org # 5.10+ Link: https://github.com/axboe/liburing/discussions/1267 Reported-by: Benedek Thaler Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/io_uring.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 19ac1b2f1ea4..8242820742ee 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -262,7 +262,14 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; + /* + * SQPOLL must use the actual sqring head, as using the cached_sq_head + * is race prone if the SQPOLL thread has grabbed entries but not yet + * committed them to the ring. For !SQPOLL, this doesn't matter, but + * since this helper is just used for SQPOLL sqring waits (or POLLOUT), + * just read the actual sqring head unconditionally. + */ + return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries; } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) From 6414ab5c9c9c068eca6dc4fd3a036bc4b83164dc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Oct 2024 21:48:47 +0800 Subject: [PATCH 061/125] ublk: don't allow user copy for unprivileged device commit 42aafd8b48adac1c3b20fe5892b1b91b80c1a1e6 upstream. UBLK_F_USER_COPY requires userspace to call write() on ublk char device for filling request buffer, and unprivileged device can't be trusted. So don't allow user copy for unprivileged device. Cc: stable@vger.kernel.org Fixes: 1172d5b8beca ("ublk: support user copy") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241016134847.2911721-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/ublk_drv.c | 11 ++++++++++- include/uapi/linux/ublk_cmd.h | 8 +++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9cafbce1faf3..f31607a24f57 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2327,10 +2327,19 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) * TODO: provide forward progress for RECOVERY handler, so that * unprivileged device can benefit from it */ - if (info.flags & UBLK_F_UNPRIVILEGED_DEV) + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | UBLK_F_USER_RECOVERY); + /* + * For USER_COPY, we depends on userspace to fill request + * buffer by pwrite() to ublk char device, which can't be + * used for unprivileged device + */ + if (info.flags & UBLK_F_USER_COPY) + return -EINVAL; + } + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index b9cfc5c96268..3830b428ecf1 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -173,7 +173,13 @@ /* use ioctl encoding for uring command */ #define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6) -/* Copy between request and user buffer by pread()/pwrite() */ +/* + * Copy between request and user buffer by pread()/pwrite() + * + * Not available for UBLK_F_UNPRIVILEGED_DEV, otherwise userspace may + * deceive us by not filling request buffer, then kernel uninitialized + * data may be leaked. + */ #define UBLK_F_USER_COPY (1UL << 7) /* From a16af52f244272d3f1a91daa4a6dc1d2c5815710 Mon Sep 17 00:00:00 2001 From: Yun Lu Date: Tue, 15 Oct 2024 17:15:20 +0800 Subject: [PATCH 062/125] selftest: hid: add the missing tests directory commit fe05c40ca9c18cfdb003f639a30fc78a7ab49519 upstream. Commit 160c826b4dd0 ("selftest: hid: add missing run-hid-tools-tests.sh") has added the run-hid-tools-tests.sh script for it to be installed, but I forgot to add the tests directory together. If running the test case without the tests directory, will results in the following error message: make -C tools/testing/selftests/ TARGETS=hid install \ INSTALL_PATH=$KSFT_INSTALL_PATH cd $KSFT_INSTALL_PATH ./run_kselftest.sh -t hid:hid-core.sh /usr/lib/python3.11/site-packages/_pytest/config/__init__.py:331: PluggyTeardownRaisedWarning: A plugin raised an exception during an old-style hookwrapper teardown. Plugin: helpconfig, Hook: pytest_cmdline_parse UsageError: usage: __main__.py [options] [file_or_dir] [file_or_dir] [...] __main__.py: error: unrecognized arguments: --udevd inifile: None rootdir: /root/linux/kselftest_install/hid In fact, the run-hid-tools-tests.sh script uses the scripts in the tests directory to run tests. The tests directory also needs to be added to be installed. Fixes: ffb85d5c9e80 ("selftests: hid: import hid-tools hid-core tests") Cc: stable@vger.kernel.org Signed-off-by: Yun Lu Acked-by: Benjamin Tissoires Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/hid/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile index 87b6f5f83d7e..2e75fb30f3a5 100644 --- a/tools/testing/selftests/hid/Makefile +++ b/tools/testing/selftests/hid/Makefile @@ -18,6 +18,7 @@ TEST_PROGS += hid-usb_crash.sh TEST_PROGS += hid-wacom.sh TEST_FILES := run-hid-tools-tests.sh +TEST_FILES += tests CXX ?= $(CROSS_COMPILE)g++ From d3c4f41ae32ca3636e1364e5b84de22415b3cabe Mon Sep 17 00:00:00 2001 From: John Edwards Date: Thu, 10 Oct 2024 23:09:23 +0000 Subject: [PATCH 063/125] Input: xpad - add support for MSI Claw A1M commit 22a18935d7d96bbb1a28076f843c1926d0ba189e upstream. Add MSI Claw A1M controller to xpad_device match table when in xinput mode. Add MSI VID as XPAD_XBOX360_VENDOR. Signed-off-by: John Edwards Reviewed-by: Derek J. Clark Reviewed-by: Christopher Snowhill Link: https://lore.kernel.org/r/20241010232020.3292284-4-uejji@uejji.net Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index daf2a46521f0..1cb47488375b 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -217,6 +217,7 @@ static const struct xpad_device { { 0x0c12, 0x8810, "Zeroplus Xbox Controller", 0, XTYPE_XBOX }, { 0x0c12, 0x9902, "HAMA VibraX - *FAULTY HARDWARE*", 0, XTYPE_XBOX }, { 0x0d2f, 0x0002, "Andamiro Pump It Up pad", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX }, + { 0x0db0, 0x1901, "Micro Star International Xbox360 Controller for Windows", 0, XTYPE_XBOX360 }, { 0x0e4c, 0x1097, "Radica Gamester Controller", 0, XTYPE_XBOX }, { 0x0e4c, 0x1103, "Radica Gamester Reflex", MAP_TRIGGERS_TO_BUTTONS, XTYPE_XBOX }, { 0x0e4c, 0x2390, "Radica Games Jtech Controller", 0, XTYPE_XBOX }, @@ -486,6 +487,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOX360_VENDOR(0x07ff), /* Mad Catz Gamepad */ XPAD_XBOXONE_VENDOR(0x0b05), /* ASUS controllers */ XPAD_XBOX360_VENDOR(0x0c12), /* Zeroplus X-Box 360 controllers */ + XPAD_XBOX360_VENDOR(0x0db0), /* Micro Star International X-Box 360 controllers */ XPAD_XBOX360_VENDOR(0x0e6f), /* 0x0e6f Xbox 360 controllers */ XPAD_XBOXONE_VENDOR(0x0e6f), /* 0x0e6f Xbox One controllers */ XPAD_XBOX360_VENDOR(0x0f0d), /* Hori controllers */ From 7bd9af254275fad7071d85f04616560deb598d7d Mon Sep 17 00:00:00 2001 From: Seunghwan Baek Date: Thu, 29 Aug 2024 18:39:13 +0900 Subject: [PATCH 064/125] scsi: ufs: core: Set SDEV_OFFLINE when UFS is shut down commit 19a198b67767d952c8f3d0cf24eb3100522a8223 upstream. There is a history of deadlock if reboot is performed at the beginning of booting. SDEV_QUIESCE was set for all LU's scsi_devices by UFS shutdown, and at that time the audio driver was waiting on blk_mq_submit_bio() holding a mutex_lock while reading the fw binary. After that, a deadlock issue occurred while audio driver shutdown was waiting for mutex_unlock of blk_mq_submit_bio(). To solve this, set SDEV_OFFLINE for all LUs except WLUN, so that any I/O that comes down after a UFS shutdown will return an error. [ 31.907781]I[0: swapper/0: 0] 1 130705007 1651079834 11289729804 0 D( 2) 3 ffffff882e208000 * init [device_shutdown] [ 31.907793]I[0: swapper/0: 0] Mutex: 0xffffff8849a2b8b0: owner[0xffffff882e28cb00 kworker/6:0 :49] [ 31.907806]I[0: swapper/0: 0] Call trace: [ 31.907810]I[0: swapper/0: 0] __switch_to+0x174/0x338 [ 31.907819]I[0: swapper/0: 0] __schedule+0x5ec/0x9cc [ 31.907826]I[0: swapper/0: 0] schedule+0x7c/0xe8 [ 31.907834]I[0: swapper/0: 0] schedule_preempt_disabled+0x24/0x40 [ 31.907842]I[0: swapper/0: 0] __mutex_lock+0x408/0xdac [ 31.907849]I[0: swapper/0: 0] __mutex_lock_slowpath+0x14/0x24 [ 31.907858]I[0: swapper/0: 0] mutex_lock+0x40/0xec [ 31.907866]I[0: swapper/0: 0] device_shutdown+0x108/0x280 [ 31.907875]I[0: swapper/0: 0] kernel_restart+0x4c/0x11c [ 31.907883]I[0: swapper/0: 0] __arm64_sys_reboot+0x15c/0x280 [ 31.907890]I[0: swapper/0: 0] invoke_syscall+0x70/0x158 [ 31.907899]I[0: swapper/0: 0] el0_svc_common+0xb4/0xf4 [ 31.907909]I[0: swapper/0: 0] do_el0_svc+0x2c/0xb0 [ 31.907918]I[0: swapper/0: 0] el0_svc+0x34/0xe0 [ 31.907928]I[0: swapper/0: 0] el0t_64_sync_handler+0x68/0xb4 [ 31.907937]I[0: swapper/0: 0] el0t_64_sync+0x1a0/0x1a4 [ 31.908774]I[0: swapper/0: 0] 49 0 11960702 11236868007 0 D( 2) 6 ffffff882e28cb00 * kworker/6:0 [__bio_queue_enter] [ 31.908783]I[0: swapper/0: 0] Call trace: [ 31.908788]I[0: swapper/0: 0] __switch_to+0x174/0x338 [ 31.908796]I[0: swapper/0: 0] __schedule+0x5ec/0x9cc [ 31.908803]I[0: swapper/0: 0] schedule+0x7c/0xe8 [ 31.908811]I[0: swapper/0: 0] __bio_queue_enter+0xb8/0x178 [ 31.908818]I[0: swapper/0: 0] blk_mq_submit_bio+0x194/0x67c [ 31.908827]I[0: swapper/0: 0] __submit_bio+0xb8/0x19c Fixes: b294ff3e3449 ("scsi: ufs: core: Enable power management for wlun") Cc: stable@vger.kernel.org Signed-off-by: Seunghwan Baek Link: https://lore.kernel.org/r/20240829093913.6282-2-sh8267.baek@samsung.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/ufs/core/ufshcd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 59d53ea100c0..db4044358e22 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10100,7 +10100,9 @@ static void ufshcd_wl_shutdown(struct device *dev) shost_for_each_device(sdev, hba->host) { if (sdev == hba->ufs_device_wlun) continue; - scsi_device_quiesce(sdev); + mutex_lock(&sdev->state_mutex); + scsi_device_set_state(sdev, SDEV_OFFLINE); + mutex_unlock(&sdev->state_mutex); } __ufshcd_wl_suspend(hba, UFS_SHUTDOWN_PM); From 8e6ca01b3b8d8a143e36337095386663de0672ce Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Tue, 1 Oct 2024 17:19:16 +0800 Subject: [PATCH 065/125] scsi: ufs: core: Fix the issue of ICU failure commit bf0c6cc73f7f91ec70307f7c72343f6cb7d65d01 upstream. When setting the ICU bit without using read-modify-write, SQRTCy will restart SQ again and receive an RTC return error code 2 (Failure - SQ not stopped). Additionally, the error log has been modified so that this type of error can be observed. Fixes: ab248643d3d6 ("scsi: ufs: core: Add error handling for MCQ mode") Cc: stable@vger.kernel.org Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20241001091917.6917-2-peter.wang@mediatek.com Reviewed-by: Bao D. Nguyen Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/ufs/core/ufs-mcq.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index af5ce45315b3..da8c1734d333 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -498,7 +498,7 @@ int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag) struct scsi_cmnd *cmd = lrbp->cmd; struct ufs_hw_queue *hwq; void __iomem *reg, *opr_sqd_base; - u32 nexus, id, val; + u32 nexus, id, val, rtc; int err; if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_RTC) @@ -528,17 +528,18 @@ int ufshcd_mcq_sq_cleanup(struct ufs_hba *hba, int task_tag) opr_sqd_base = mcq_opr_base(hba, OPR_SQD, id); writel(nexus, opr_sqd_base + REG_SQCTI); - /* SQRTCy.ICU = 1 */ - writel(SQ_ICU, opr_sqd_base + REG_SQRTC); + /* Initiate Cleanup */ + writel(readl(opr_sqd_base + REG_SQRTC) | SQ_ICU, + opr_sqd_base + REG_SQRTC); /* Poll SQRTSy.CUS = 1. Return result from SQRTSy.RTC */ reg = opr_sqd_base + REG_SQRTS; err = read_poll_timeout(readl, val, val & SQ_CUS, 20, MCQ_POLL_US, false, reg); - if (err) - dev_err(hba->dev, "%s: failed. hwq=%d, tag=%d err=%ld\n", - __func__, id, task_tag, - FIELD_GET(SQ_ICU_ERR_CODE_MASK, readl(reg))); + rtc = FIELD_GET(SQ_ICU_ERR_CODE_MASK, readl(reg)); + if (err || rtc) + dev_err(hba->dev, "%s: failed. hwq=%d, tag=%d err=%d RTC=%d\n", + __func__, id, task_tag, err, rtc); if (ufshcd_mcq_sq_start(hba, hwq)) err = -ETIMEDOUT; From 1a235af0216411a32ab4db54f7bd19020b46c86d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 14 Oct 2024 19:09:36 +0300 Subject: [PATCH 066/125] drm/radeon: Fix encoder->possible_clones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 28127dba64d8ae1a0b737b973d6d029908599611 upstream. Include the encoder itself in its possible_clones bitmask. In the past nothing validated that drivers were populating possible_clones correctly, but that changed in commit 74d2aacbe840 ("drm: Validate encoder->possible_clones"). Looks like radeon never got the memo and is still not following the rules 100% correctly. This results in some warnings during driver initialization: Bogus possible_clones: [ENCODER:46:TV-46] possible_clones=0x4 (full encoder mask=0x7) WARNING: CPU: 0 PID: 170 at drivers/gpu/drm/drm_mode_config.c:615 drm_mode_config_validate+0x113/0x39c ... Cc: Alex Deucher Cc: amd-gfx@lists.freedesktop.org Fixes: 74d2aacbe840 ("drm: Validate encoder->possible_clones") Reported-by: Erhard Furtner Closes: https://lore.kernel.org/dri-devel/20241009000321.418e4294@yea/ Tested-by: Erhard Furtner Signed-off-by: Ville Syrjälä Signed-off-by: Alex Deucher (cherry picked from commit 3b6e7d40649c0d75572039aff9d0911864c689db) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_encoders.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_encoders.c b/drivers/gpu/drm/radeon/radeon_encoders.c index 9cb6401fe97e..bb908f125269 100644 --- a/drivers/gpu/drm/radeon/radeon_encoders.c +++ b/drivers/gpu/drm/radeon/radeon_encoders.c @@ -42,7 +42,7 @@ static uint32_t radeon_encoder_clones(struct drm_encoder *encoder) struct radeon_device *rdev = dev->dev_private; struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder); struct drm_encoder *clone_encoder; - uint32_t index_mask = 0; + uint32_t index_mask = drm_encoder_mask(encoder); int count; /* DIG routing gets problematic */ From 7c0763fd79748e918fc93890d0fef2d093ce4caf Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Wed, 2 Oct 2024 15:24:29 +0300 Subject: [PATCH 067/125] drm/vmwgfx: Handle surface check failure correctly commit 26498b8d54373d31a621d7dec95c4bd842563b3b upstream. Currently if condition (!bo and !vmw_kms_srf_ok()) was met we go to err_out with ret == 0. err_out dereferences vfb if ret == 0, but in our case vfb is still NULL. Fix this by assigning sensible error to ret. Found by Linux Verification Center (linuxtesting.org) with SVACE Signed-off-by: Nikolay Kuratov Cc: stable@vger.kernel.org Fixes: 810b3e1683d0 ("drm/vmwgfx: Support topology greater than texture size") Signed-off-by: Zack Rusin Link: https://patchwork.freedesktop.org/patch/msgid/20241002122429.1981822-1-kniv@yandex-team.ru Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 08f2470edab2..11f7c0e5420e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -1659,6 +1659,7 @@ static struct drm_framebuffer *vmw_kms_fb_create(struct drm_device *dev, DRM_ERROR("Surface size cannot exceed %dx%d\n", dev_priv->texture_max_width, dev_priv->texture_max_height); + ret = -EINVAL; goto err_out; } From db0978d3ed8b1f69b6e09c3ccb66897fd2a9f6d7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 2 Oct 2024 10:22:30 -0400 Subject: [PATCH 068/125] drm/amdgpu/swsmu: Only force workload setup on init commit cb07c8338fc2b9d5f949a19d4a07ee4d5ecf8793 upstream. Needed to set the workload type at init time so that we can apply the navi3x margin optimization. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3618 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3131 Fixes: c50fe289ed72 ("drm/amdgpu/swsmu: always force a state reprogram on init") Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher (cherry picked from commit 580ad7cbd4b7be8d2cb5ab5c1fca6bb76045eb0e) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index c0d7fe5102e6..4d17b6958397 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1843,7 +1843,7 @@ static int smu_bump_power_profile_mode(struct smu_context *smu, static int smu_adjust_power_state_dynamic(struct smu_context *smu, enum amd_dpm_forced_level level, bool skip_display_settings, - bool force_update) + bool init) { int ret = 0; int index = 0; @@ -1872,7 +1872,7 @@ static int smu_adjust_power_state_dynamic(struct smu_context *smu, } } - if (force_update || smu_dpm_ctx->dpm_level != level) { + if (smu_dpm_ctx->dpm_level != level) { ret = smu_asic_set_performance_level(smu, level); if (ret) { dev_err(smu->adev->dev, "Failed to set performance level!"); @@ -1889,7 +1889,7 @@ static int smu_adjust_power_state_dynamic(struct smu_context *smu, index = index > 0 && index <= WORKLOAD_POLICY_MAX ? index - 1 : 0; workload[0] = smu->workload_setting[index]; - if (force_update || smu->power_profile_mode != workload[0]) + if (init || smu->power_profile_mode != workload[0]) smu_bump_power_profile_mode(smu, workload, 0); } From 64cf93b87fe34e72557de294718019c0ad2931b4 Mon Sep 17 00:00:00 2001 From: Mohammed Anees Date: Wed, 9 Oct 2024 17:58:31 +0530 Subject: [PATCH 069/125] drm/amdgpu: prevent BO_HANDLES error from being overwritten MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c0ec082f10b7a1fd25e8c1e2a686440da913b7a3 upstream. Before this patch, if multiple BO_HANDLES chunks were submitted, the error -EINVAL would be correctly set but could be overwritten by the return value from amdgpu_cs_p1_bo_handles(). This patch ensures that if there are multiple BO_HANDLES, we stop. Fixes: fec5f8e8c6bc ("drm/amdgpu: disallow multiple BO_HANDLES chunks in one submit") Signed-off-by: Mohammed Anees Reviewed-by: Christian König Signed-off-by: Pierre-Eric Pelloux-Prayer Signed-off-by: Alex Deucher (cherry picked from commit 40f2cd98828f454bdc5006ad3d94330a5ea164b7) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 7abcd618e70b..13c97ba7a820 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -265,7 +265,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, /* Only a single BO list is allowed to simplify handling. */ if (p->bo_list) - ret = -EINVAL; + goto free_partial_kdata; ret = amdgpu_cs_p1_bo_handles(p, p->chunks[i].kdata); if (ret) From c3a230c1df360feb14fb3d1e3c2962e6ad21c65f Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 18:49:38 +0200 Subject: [PATCH 070/125] iio: dac: ad5770r: add missing select REGMAP_SPI in Kconfig commit bcdab6f74c91cda19714354fd4e9e3ef3c9a78b3 upstream. This driver makes use of regmap_spi, but does not select the required module. Add the missing 'select REGMAP_SPI'. Fixes: cbbb819837f6 ("iio: dac: ad5770r: Add AD5770R support") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-ad2s1210-select-v1-6-4019453f8c33@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig index 93b8be183de6..12d9cccd723c 100644 --- a/drivers/iio/dac/Kconfig +++ b/drivers/iio/dac/Kconfig @@ -224,6 +224,7 @@ config AD5766 config AD5770R tristate "Analog Devices AD5770R IDAC driver" depends on SPI_MASTER + select REGMAP_SPI help Say yes here to build support for Analog Devices AD5770R Digital to Analog Converter. From ea8180528111abc64bae193a7694d04d77b1cfe2 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 18:49:39 +0200 Subject: [PATCH 071/125] iio: dac: ltc1660: add missing select REGMAP_SPI in Kconfig commit 252ff06a4cb4e572cb3c7fcfa697db96b08a7781 upstream. This driver makes use of regmap_spi, but does not select the required module. Add the missing 'select REGMAP_SPI'. Fixes: 8316cebd1e59 ("iio: dac: add support for ltc1660") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-ad2s1210-select-v1-7-4019453f8c33@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig index 12d9cccd723c..d727578c8c24 100644 --- a/drivers/iio/dac/Kconfig +++ b/drivers/iio/dac/Kconfig @@ -316,6 +316,7 @@ config LPC18XX_DAC config LTC1660 tristate "Linear Technology LTC1660/LTC1665 DAC SPI driver" depends on SPI + select REGMAP_SPI help Say yes here to build support for Linear Technology LTC1660 and LTC1665 Digital to Analog Converters. From e0eb585eef6c5922a9207adb080f9ec9c1c13207 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 18:49:40 +0200 Subject: [PATCH 072/125] iio: dac: stm32-dac-core: add missing select REGMAP_MMIO in Kconfig commit 27b6aa68a68105086aef9f0cb541cd688e5edea8 upstream. This driver makes use of regmap_mmio, but does not select the required module. Add the missing 'select REGMAP_MMIO'. Fixes: 4d4b30526eb8 ("iio: dac: add support for stm32 DAC") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-ad2s1210-select-v1-8-4019453f8c33@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig index d727578c8c24..a2a014c541cd 100644 --- a/drivers/iio/dac/Kconfig +++ b/drivers/iio/dac/Kconfig @@ -426,6 +426,7 @@ config STM32_DAC config STM32_DAC_CORE tristate + select REGMAP_MMIO config TI_DAC082S085 tristate "Texas Instruments 8/10/12-bit 2/4-channel DAC driver" From 082a75594ba4385284b18ae2bd02f997f737bf2f Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:50 +0200 Subject: [PATCH 073/125] iio: adc: ti-ads8688: add missing select IIO_(TRIGGERED_)BUFFER in Kconfig commit 4c4834fd8696a949d1b1f1c2c5b96e1ad2083b02 upstream. This driver makes use of triggered buffers, but does not select the required modules. Fixes: 2a86487786b5 ("iio: adc: ti-ads8688: add trigger and buffer support") Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'. Signed-off-by: Javier Carrasco Reviewed-by: Sean Nyekjaer Link: https://patch.msgid.link/20241003-iio-select-v1-4-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index dfb925cfe38e..7937b2aae6c6 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -1286,6 +1286,8 @@ config TI_ADS8344 config TI_ADS8688 tristate "Texas Instruments ADS8688" depends on SPI + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER help If you say yes here you get support for Texas Instruments ADS8684 and and ADS8688 ADC chips From 6acb0a4dac4000e805e63d84433804f32043a0fe Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 3 Oct 2024 20:41:12 +0200 Subject: [PATCH 074/125] iio: hid-sensors: Fix an error handling path in _hid_sensor_set_report_latency() commit 3a29b84cf7fbf912a6ab1b9c886746f02b74ea25 upstream. If hid_sensor_set_report_latency() fails, the error code should be returned instead of a value likely to be interpreted as 'success'. Fixes: 138bc7969c24 ("iio: hid-sensor-hub: Implement batch mode") Signed-off-by: Christophe JAILLET Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/c50640665f091a04086e5092cf50f73f2055107a.1727980825.git.christophe.jaillet@wanadoo.fr Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/common/hid-sensors/hid-sensor-trigger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c index ad8910e6ad59..abb09fefc792 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c @@ -32,7 +32,7 @@ static ssize_t _hid_sensor_set_report_latency(struct device *dev, latency = integer * 1000 + fract / 1000; ret = hid_sensor_set_report_latency(attrb, latency); if (ret < 0) - return len; + return ret; attrb->latency_ms = hid_sensor_get_report_latency(attrb); From 6bd2b16425d4b522fe328cb05ac28abc56fa3ec1 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 23 Sep 2024 00:17:49 +0200 Subject: [PATCH 075/125] iio: light: veml6030: fix ALS sensor resolution commit c9e9746f275c45108f2b0633a4855d65d9ae0736 upstream. The driver still uses the sensor resolution provided in the datasheet until Rev. 1.6, 28-Apr-2022, which was updated with Rev 1.7, 28-Nov-2023. The original ambient light resolution has been updated from 0.0036 lx/ct to 0.0042 lx/ct, which is the value that can be found in the current device datasheet. Update the default resolution for IT = 100 ms and GAIN = 1/8 from the original 4608 mlux/cnt to the current value from the "Resolution and maximum detection range" table (Application Note 84367, page 5), 5376 mlux/cnt. Cc: Fixes: 7b779f573c48 ("iio: light: add driver for veml6030 ambient light sensor") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20240923-veml6035-v2-1-58c72a0df31c@gmail.com Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/light/veml6030.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/light/veml6030.c b/drivers/iio/light/veml6030.c index 043f233d9bdb..e1fd789f855b 100644 --- a/drivers/iio/light/veml6030.c +++ b/drivers/iio/light/veml6030.c @@ -780,7 +780,7 @@ static int veml6030_hw_init(struct iio_dev *indio_dev) /* Cache currently active measurement parameters */ data->cur_gain = 3; - data->cur_resolution = 4608; + data->cur_resolution = 5376; data->cur_integration_time = 3; return ret; From 2cbb41abae65626736b8b52cf3b9339612c5a86a Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Fri, 13 Sep 2024 15:18:58 +0200 Subject: [PATCH 076/125] iio: light: veml6030: fix IIO device retrieval from embedded device commit c7c44e57750c31de43906d97813273fdffcf7d02 upstream. The dev pointer that is received as an argument in the in_illuminance_period_available_show function references the device embedded in the IIO device, not in the i2c client. dev_to_iio_dev() must be used to accessthe right data. The current implementation leads to a segmentation fault on every attempt to read the attribute because indio_dev gets a NULL assignment. This bug has been present since the first appearance of the driver, apparently since the last version (V6) before getting applied. A constant attribute was used until then, and the last modifications might have not been tested again. Cc: stable@vger.kernel.org Fixes: 7b779f573c48 ("iio: light: add driver for veml6030 ambient light sensor") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20240913-veml6035-v1-3-0b09c0c90418@gmail.com Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/light/veml6030.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/light/veml6030.c b/drivers/iio/light/veml6030.c index e1fd789f855b..433f58e1dd66 100644 --- a/drivers/iio/light/veml6030.c +++ b/drivers/iio/light/veml6030.c @@ -99,9 +99,8 @@ static const char * const period_values[] = { static ssize_t in_illuminance_period_available_show(struct device *dev, struct device_attribute *attr, char *buf) { + struct veml6030_data *data = iio_priv(dev_to_iio_dev(dev)); int ret, reg, x; - struct iio_dev *indio_dev = i2c_get_clientdata(to_i2c_client(dev)); - struct veml6030_data *data = iio_priv(indio_dev); ret = regmap_read(data->regmap, VEML6030_REG_ALS_CONF, ®); if (ret) { From 7f06b154ffcf3b8410472efee3df6e578458bba4 Mon Sep 17 00:00:00 2001 From: Emil Gedenryd Date: Fri, 13 Sep 2024 11:57:02 +0200 Subject: [PATCH 077/125] iio: light: opt3001: add missing full-scale range value commit 530688e39c644543b71bdd9cb45fdfb458a28eaa upstream. The opt3001 driver uses predetermined full-scale range values to determine what exponent to use for event trigger threshold values. The problem is that one of the values specified in the datasheet is missing from the implementation. This causes larger values to be scaled down to an incorrect exponent, effectively reducing the maximum settable threshold value by a factor of 2. Add missing full-scale range array value. Fixes: 94a9b7b1809f ("iio: light: add support for TI's opt3001 light sensor") Signed-off-by: Emil Gedenryd Cc: Link: https://patch.msgid.link/20240913-add_opt3002-v2-1-69e04f840360@axis.com Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/light/opt3001.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iio/light/opt3001.c b/drivers/iio/light/opt3001.c index cb41e5ee8ec1..dc529cbe3805 100644 --- a/drivers/iio/light/opt3001.c +++ b/drivers/iio/light/opt3001.c @@ -138,6 +138,10 @@ static const struct opt3001_scale opt3001_scales[] = { .val = 20966, .val2 = 400000, }, + { + .val = 41932, + .val2 = 800000, + }, { .val = 83865, .val2 = 600000, From dc7a11e8cebdecda942e401e110c684b8c51e5e6 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 18:49:37 +0200 Subject: [PATCH 078/125] iio: amplifiers: ada4250: add missing select REGMAP_SPI in Kconfig commit b7983033a10baa0d98784bb411b2679bfb207d9a upstream. This driver makes use of regmap_spi, but does not select the required module. Add the missing 'select REGMAP_SPI'. Fixes: 28b4c30bfa5f ("iio: amplifiers: ada4250: add support for ADA4250") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-ad2s1210-select-v1-5-4019453f8c33@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/amplifiers/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/amplifiers/Kconfig b/drivers/iio/amplifiers/Kconfig index b54fe01734b0..55eb16b32f6c 100644 --- a/drivers/iio/amplifiers/Kconfig +++ b/drivers/iio/amplifiers/Kconfig @@ -27,6 +27,7 @@ config AD8366 config ADA4250 tristate "Analog Devices ADA4250 Instrumentation Amplifier" depends on SPI + select REGMAP_SPI help Say yes here to build support for Analog Devices ADA4250 SPI Amplifier's support. The driver provides direct access via From 5d41abc777dce9cfcd559678e796dab468ff62e7 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 18:49:35 +0200 Subject: [PATCH 079/125] iio: frequency: adf4377: add missing select REMAP_SPI in Kconfig commit c64643ed4eaa5dfd0b3bab7ef1c50b84f3dbaba4 upstream. This driver makes use of regmap_spi, but does not select the required module. Add the missing 'select REGMAP_SPI'. Fixes: eda549e2e524 ("iio: frequency: adf4377: add support for ADF4377") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-ad2s1210-select-v1-3-4019453f8c33@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/frequency/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/frequency/Kconfig b/drivers/iio/frequency/Kconfig index 9e85dfa58508..71de6cc4a158 100644 --- a/drivers/iio/frequency/Kconfig +++ b/drivers/iio/frequency/Kconfig @@ -53,6 +53,7 @@ config ADF4371 config ADF4377 tristate "Analog Devices ADF4377 Microwave Wideband Synthesizer" depends on SPI && COMMON_CLK + select REGMAP_SPI help Say yes here to build support for Analog Devices ADF4377 Microwave Wideband Synthesizer. From 66bfe12005f6b3f9df9c21b10aa3bdfdfe171471 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:56 +0200 Subject: [PATCH 080/125] iio: light: bu27008: add missing select IIO_(TRIGGERED_)BUFFER in Kconfig commit aa99ef68eff5bc6df4959a372ae355b3b73f9930 upstream. This driver makes use of triggered buffers, but does not select the required modules. Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'. Fixes: 41ff93d14f78 ("iio: light: ROHM BU27008 color sensor") Signed-off-by: Javier Carrasco Acked-by: Matti Vaittinen Link: https://patch.msgid.link/20241003-iio-select-v1-10-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/light/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/light/Kconfig b/drivers/iio/light/Kconfig index 45edba797e4c..ddbd1838650e 100644 --- a/drivers/iio/light/Kconfig +++ b/drivers/iio/light/Kconfig @@ -294,6 +294,8 @@ config ROHM_BU27008 depends on I2C select REGMAP_I2C select IIO_GTS_HELPER + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER help Enable support for the ROHM BU27008 color sensor. The ROHM BU27008 is a sensor with 5 photodiodes (red, green, From 1fcc9d634b50e4fac99990e368027dea7edfbc9c Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:54 +0200 Subject: [PATCH 081/125] iio: dac: ad5766: add missing select IIO_(TRIGGERED_)BUFFER in Kconfig commit 62ec3df342cca6a8eb7ed33fd4ac8d0fbfcb9391 upstream. This driver makes use of triggered buffers, but does not select the required modules. Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'. Fixes: 885b9790c25a ("drivers:iio:dac:ad5766.c: Add trigger buffer") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-iio-select-v1-8-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig index a2a014c541cd..f4d42382ec05 100644 --- a/drivers/iio/dac/Kconfig +++ b/drivers/iio/dac/Kconfig @@ -214,6 +214,8 @@ config AD5764 config AD5766 tristate "Analog Devices AD5766/AD5767 DAC driver" depends on SPI_MASTER + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER help Say yes here to build support for Analog Devices AD5766, AD5767 Digital to Analog Converter. From ad60bbd7c0712709cf1e37d91145f4b42a11ff77 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:59 +0200 Subject: [PATCH 082/125] iio: proximity: mb1232: add missing select IIO_(TRIGGERED_)BUFFER in Kconfig commit 75461a0b15d7c026924d0001abce0476bbc7eda8 upstream. This driver makes use of triggered buffers, but does not select the required modules. Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'. Fixes: 16b05261537e ("mb1232.c: add distance iio sensor with i2c") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-iio-select-v1-13-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/proximity/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/proximity/Kconfig b/drivers/iio/proximity/Kconfig index 2ca3b0bc5eba..931eaea046b3 100644 --- a/drivers/iio/proximity/Kconfig +++ b/drivers/iio/proximity/Kconfig @@ -72,6 +72,8 @@ config LIDAR_LITE_V2 config MB1232 tristate "MaxSonar I2CXL family ultrasonic sensors" depends on I2C + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER help Say Y to build a driver for the ultrasonic sensors I2CXL of MaxBotix which have an i2c interface. It can be used to measure From 2f76debe69b898fa649a8018a1f947aab006f266 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:53 +0200 Subject: [PATCH 083/125] iio: dac: ad3552r: add missing select IIO_(TRIGGERED_)BUFFER in Kconfig commit 5bede948670f447154df401458aef4e2fd446ba8 upstream. This driver makes use of triggered buffers, but does not select the required modules. Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'. Fixes: 8f2b54824b28 ("drivers:iio:dac: Add AD3552R driver support") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-iio-select-v1-7-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig index f4d42382ec05..b8ff547bc4da 100644 --- a/drivers/iio/dac/Kconfig +++ b/drivers/iio/dac/Kconfig @@ -9,6 +9,8 @@ menu "Digital to analog converters" config AD3552R tristate "Analog Devices AD3552R DAC driver" depends on SPI_MASTER + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER help Say yes here to build support for Analog Devices AD3552R Digital to Analog Converter. From 48e594b77cb7ae9a17d12433794df89a7a998770 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:51 +0200 Subject: [PATCH 084/125] iio: adc: ti-lmp92064: add missing select REGMAP_SPI in Kconfig commit f3fe8c52c580e99c6dc0c7859472ec48176af32d upstream. This driver makes use of regmap_spi, but does not select the required module. Add the missing 'select REGMAP_SPI'. Fixes: 627198942641 ("iio: adc: add ADC driver for the TI LMP92064 controller") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-iio-select-v1-5-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 7937b2aae6c6..32f2e3af23bf 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -1332,6 +1332,7 @@ config TI_AM335X_ADC config TI_LMP92064 tristate "Texas Instruments LMP92064 ADC driver" depends on SPI + select REGMAP_SPI help Say yes here to build support for the LMP92064 Precision Current and Voltage sensor. From d8ef39dad9b9251a903d9039d2c8ccf682cba01a Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:49 +0200 Subject: [PATCH 085/125] iio: adc: ti-ads124s08: add missing select IIO_(TRIGGERED_)BUFFER in Kconfig commit eb143d05def52bc6d193e813018e5fa1a0e47c77 upstream. This driver makes use of triggered buffers, but does not select the required modules. Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'. Fixes: e717f8c6dfec ("iio: adc: Add the TI ads124s08 ADC code") Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241003-iio-select-v1-3-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 32f2e3af23bf..e46817cb5581 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -1298,6 +1298,8 @@ config TI_ADS8688 config TI_ADS124S08 tristate "Texas Instruments ADS124S08" depends on SPI + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER help If you say yes here you get support for Texas Instruments ADS124S08 and ADS124S06 ADC chips From 232c2eb6d60c127a1c0266c82bbb0a08c4774a0a Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 3 Oct 2024 23:04:47 +0200 Subject: [PATCH 086/125] iio: accel: kx022a: add missing select IIO_(TRIGGERED_)BUFFER in Kconfig commit 96666f05d11acf0370cedca17a4c3ab6f9554b35 upstream. This driver makes use of triggered buffers, but does not select the required modules. Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'. Fixes: 7c1d1677b322 ("iio: accel: Support Kionix/ROHM KX022A accelerometer") Signed-off-by: Javier Carrasco Acked-by: Matti Vaittinen Link: https://patch.msgid.link/20241003-iio-select-v1-1-67c0385197cd@gmail.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/accel/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/accel/Kconfig b/drivers/iio/accel/Kconfig index 0398974c9d2b..df60986d9836 100644 --- a/drivers/iio/accel/Kconfig +++ b/drivers/iio/accel/Kconfig @@ -415,6 +415,8 @@ config IIO_ST_ACCEL_SPI_3AXIS config IIO_KX022A tristate + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER config IIO_KX022A_SPI tristate "Kionix KX022A tri-axis digital accelerometer SPI interface" From 05f84d86169b2ebac185c5736a256823d42c425b Mon Sep 17 00:00:00 2001 From: Aaron Thompson Date: Fri, 4 Oct 2024 23:04:09 +0000 Subject: [PATCH 087/125] Bluetooth: Call iso_exit() on module unload commit d458cd1221e9e56da3b2cc5518ad3225caa91f20 upstream. If iso_init() has been called, iso_exit() must be called on module unload. Without that, the struct proto that iso_init() registered with proto_register() becomes invalid, which could cause unpredictable problems later. In my case, with CONFIG_LIST_HARDENED and CONFIG_BUG_ON_DATA_CORRUPTION enabled, loading the module again usually triggers this BUG(): list_add corruption. next->prev should be prev (ffffffffb5355fd0), but was 0000000000000068. (next=ffffffffc0a010d0). ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:29! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 4159 Comm: modprobe Not tainted 6.10.11-4+bt2-ao-desktop #1 RIP: 0010:__list_add_valid_or_report+0x61/0xa0 ... __list_add_valid_or_report+0x61/0xa0 proto_register+0x299/0x320 hci_sock_init+0x16/0xc0 [bluetooth] bt_init+0x68/0xd0 [bluetooth] __pfx_bt_init+0x10/0x10 [bluetooth] do_one_initcall+0x80/0x2f0 do_init_module+0x8b/0x230 __do_sys_init_module+0x15f/0x190 do_syscall_64+0x68/0x110 ... Cc: stable@vger.kernel.org Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Aaron Thompson Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/af_bluetooth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 67604ccec2f4..9425d0680844 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -830,6 +830,8 @@ cleanup_led: static void __exit bt_exit(void) { + iso_exit(); + mgmt_exit(); sco_exit(); From 59bd80df4b88e28e37524105b3ec1853bae32713 Mon Sep 17 00:00:00 2001 From: Aaron Thompson Date: Fri, 4 Oct 2024 23:04:10 +0000 Subject: [PATCH 088/125] Bluetooth: Remove debugfs directory on module init failure commit 1db4564f101b47188c1b71696bd342ef09172b22 upstream. If bt_init() fails, the debugfs directory currently is not removed. If the module is loaded again after that, the debugfs directory is not set up properly due to the existing directory. # modprobe bluetooth # ls -laF /sys/kernel/debug/bluetooth total 0 drwxr-xr-x 2 root root 0 Sep 27 14:26 ./ drwx------ 31 root root 0 Sep 27 14:25 ../ -r--r--r-- 1 root root 0 Sep 27 14:26 l2cap -r--r--r-- 1 root root 0 Sep 27 14:26 sco # modprobe -r bluetooth # ls -laF /sys/kernel/debug/bluetooth ls: cannot access '/sys/kernel/debug/bluetooth': No such file or directory # # modprobe bluetooth modprobe: ERROR: could not insert 'bluetooth': Invalid argument # dmesg | tail -n 6 Bluetooth: Core ver 2.22 NET: Registered PF_BLUETOOTH protocol family Bluetooth: HCI device and connection manager initialized Bluetooth: HCI socket layer initialized Bluetooth: Faking l2cap_init() failure for testing NET: Unregistered PF_BLUETOOTH protocol family # ls -laF /sys/kernel/debug/bluetooth total 0 drwxr-xr-x 2 root root 0 Sep 27 14:31 ./ drwx------ 31 root root 0 Sep 27 14:26 ../ # # modprobe bluetooth # dmesg | tail -n 7 Bluetooth: Core ver 2.22 debugfs: Directory 'bluetooth' with parent '/' already present! NET: Registered PF_BLUETOOTH protocol family Bluetooth: HCI device and connection manager initialized Bluetooth: HCI socket layer initialized Bluetooth: L2CAP socket layer initialized Bluetooth: SCO socket layer initialized # ls -laF /sys/kernel/debug/bluetooth total 0 drwxr-xr-x 2 root root 0 Sep 27 14:31 ./ drwx------ 31 root root 0 Sep 27 14:26 ../ # Cc: stable@vger.kernel.org Fixes: ffcecac6a738 ("Bluetooth: Create root debugfs directory during module init") Signed-off-by: Aaron Thompson Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/af_bluetooth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 9425d0680844..e39fba5565c5 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -825,6 +825,7 @@ cleanup_sysfs: bt_sysfs_cleanup(); cleanup_led: bt_leds_cleanup(); + debugfs_remove_recursive(bt_debugfs); return err; } From 8fb8e912afb4c47dec12ea9a5853e7a8db95816f Mon Sep 17 00:00:00 2001 From: Aaron Thompson Date: Fri, 4 Oct 2024 23:04:08 +0000 Subject: [PATCH 089/125] Bluetooth: ISO: Fix multiple init when debugfs is disabled commit a9b7b535ba192c6b77e6c15a4c82d853163eab8c upstream. If bt_debugfs is not created successfully, which happens if either CONFIG_DEBUG_FS or CONFIG_DEBUG_FS_ALLOW_ALL is unset, then iso_init() returns early and does not set iso_inited to true. This means that a subsequent call to iso_init() will result in duplicate calls to proto_register(), bt_sock_register(), etc. With CONFIG_LIST_HARDENED and CONFIG_BUG_ON_DATA_CORRUPTION enabled, the duplicate call to proto_register() triggers this BUG(): list_add double add: new=ffffffffc0b280d0, prev=ffffffffbab56250, next=ffffffffc0b280d0. ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:35! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 2 PID: 887 Comm: bluetoothd Not tainted 6.10.11-1-ao-desktop #1 RIP: 0010:__list_add_valid_or_report+0x9a/0xa0 ... __list_add_valid_or_report+0x9a/0xa0 proto_register+0x2b5/0x340 iso_init+0x23/0x150 [bluetooth] set_iso_socket_func+0x68/0x1b0 [bluetooth] kmem_cache_free+0x308/0x330 hci_sock_sendmsg+0x990/0x9e0 [bluetooth] __sock_sendmsg+0x7b/0x80 sock_write_iter+0x9a/0x110 do_iter_readv_writev+0x11d/0x220 vfs_writev+0x180/0x3e0 do_writev+0xca/0x100 ... This change removes the early return. The check for iso_debugfs being NULL was unnecessary, it is always NULL when iso_inited is false. Cc: stable@vger.kernel.org Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Aaron Thompson Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/iso.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index c46d123c30e1..9b365fb44fac 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2112,13 +2112,9 @@ int iso_init(void) hci_register_cb(&iso_cb); - if (IS_ERR_OR_NULL(bt_debugfs)) - return 0; - - if (!iso_debugfs) { + if (!IS_ERR_OR_NULL(bt_debugfs)) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); - } iso_inited = true; From 885f8c873fedf3393c76827c4821071bf5fd16b1 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 16 Oct 2024 11:47:00 -0400 Subject: [PATCH 090/125] Bluetooth: btusb: Fix regression with fake CSR controllers 0a12:0001 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2c1dda2acc4192d826e84008d963b528e24d12bc upstream. Fake CSR controllers don't seem to handle short-transfer properly which cause command to time out: kernel: usb 1-1: new full-speed USB device number 19 using xhci_hcd kernel: usb 1-1: New USB device found, idVendor=0a12, idProduct=0001, bcdDevice=88.91 kernel: usb 1-1: New USB device strings: Mfr=0, Product=2, SerialNumber=0 kernel: usb 1-1: Product: BT DONGLE10 ... Bluetooth: hci1: Opcode 0x1004 failed: -110 kernel: Bluetooth: hci1: command 0x1004 tx timeout According to USB Spec 2.0 Section 5.7.3 Interrupt Transfer Packet Size Constraints a interrupt transfer is considered complete when the size is 0 (ZPL) or < wMaxPacketSize: 'When an interrupt transfer involves more data than can fit in one data payload of the currently established maximum size, all data payloads are required to be maximum-sized except for the last data payload, which will contain the remaining data. An interrupt transfer is complete when the endpoint does one of the following: • Has transferred exactly the amount of data expected • Transfers a packet with a payload size less than wMaxPacketSize or transfers a zero-length packet' Link: https://bugzilla.kernel.org/show_bug.cgi?id=219365 Fixes: 7b05933340f4 ("Bluetooth: btusb: Fix not handling ZPL/short-transfer") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index bc53da383f85..b3a9b93f027a 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -1354,10 +1354,15 @@ static int btusb_submit_intr_urb(struct hci_dev *hdev, gfp_t mem_flags) if (!urb) return -ENOMEM; - /* Use maximum HCI Event size so the USB stack handles - * ZPL/short-transfer automatically. - */ - size = HCI_MAX_EVENT_SIZE; + if (le16_to_cpu(data->udev->descriptor.idVendor) == 0x0a12 && + le16_to_cpu(data->udev->descriptor.idProduct) == 0x0001) + /* Fake CSR devices don't seem to support sort-transter */ + size = le16_to_cpu(data->intr_ep->wMaxPacketSize); + else + /* Use maximum HCI Event size so the USB stack handles + * ZPL/short-transfer automatically. + */ + size = HCI_MAX_EVENT_SIZE; buf = kmalloc(size, mem_flags); if (!buf) { From dc2d5f02636c7587bdd6d1f60fc59c55860b00a4 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Fri, 11 Oct 2024 02:46:19 +0900 Subject: [PATCH 091/125] vt: prevent kernel-infoleak in con_font_get() commit f956052e00de211b5c9ebaa1958366c23f82ee9e upstream. font.data may not initialize all memory spaces depending on the implementation of vc->vc_sw->con_font_get. This may cause info-leak, so to prevent this, it is safest to modify it to initialize the allocated memory space to 0, and it generally does not affect the overall performance of the system. Cc: stable@vger.kernel.org Reported-by: syzbot+955da2d57931604ee691@syzkaller.appspotmail.com Fixes: 05e2600cb0a4 ("VT: Bump font size limitation to 64x128 pixels") Signed-off-by: Jeongjun Park Link: https://lore.kernel.org/r/20241010174619.59662-1-aha310510@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index a22da757ca6d..6bd1a7785e88 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -4550,7 +4550,7 @@ static int con_font_get(struct vc_data *vc, struct console_font_op *op) return -EINVAL; if (op->data) { - font.data = kvmalloc(max_font_size, GFP_KERNEL); + font.data = kvzalloc(max_font_size, GFP_KERNEL); if (!font.data) return -ENOMEM; } else From 9c696bf4ab54c7cec81221887564305f0ceeac0a Mon Sep 17 00:00:00 2001 From: Henry Lin Date: Mon, 14 Oct 2024 12:21:34 +0800 Subject: [PATCH 092/125] xhci: tegra: fix checked USB2 port number commit 7d381137cb6ecf558ef6698c7730ddd482d4c8f2 upstream. If USB virtualizatoin is enabled, USB2 ports are shared between all Virtual Functions. The USB2 port number owned by an USB2 root hub in a Virtual Function may be less than total USB2 phy number supported by the Tegra XUSB controller. Using total USB2 phy number as port number to check all PORTSC values would cause invalid memory access. [ 116.923438] Unable to handle kernel paging request at virtual address 006c622f7665642f ... [ 117.213640] Call trace: [ 117.216783] tegra_xusb_enter_elpg+0x23c/0x658 [ 117.222021] tegra_xusb_runtime_suspend+0x40/0x68 [ 117.227260] pm_generic_runtime_suspend+0x30/0x50 [ 117.232847] __rpm_callback+0x84/0x3c0 [ 117.237038] rpm_suspend+0x2dc/0x740 [ 117.241229] pm_runtime_work+0xa0/0xb8 [ 117.245769] process_scheduled_works+0x24c/0x478 [ 117.251007] worker_thread+0x23c/0x328 [ 117.255547] kthread+0x104/0x1b0 [ 117.259389] ret_from_fork+0x10/0x20 [ 117.263582] Code: 54000222 f9461ae8 f8747908 b4ffff48 (f9400100) Cc: stable@vger.kernel.org # v6.3+ Fixes: a30951d31b25 ("xhci: tegra: USB2 pad power controls") Signed-off-by: Henry Lin Link: https://lore.kernel.org/r/20241014042134.27664-1-henryl@nvidia.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-tegra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-tegra.c b/drivers/usb/host/xhci-tegra.c index 6246d5ad1468..76f228e7443c 100644 --- a/drivers/usb/host/xhci-tegra.c +++ b/drivers/usb/host/xhci-tegra.c @@ -2183,7 +2183,7 @@ static int tegra_xusb_enter_elpg(struct tegra_xusb *tegra, bool runtime) goto out; } - for (i = 0; i < tegra->num_usb_phys; i++) { + for (i = 0; i < xhci->usb2_rhub.num_ports; i++) { if (!xhci->usb2_rhub.ports[i]) continue; portsc = readl(xhci->usb2_rhub.ports[i]->addr); From cc7b7050996ac46a7236864a3c86adfd69829a14 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Wed, 16 Oct 2024 16:59:57 +0300 Subject: [PATCH 093/125] xhci: Fix incorrect stream context type macro commit 6599b6a6fa8060145046d0744456b6abdb3122a7 upstream. The stream contex type (SCT) bitfield is used both in the stream context data structure, and in the 'Set TR Dequeue pointer' command TRB. In both cases it uses bits 3:1 The SCT_FOR_TRB(p) macro used to set the stream context type (SCT) field for the 'Set TR Dequeue pointer' command TRB incorrectly shifts the value 1 bit left before masking the three bits. Fix this by first masking and rshifting, just like the similar SCT_FOR_CTX(p) macro does This issue has not been visibile as the lost bit 3 is only used with secondary stream arrays (SSA). Xhci driver currently only supports using a primary stream array with Linear stream addressing. Fixes: 95241dbdf828 ("xhci: Set SCT field for Set TR dequeue on streams") Cc: stable@vger.kernel.org Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241016140000.783905-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index f5efee06ff06..7754ed55d220 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1286,7 +1286,7 @@ enum xhci_setup_dev { /* Set TR Dequeue Pointer command TRB fields, 6.4.3.9 */ #define TRB_TO_STREAM_ID(p) ((((p) & (0xffff << 16)) >> 16)) #define STREAM_ID_FOR_TRB(p) ((((p)) & 0xffff) << 16) -#define SCT_FOR_TRB(p) (((p) << 1) & 0x7) +#define SCT_FOR_TRB(p) (((p) & 0x7) << 1) /* Link TRB specific fields */ #define TRB_TC (1<<1) From 53cd1bb1f8ebf961a7be197734465893a8930a58 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Wed, 16 Oct 2024 16:59:58 +0300 Subject: [PATCH 094/125] xhci: Mitigate failed set dequeue pointer commands commit fe49df60cdb7c2975aa743dc295f8786e4b7db10 upstream. Avoid xHC host from processing a cancelled URB by always turning cancelled URB TDs into no-op TRBs before queuing a 'Set TR Deq' command. If the command fails then xHC will start processing the cancelled TD instead of skipping it once endpoint is restarted, causing issues like Babble error. This is not a complete solution as a failed 'Set TR Deq' command does not guarantee xHC TRB caches are cleared. Fixes: 4db356924a50 ("xhci: turn cancelled td cleanup to its own function") Cc: stable@vger.kernel.org Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20241016140000.783905-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 95d8cf24b015..7d959e2753f9 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1046,7 +1046,7 @@ static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep) td_to_noop(xhci, ring, cached_td, false); cached_td->cancel_status = TD_CLEARED; } - + td_to_noop(xhci, ring, td, false); td->cancel_status = TD_CLEARING_CACHE; cached_td = td; break; From 90a5c64279ae5b906604cfb3809a25fb2d91d674 Mon Sep 17 00:00:00 2001 From: "Benjamin B. Frost" Date: Wed, 11 Sep 2024 10:54:05 +0200 Subject: [PATCH 095/125] USB: serial: option: add support for Quectel EG916Q-GL commit 540eff5d7faf0c9330ec762da49df453263f7676 upstream. Add Quectel EM916Q-GL with product ID 0x6007 T: Bus=01 Lev=02 Prnt=02 Port=01 Cnt=01 Dev#= 3 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=6007 Rev= 2.00 S: Manufacturer=Quectel S: Product=EG916Q-GL C:* #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=200mA A: FirstIf#= 4 IfCount= 2 Cls=02(comm.) Sub=06 Prot=00 I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 16 Ivl=32ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=86(I) Atr=03(Int.) MxPS= 16 Ivl=32ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=06 Prot=00 Driver=cdc_ether E: Ad=88(I) Atr=03(Int.) MxPS= 32 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether I:* If#= 5 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms MI_00 Quectel USB Diag Port MI_01 Quectel USB NMEA Port MI_02 Quectel USB AT Port MI_03 Quectel USB Modem Port MI_04 Quectel USB Net Port Signed-off-by: Benjamin B. Frost Reviewed-by: Lars Melin Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 176f38750ad5..74df03b879cb 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -279,6 +279,7 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_EG912Y 0x6001 #define QUECTEL_PRODUCT_EC200S_CN 0x6002 #define QUECTEL_PRODUCT_EC200A 0x6005 +#define QUECTEL_PRODUCT_EG916Q 0x6007 #define QUECTEL_PRODUCT_EM061K_LWW 0x6008 #define QUECTEL_PRODUCT_EM061K_LCN 0x6009 #define QUECTEL_PRODUCT_EC200T 0x6026 @@ -1270,6 +1271,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200S_CN, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG912Y, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EG916Q, 0xff, 0x00, 0x00) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500K, 0xff, 0x00, 0x00) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, From 86c68aa714d0e5082ccee8f0679cb69821cf8339 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Thu, 3 Oct 2024 11:38:08 +0200 Subject: [PATCH 096/125] USB: serial: option: add Telit FN920C04 MBIM compositions commit 6d951576ee16430822a8dee1e5c54d160e1de87d upstream. Add the following Telit FN920C04 compositions: 0x10a2: MBIM + tty (AT/NMEA) + tty (AT) + tty (diag) T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a2 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10a7: MBIM + tty (AT) + tty (AT) + tty (diag) T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 18 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10a7 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10aa: MBIM + tty (AT) + tty (diag) + DPL (data packet logging) + adb T: Bus=03 Lev=01 Prnt=03 Port=06 Cnt=01 Dev#= 15 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10aa Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN920 S: SerialNumber=92c4c4d8 C: #Ifs= 6 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniele Palmas Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 74df03b879cb..55886b64cadd 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1382,10 +1382,16 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a0, 0xff), /* Telit FN20C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a2, 0xff), /* Telit FN920C04 (MBIM) */ + .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a4, 0xff), /* Telit FN20C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a7, 0xff), /* Telit FN920C04 (MBIM) */ + .driver_info = NCTRL(4) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10a9, 0xff), /* Telit FN20C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10aa, 0xff), /* Telit FN920C04 (MBIM) */ + .driver_info = NCTRL(3) | RSVD(4) | RSVD(5) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From 479830f6c15b1a3e3c1488762a8a3bcf93e2b94f Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Sat, 5 Oct 2024 10:41:46 -0400 Subject: [PATCH 097/125] usb: typec: qcom-pmic-typec: fix sink status being overwritten with RP_DEF commit ffe85c24d7ca5de7d57690c0ab194b3838674935 upstream. This line is overwriting the result of the above switch-case. This fixes the tcpm driver getting stuck in a "Sink TX No Go" loop. Fixes: a4422ff22142 ("usb: typec: qcom: Add Qualcomm PMIC Type-C driver") Cc: stable Signed-off-by: Jonathan Marek Acked-by: Bryan O'Donoghue Reviewed-by: Heikki Krogerus Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20241005144146.2345-1-jonathan@marek.ca Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c b/drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c index a8f3f4d3a450..d6607491fcef 100644 --- a/drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c +++ b/drivers/usb/typec/tcpm/qcom/qcom_pmic_typec_port.c @@ -252,7 +252,6 @@ int qcom_pmic_typec_port_get_cc(struct pmic_typec_port *pmic_typec_port, val = TYPEC_CC_RP_DEF; break; } - val = TYPEC_CC_RP_DEF; } if (misc & CC_ORIENTATION) From c3d3501cf896cd6b75f9b018d59247322ac5a4eb Mon Sep 17 00:00:00 2001 From: Prashanth K Date: Tue, 24 Sep 2024 15:02:08 +0530 Subject: [PATCH 098/125] usb: dwc3: Wait for EndXfer completion before restoring GUSB2PHYCFG commit c96e31252110a84dcc44412e8a7b456b33c3e298 upstream. DWC3 programming guide mentions that when operating in USB2.0 speeds, if GUSB2PHYCFG[6] or GUSB2PHYCFG[8] is set, it must be cleared prior to issuing commands and may be set again after the command completes. But currently while issuing EndXfer command without CmdIOC set, we wait for 1ms after GUSB2PHYCFG is restored. This results in cases where EndXfer command doesn't get completed and causes SMMU faults since requests are unmapped afterwards. Hence restore GUSB2PHYCFG after waiting for EndXfer command completion. Cc: stable@vger.kernel.org Fixes: 1d26ba0944d3 ("usb: dwc3: Wait unconditionally after issuing EndXfer command") Signed-off-by: Prashanth K Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20240924093208.2524531-1-quic_prashk@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6d439aba6460..867000cdeb96 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -438,6 +438,10 @@ skip_status: dwc3_gadget_ep_get_transfer_index(dep); } + if (DWC3_DEPCMD_CMD(cmd) == DWC3_DEPCMD_ENDTRANSFER && + !(cmd & DWC3_DEPCMD_CMDIOC)) + mdelay(1); + if (saved_config) { reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); reg |= saved_config; @@ -1734,12 +1738,10 @@ static int __dwc3_stop_active_transfer(struct dwc3_ep *dep, bool force, bool int WARN_ON_ONCE(ret); dep->resource_index = 0; - if (!interrupt) { - mdelay(1); + if (!interrupt) dep->flags &= ~DWC3_EP_TRANSFER_STARTED; - } else if (!ret) { + else if (!ret) dep->flags |= DWC3_EP_END_TRANSFER_PENDING; - } dep->flags &= ~DWC3_EP_DELAY_STOP; return ret; From 78df42183e824d368e5d45169c6b7c506d81f690 Mon Sep 17 00:00:00 2001 From: Heiko Thiery Date: Mon, 7 Oct 2024 09:11:20 +0200 Subject: [PATCH 099/125] misc: microchip: pci1xxxx: add support for NVMEM_DEVID_AUTO for EEPROM device commit 3c2d73de49be528276474c1a53f78b38ee11c1fa upstream. By using NVMEM_DEVID_AUTO we support more than 1 device and automatically enumerate. Fixes: 9ab5465349c0 ("misc: microchip: pci1xxxx: Add support to read and write into PCI1XXXX EEPROM via NVMEM sysfs") Cc: stable@vger.kernel.org Signed-off-by: Heiko Thiery Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20241007071120.9522-1-heiko.thiery@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c index 7c3d8bedf90b..d1cd4544c83c 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c @@ -364,6 +364,7 @@ static int pci1xxxx_otp_eeprom_probe(struct auxiliary_device *aux_dev, if (is_eeprom_responsive(priv)) { priv->nvmem_config_eeprom.type = NVMEM_TYPE_EEPROM; priv->nvmem_config_eeprom.name = EEPROM_NAME; + priv->nvmem_config_eeprom.id = NVMEM_DEVID_AUTO; priv->nvmem_config_eeprom.dev = &aux_dev->dev; priv->nvmem_config_eeprom.owner = THIS_MODULE; priv->nvmem_config_eeprom.reg_read = pci1xxxx_eeprom_read; From 245bee12a52d55312aa3948ddae6887788566893 Mon Sep 17 00:00:00 2001 From: Heiko Thiery Date: Mon, 7 Oct 2024 09:11:22 +0200 Subject: [PATCH 100/125] misc: microchip: pci1xxxx: add support for NVMEM_DEVID_AUTO for OTP device commit 2471787c1f0dae6721f60ab44be37460635d3732 upstream. By using NVMEM_DEVID_AUTO we support more than 1 device and automatically enumerate. Fixes: 0969001569e4 ("misc: microchip: pci1xxxx: Add support to read and write into PCI1XXXX OTP via NVMEM sysfs") Cc: stable@vger.kernel.org Signed-off-by: Heiko Thiery Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20241007071120.9522-2-heiko.thiery@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c index d1cd4544c83c..a2ed477e0370 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_otpe2p.c @@ -384,6 +384,7 @@ static int pci1xxxx_otp_eeprom_probe(struct auxiliary_device *aux_dev, priv->nvmem_config_otp.type = NVMEM_TYPE_OTP; priv->nvmem_config_otp.name = OTP_NAME; + priv->nvmem_config_otp.id = NVMEM_DEVID_AUTO; priv->nvmem_config_otp.dev = &aux_dev->dev; priv->nvmem_config_otp.owner = THIS_MODULE; priv->nvmem_config_otp.reg_read = pci1xxxx_otp_read; From c895d48c843dee8baf5327dee33ce786760c9d44 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Wed, 2 Oct 2024 20:40:38 +0200 Subject: [PATCH 101/125] serial: imx: Update mctrl old_status on RTSD interrupt commit 40d7903386df4d18f04d90510ba90eedee260085 upstream. When sending data using DMA at high baudrate (4 Mbdps in local test case) to a device with small RX buffer which keeps asserting RTS after every received byte, it is possible that the iMX UART driver would not recognize the falling edge of RTS input signal and get stuck, unable to transmit any more data. This condition happens when the following sequence of events occur: - imx_uart_mctrl_check() is called at some point and takes a snapshot of UART control signal status into sport->old_status using imx_uart_get_hwmctrl(). The RTSS/TIOCM_CTS bit is of interest here (*). - DMA transfer occurs, the remote device asserts RTS signal after each byte. The i.MX UART driver recognizes each such RTS signal change, raises an interrupt with USR1 register RTSD bit set, which leads to invocation of __imx_uart_rtsint(), which calls uart_handle_cts_change(). - If the RTS signal is deasserted, uart_handle_cts_change() clears port->hw_stopped and unblocks the port for further data transfers. - If the RTS is asserted, uart_handle_cts_change() sets port->hw_stopped and blocks the port for further data transfers. This may occur as the last interrupt of a transfer, which means port->hw_stopped remains set and the port remains blocked (**). - Any further data transfer attempts will trigger imx_uart_mctrl_check(), which will read current status of UART control signals by calling imx_uart_get_hwmctrl() (***) and compare it with sport->old_status . - If current status differs from sport->old_status for RTS signal, uart_handle_cts_change() is called and possibly unblocks the port by clearing port->hw_stopped . - If current status does not differ from sport->old_status for RTS signal, no action occurs. This may occur in case prior snapshot (*) was taken before any transfer so the RTS is deasserted, current snapshot (***) was taken after a transfer and therefore RTS is deasserted again, which means current status and sport->old_status are identical. In case (**) triggered when RTS got asserted, and made port->hw_stopped set, the port->hw_stopped will remain set because no change on RTS line is recognized by this driver and uart_handle_cts_change() is not called from here to unblock the port->hw_stopped. Update sport->old_status in __imx_uart_rtsint() accordingly to make imx_uart_mctrl_check() detect such RTS change. Note that TIOCM_CAR and TIOCM_RI bits in sport->old_status do not suffer from this problem. Fixes: ceca629e0b48 ("[ARM] 2971/1: i.MX uart handle rts irq") Cc: stable Reviewed-by: Esben Haabendal Signed-off-by: Marek Vasut Link: https://lore.kernel.org/r/20241002184133.19427-1-marex@denx.de Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index a5d0df2ba5c5..349d4849ba5e 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -771,6 +771,21 @@ static irqreturn_t __imx_uart_rtsint(int irq, void *dev_id) imx_uart_writel(sport, USR1_RTSD, USR1); usr1 = imx_uart_readl(sport, USR1) & USR1_RTSS; + /* + * Update sport->old_status here, so any follow-up calls to + * imx_uart_mctrl_check() will be able to recognize that RTS + * state changed since last imx_uart_mctrl_check() call. + * + * In case RTS has been detected as asserted here and later on + * deasserted by the time imx_uart_mctrl_check() was called, + * imx_uart_mctrl_check() can detect the RTS state change and + * trigger uart_handle_cts_change() to unblock the port for + * further TX transfers. + */ + if (usr1 & USR1_RTSS) + sport->old_status |= TIOCM_CTS; + else + sport->old_status &= ~TIOCM_CTS; uart_handle_cts_change(&sport->port, usr1); wake_up_interruptible(&sport->port.state->port.delta_msr_wait); From 66029078fee00646e2e9dbb8f41ff7819f8e7569 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 20 Sep 2024 12:32:19 +0200 Subject: [PATCH 102/125] parport: Proper fix for array out-of-bounds access commit 02ac3a9ef3a18b58d8f3ea2b6e46de657bf6c4f9 upstream. The recent fix for array out-of-bounds accesses replaced sprintf() calls blindly with snprintf(). However, since snprintf() returns the would-be-printed size, not the actually output size, the length calculation can still go over the given limit. Use scnprintf() instead of snprintf(), which returns the actually output letters, for addressing the potential out-of-bounds access properly. Fixes: ab11dac93d2d ("dev/parport: fix the array out-of-bounds risk") Cc: stable@vger.kernel.org Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20240920103318.19271-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/parport/procfs.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index c334ef6e3b3f..3b7d7e23602a 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -58,12 +58,12 @@ static int do_active_device(struct ctl_table *table, int write, for (dev = port->devices; dev ; dev = dev->next) { if(dev == port->cad) { - len += snprintf(buffer, sizeof(buffer), "%s\n", dev->name); + len += scnprintf(buffer, sizeof(buffer), "%s\n", dev->name); } } if(!len) { - len += snprintf(buffer, sizeof(buffer), "%s\n", "none"); + len += scnprintf(buffer, sizeof(buffer), "%s\n", "none"); } if (len > *lenp) @@ -94,19 +94,19 @@ static int do_autoprobe(struct ctl_table *table, int write, } if ((str = info->class_name) != NULL) - len += snprintf (buffer + len, sizeof(buffer) - len, "CLASS:%s;\n", str); + len += scnprintf (buffer + len, sizeof(buffer) - len, "CLASS:%s;\n", str); if ((str = info->model) != NULL) - len += snprintf (buffer + len, sizeof(buffer) - len, "MODEL:%s;\n", str); + len += scnprintf (buffer + len, sizeof(buffer) - len, "MODEL:%s;\n", str); if ((str = info->mfr) != NULL) - len += snprintf (buffer + len, sizeof(buffer) - len, "MANUFACTURER:%s;\n", str); + len += scnprintf (buffer + len, sizeof(buffer) - len, "MANUFACTURER:%s;\n", str); if ((str = info->description) != NULL) - len += snprintf (buffer + len, sizeof(buffer) - len, "DESCRIPTION:%s;\n", str); + len += scnprintf (buffer + len, sizeof(buffer) - len, "DESCRIPTION:%s;\n", str); if ((str = info->cmdset) != NULL) - len += snprintf (buffer + len, sizeof(buffer) - len, "COMMAND SET:%s;\n", str); + len += scnprintf (buffer + len, sizeof(buffer) - len, "COMMAND SET:%s;\n", str); if (len > *lenp) len = *lenp; @@ -135,7 +135,7 @@ static int do_hardware_base_addr(struct ctl_table *table, int write, if (write) /* permissions prevent this anyway */ return -EACCES; - len += snprintf (buffer, sizeof(buffer), "%lu\t%lu\n", port->base, port->base_hi); + len += scnprintf (buffer, sizeof(buffer), "%lu\t%lu\n", port->base, port->base_hi); if (len > *lenp) len = *lenp; @@ -162,7 +162,7 @@ static int do_hardware_irq(struct ctl_table *table, int write, if (write) /* permissions prevent this anyway */ return -EACCES; - len += snprintf (buffer, sizeof(buffer), "%d\n", port->irq); + len += scnprintf (buffer, sizeof(buffer), "%d\n", port->irq); if (len > *lenp) len = *lenp; @@ -189,7 +189,7 @@ static int do_hardware_dma(struct ctl_table *table, int write, if (write) /* permissions prevent this anyway */ return -EACCES; - len += snprintf (buffer, sizeof(buffer), "%d\n", port->dma); + len += scnprintf (buffer, sizeof(buffer), "%d\n", port->dma); if (len > *lenp) len = *lenp; @@ -220,7 +220,7 @@ static int do_hardware_modes(struct ctl_table *table, int write, #define printmode(x) \ do { \ if (port->modes & PARPORT_MODE_##x) \ - len += snprintf(buffer + len, sizeof(buffer) - len, "%s%s", f++ ? "," : "", #x); \ + len += scnprintf(buffer + len, sizeof(buffer) - len, "%s%s", f++ ? "," : "", #x); \ } while (0) int f = 0; printmode(PCSPP); From 16d7d35f1c1e2a3a5cb14c91fe0f1922068942c9 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 17 Sep 2024 09:02:53 -0700 Subject: [PATCH 103/125] x86/resctrl: Annotate get_mem_config() functions as __init commit d5fd042bf4cfb557981d65628e1779a492cd8cfa upstream. After a recent LLVM change [1] that deduces __cold on functions that only call cold code (such as __init functions), there is a section mismatch warning from __get_mem_config_intel(), which got moved to .text.unlikely. as a result of that optimization: WARNING: modpost: vmlinux: section mismatch in reference: \ __get_mem_config_intel+0x77 (section: .text.unlikely.) -> thread_throttle_mode_init (section: .init.text) Mark __get_mem_config_intel() as __init as well since it is only called from __init code, which clears up the warning. While __rdt_get_mem_config_amd() does not exhibit a warning because it does not call any __init code, it is a similar function that is only called from __init code like __get_mem_config_intel(), so mark it __init as well to keep the code symmetrical. CONFIG_SECTION_MISMATCH_WARN_ONLY=n would turn this into a fatal error. Fixes: 05b93417ce5b ("x86/intel_rdt/mba: Add primary support for Memory Bandwidth Allocation (MBA)") Fixes: 4d05bf71f157 ("x86/resctrl: Introduce AMD QOS feature") Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Cc: Link: https://github.com/llvm/llvm-project/commit/6b11573b8c5e3d36beee099dbe7347c2a007bf53 [1] Link: https://lore.kernel.org/r/20240917-x86-restctrl-get_mem_config_intel-init-v3-1-10d521256284@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 8073fd304293..10830995eada 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -193,7 +193,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) return false; } -static bool __get_mem_config_intel(struct rdt_resource *r) +static __init bool __get_mem_config_intel(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; @@ -227,7 +227,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) return true; } -static bool __rdt_get_mem_config_amd(struct rdt_resource *r) +static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); u32 eax, ebx, ecx, edx, subleaf; From 6663f0c658857e9ebb87ed6b98c0126bbaa2bf46 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 15 Oct 2024 14:15:22 +0800 Subject: [PATCH 104/125] x86/apic: Always explicitly disarm TSC-deadline timer commit ffd95846c6ec6cf1f93da411ea10d504036cab42 upstream. New processors have become pickier about the local APIC timer state before entering low power modes. These low power modes are used (for example) when you close your laptop lid and suspend. If you put your laptop in a bag and it is not in this low power mode, it is likely to get quite toasty while it quickly sucks the battery dry. The problem boils down to some CPUs' inability to power down until the CPU recognizes that the local APIC timer is shut down. The current kernel code works in one-shot and periodic modes but does not work for deadline mode. Deadline mode has been the supported and preferred mode on Intel CPUs for over a decade and uses an MSR to drive the timer instead of an APIC register. Disable the TSC Deadline timer in lapic_timer_shutdown() by writing to MSR_IA32_TSC_DEADLINE when in TSC-deadline mode. Also avoid writing to the initial-count register (APIC_TMICT) which is ignored in TSC-deadline mode. Note: The APIC_LVTT|=APIC_LVT_MASKED operation should theoretically be enough to tell the hardware that the timer will not fire in any of the timer modes. But mitigating AMD erratum 411[1] also requires clearing out APIC_TMICT. Solely setting APIC_LVT_MASKED is also ineffective in practice on Intel Lunar Lake systems, which is the motivation for this change. 1. 411 Processor May Exit Message-Triggered C1E State Without an Interrupt if Local APIC Timer Reaches Zero - https://www.amd.com/content/dam/amd/en/documents/archived-tech-docs/revision-guides/41322_10h_Rev_Gd.pdf Fixes: 279f1461432c ("x86: apic: Use tsc deadline for oneshot when available") Suggested-by: Dave Hansen Signed-off-by: Zhang Rui Signed-off-by: Dave Hansen Reviewed-by: Rafael J. Wysocki Tested-by: Srinivas Pandruvada Tested-by: Todd Brandt Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241015061522.25288-1-rui.zhang%40intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/apic.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2e605ee75d9..00ca9c3c1d8b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -473,7 +473,19 @@ static int lapic_timer_shutdown(struct clock_event_device *evt) v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); + + /* + * Setting APIC_LVT_MASKED (above) should be enough to tell + * the hardware that this timer will never fire. But AMD + * erratum 411 and some Intel CPU behavior circa 2024 say + * otherwise. Time for belt and suspenders programming: mask + * the timer _and_ zero the counter registers: + */ + if (v & APIC_LVT_TIMER_TSCDEADLINE) + wrmsrl(MSR_IA32_TSC_DEADLINE, 0); + else + apic_write(APIC_TMICT, 0); + return 0; } From c8170b5ddc411201f8de0a654901f204a2cdd415 Mon Sep 17 00:00:00 2001 From: John Allen Date: Mon, 23 Sep 2024 16:44:04 +0000 Subject: [PATCH 105/125] x86/CPU/AMD: Only apply Zenbleed fix for Zen2 during late microcode load commit ee4d4e8d2c3bec6ee652599ab31991055a72c322 upstream. Commit f69759be251d ("x86/CPU/AMD: Move Zenbleed check to the Zen2 init function") causes a bit in the DE_CFG MSR to get set erroneously after a microcode late load. The microcode late load path calls into amd_check_microcode() and subsequently zen2_zenbleed_check(). Since the above commit removes the cpu_has_amd_erratum() call from zen2_zenbleed_check(), this will cause all non-Zen2 CPUs to go through the function and set the bit in the DE_CFG MSR. Call into the Zenbleed fix path on Zen2 CPUs only. [ bp: Massage commit message, use cpu_feature_enabled(). ] Fixes: f69759be251d ("x86/CPU/AMD: Move Zenbleed check to the Zen2 init function") Signed-off-by: John Allen Signed-off-by: Borislav Petkov (AMD) Acked-by: Borislav Petkov (AMD) Cc: Link: https://lore.kernel.org/r/20240923164404.27227-1-john.allen@amd.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/amd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9fd91022d92d..145c81c68394 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1374,7 +1374,8 @@ void amd_check_microcode(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return; - on_each_cpu(zenbleed_check_cpu, NULL, 1); + if (cpu_feature_enabled(X86_FEATURE_ZEN2)) + on_each_cpu(zenbleed_check_cpu, NULL, 1); } /* From 0c6a7e2c6012cee8273f2b7c1a335a609f8a73b7 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 25 Sep 2024 15:25:38 -0700 Subject: [PATCH 106/125] x86/entry_32: Do not clobber user EFLAGS.ZF commit 2e2e5143d4868163d6756c8c6a4d28cbfa5245e5 upstream. Opportunistic SYSEXIT executes VERW to clear CPU buffers after user EFLAGS are restored. This can clobber user EFLAGS.ZF. Move CLEAR_CPU_BUFFERS before the user EFLAGS are restored. This ensures that the user EFLAGS.ZF is not clobbered. Closes: https://lore.kernel.org/lkml/yVXwe8gvgmPADpRB6lXlicS2fcHoV5OHHxyuFbB_MEleRPD7-KhGe5VtORejtPe-KCkT8Uhcg5d7-IBw4Ojb4H7z5LQxoZylSmJ8KNL3A8o=@protonmail.com/ Fixes: a0e2dab44d22 ("x86/entry_32: Add VERW just before userspace transition") Reported-by: Jari Ruusu Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20240925-fix-dosemu-vm86-v7-1-1de0daca2d42%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 74a4358c7f45..7722a1a1a8ae 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -875,6 +875,8 @@ SYM_FUNC_START(entry_SYSENTER_32) /* Now ready to switch the cr3 */ SWITCH_TO_USER_CR3 scratch_reg=%eax + /* Clobbers ZF */ + CLEAR_CPU_BUFFERS /* * Restore all flags except IF. (We restore IF separately because @@ -885,7 +887,6 @@ SYM_FUNC_START(entry_SYSENTER_32) BUG_IF_WRONG_CR3 no_user_check=1 popfl popl %eax - CLEAR_CPU_BUFFERS /* * Return back to the vDSO, which will pop ecx and edx. From 227358e89703c344008119be7e8ffa3fdb5b92de Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 25 Sep 2024 15:25:44 -0700 Subject: [PATCH 107/125] x86/entry_32: Clear CPU buffers after register restore in NMI return commit 48a2440d0f20c826b884e04377ccc1e4696c84e9 upstream. CPU buffers are currently cleared after call to exc_nmi, but before register state is restored. This may be okay for MDS mitigation but not for RDFS. Because RDFS mitigation requires CPU buffers to be cleared when registers don't have any sensitive data. Move CLEAR_CPU_BUFFERS after RESTORE_ALL_NMI. Fixes: a0e2dab44d22 ("x86/entry_32: Add VERW just before userspace transition") Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20240925-fix-dosemu-vm86-v7-2-1de0daca2d42%40linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7722a1a1a8ae..3894acc54b79 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1149,7 +1149,6 @@ SYM_CODE_START(asm_exc_nmi) /* Not on SYSENTER stack. */ call exc_nmi - CLEAR_CPU_BUFFERS jmp .Lnmi_return .Lnmi_from_sysenter_stack: @@ -1170,6 +1169,7 @@ SYM_CODE_START(asm_exc_nmi) CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 + CLEAR_CPU_BUFFERS jmp .Lirq_return #ifdef CONFIG_X86_ESPFIX32 @@ -1211,6 +1211,7 @@ SYM_CODE_START(asm_exc_nmi) * 1 - orig_ax */ lss (1+5+6)*4(%esp), %esp # back to espfix stack + CLEAR_CPU_BUFFERS jmp .Lirq_return #endif SYM_CODE_END(asm_exc_nmi) From c29f192e0d44cc1cbaf698fa1ff198f63556691a Mon Sep 17 00:00:00 2001 From: Longlong Xia Date: Thu, 26 Sep 2024 21:02:13 +0800 Subject: [PATCH 108/125] tty: n_gsm: Fix use-after-free in gsm_cleanup_mux commit 9462f4ca56e7d2430fdb6dcc8498244acbfc4489 upstream. BUG: KASAN: slab-use-after-free in gsm_cleanup_mux+0x77b/0x7b0 drivers/tty/n_gsm.c:3160 [n_gsm] Read of size 8 at addr ffff88815fe99c00 by task poc/3379 CPU: 0 UID: 0 PID: 3379 Comm: poc Not tainted 6.11.0+ #56 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Call Trace: gsm_cleanup_mux+0x77b/0x7b0 drivers/tty/n_gsm.c:3160 [n_gsm] __pfx_gsm_cleanup_mux+0x10/0x10 drivers/tty/n_gsm.c:3124 [n_gsm] __pfx_sched_clock_cpu+0x10/0x10 kernel/sched/clock.c:389 update_load_avg+0x1c1/0x27b0 kernel/sched/fair.c:4500 __pfx_min_vruntime_cb_rotate+0x10/0x10 kernel/sched/fair.c:846 __rb_insert_augmented+0x492/0xbf0 lib/rbtree.c:161 gsmld_ioctl+0x395/0x1450 drivers/tty/n_gsm.c:3408 [n_gsm] _raw_spin_lock_irqsave+0x92/0xf0 arch/x86/include/asm/atomic.h:107 __pfx_gsmld_ioctl+0x10/0x10 drivers/tty/n_gsm.c:3822 [n_gsm] ktime_get+0x5e/0x140 kernel/time/timekeeping.c:195 ldsem_down_read+0x94/0x4e0 arch/x86/include/asm/atomic64_64.h:79 __pfx_ldsem_down_read+0x10/0x10 drivers/tty/tty_ldsem.c:338 __pfx_do_vfs_ioctl+0x10/0x10 fs/ioctl.c:805 tty_ioctl+0x643/0x1100 drivers/tty/tty_io.c:2818 Allocated by task 65: gsm_data_alloc.constprop.0+0x27/0x190 drivers/tty/n_gsm.c:926 [n_gsm] gsm_send+0x2c/0x580 drivers/tty/n_gsm.c:819 [n_gsm] gsm1_receive+0x547/0xad0 drivers/tty/n_gsm.c:3038 [n_gsm] gsmld_receive_buf+0x176/0x280 drivers/tty/n_gsm.c:3609 [n_gsm] tty_ldisc_receive_buf+0x101/0x1e0 drivers/tty/tty_buffer.c:391 tty_port_default_receive_buf+0x61/0xa0 drivers/tty/tty_port.c:39 flush_to_ldisc+0x1b0/0x750 drivers/tty/tty_buffer.c:445 process_scheduled_works+0x2b0/0x10d0 kernel/workqueue.c:3229 worker_thread+0x3dc/0x950 kernel/workqueue.c:3391 kthread+0x2a3/0x370 kernel/kthread.c:389 ret_from_fork+0x2d/0x70 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:257 Freed by task 3367: kfree+0x126/0x420 mm/slub.c:4580 gsm_cleanup_mux+0x36c/0x7b0 drivers/tty/n_gsm.c:3160 [n_gsm] gsmld_ioctl+0x395/0x1450 drivers/tty/n_gsm.c:3408 [n_gsm] tty_ioctl+0x643/0x1100 drivers/tty/tty_io.c:2818 [Analysis] gsm_msg on the tx_ctrl_list or tx_data_list of gsm_mux can be freed by multi threads through ioctl,which leads to the occurrence of uaf. Protect it by gsm tx lock. Signed-off-by: Longlong Xia Cc: stable Suggested-by: Jiri Slaby Link: https://lore.kernel.org/r/20240926130213.531959-1-xialonglong@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 5fc8540a83e3..8559ba1361c6 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -3156,6 +3156,8 @@ static void gsm_cleanup_mux(struct gsm_mux *gsm, bool disc) mutex_unlock(&gsm->mutex); /* Now wipe the queues */ tty_ldisc_flush(gsm->tty); + + guard(spinlock_irqsave)(&gsm->tx_lock); list_for_each_entry_safe(txq, ntxq, &gsm->tx_ctrl_list, list) kfree(txq); INIT_LIST_HEAD(&gsm->tx_ctrl_list); From 481b477ab63c7245715a3e57ba79eb87c2dc0d02 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 26 Sep 2024 09:10:31 -0700 Subject: [PATCH 109/125] x86/bugs: Use code segment selector for VERW operand commit e4d2102018542e3ae5e297bc6e229303abff8a0f upstream. Robert Gill reported below #GP in 32-bit mode when dosemu software was executing vm86() system call: general protection fault: 0000 [#1] PREEMPT SMP CPU: 4 PID: 4610 Comm: dosemu.bin Not tainted 6.6.21-gentoo-x86 #1 Hardware name: Dell Inc. PowerEdge 1950/0H723K, BIOS 2.7.0 10/30/2010 EIP: restore_all_switch_stack+0xbe/0xcf EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000000 ESI: 00000000 EDI: 00000000 EBP: 00000000 ESP: ff8affdc DS: 0000 ES: 0000 FS: 0000 GS: 0033 SS: 0068 EFLAGS: 00010046 CR0: 80050033 CR2: 00c2101c CR3: 04b6d000 CR4: 000406d0 Call Trace: show_regs+0x70/0x78 die_addr+0x29/0x70 exc_general_protection+0x13c/0x348 exc_bounds+0x98/0x98 handle_exception+0x14d/0x14d exc_bounds+0x98/0x98 restore_all_switch_stack+0xbe/0xcf exc_bounds+0x98/0x98 restore_all_switch_stack+0xbe/0xcf This only happens in 32-bit mode when VERW based mitigations like MDS/RFDS are enabled. This is because segment registers with an arbitrary user value can result in #GP when executing VERW. Intel SDM vol. 2C documents the following behavior for VERW instruction: #GP(0) - If a memory operand effective address is outside the CS, DS, ES, FS, or GS segment limit. CLEAR_CPU_BUFFERS macro executes VERW instruction before returning to user space. Use %cs selector to reference VERW operand. This ensures VERW will not #GP for an arbitrary user %ds. [ mingo: Fixed the SOB chain. ] Fixes: a0e2dab44d22 ("x86/entry_32: Add VERW just before userspace transition") Reported-by: Robert Gill Reviewed-by: Andrew Cooper Suggested-by: Brian Gerst Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index a8781c8763b4..ee642d26e304 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -332,7 +332,16 @@ * Note: Only the memory operand variant of VERW clears the CPU buffers. */ .macro CLEAR_CPU_BUFFERS - ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF +#ifdef CONFIG_X86_64 + ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF +#else + /* + * In 32bit mode, the memory operand must be a %cs reference. The data + * segments may not be usable (vm86 mode), and the stack segment may not + * be flat (ESPFIX32). + */ + ALTERNATIVE "", "verw %cs:mds_verw_sel", X86_FEATURE_CLEAR_CPU_BUF +#endif .endm #ifdef CONFIG_X86_64 From 20728e86289ab463b99b7ab4425515bd26aba417 Mon Sep 17 00:00:00 2001 From: Sergey Matsievskiy Date: Sat, 12 Oct 2024 13:57:43 +0300 Subject: [PATCH 110/125] pinctrl: ocelot: fix system hang on level based interrupts commit 93b8ddc54507a227087c60a0013ed833b6ae7d3c upstream. The current implementation only calls chained_irq_enter() and chained_irq_exit() if it detects pending interrupts. ``` for (i = 0; i < info->stride; i++) { uregmap_read(info->map, id_reg + 4 * i, ®); if (!reg) continue; chained_irq_enter(parent_chip, desc); ``` However, in case of GPIO pin configured in level mode and the parent controller configured in edge mode, GPIO interrupt might be lowered by the hardware. In the result, if the interrupt is short enough, the parent interrupt is still pending while the GPIO interrupt is cleared; chained_irq_enter() never gets called and the system hangs trying to service the parent interrupt. Moving chained_irq_enter() and chained_irq_exit() outside the for loop ensures that they are called even when GPIO interrupt is lowered by the hardware. The similar code with chained_irq_enter() / chained_irq_exit() functions wrapping interrupt checking loop may be found in many other drivers: ``` grep -r -A 10 chained_irq_enter drivers/pinctrl ``` Cc: stable@vger.kernel.org Signed-off-by: Sergey Matsievskiy Reviewed-by: Alexandre Belloni Link: https://lore.kernel.org/20241012105743.12450-2-matsievskiysv@gmail.com Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-ocelot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index f8ae2e974221..8d26c8061c77 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -1962,21 +1962,21 @@ static void ocelot_irq_handler(struct irq_desc *desc) unsigned int reg = 0, irq, i; unsigned long irqs; + chained_irq_enter(parent_chip, desc); + for (i = 0; i < info->stride; i++) { regmap_read(info->map, id_reg + 4 * i, ®); if (!reg) continue; - chained_irq_enter(parent_chip, desc); - irqs = reg; for_each_set_bit(irq, &irqs, min(32U, info->desc->npins - 32 * i)) generic_handle_domain_irq(chip->irq.domain, irq + 32 * i); - - chained_irq_exit(parent_chip, desc); } + + chained_irq_exit(parent_chip, desc); } static int ocelot_gpiochip_register(struct platform_device *pdev, From 3b36bb1fca2b87f6292ca2a8593f297c5e9fab41 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Fri, 6 Sep 2024 18:03:26 +0800 Subject: [PATCH 111/125] pinctrl: stm32: check devm_kasprintf() returned value commit b0f0e3f0552a566def55c844b0d44250c58e4df6 upstream. devm_kasprintf() can return a NULL pointer on failure but this returned value is not checked. Fix this lack and check the returned value. Found by code review. Cc: stable@vger.kernel.org Fixes: 32c170ff15b0 ("pinctrl: stm32: set default gpio line names using pin names") Signed-off-by: Ma Ke Link: https://lore.kernel.org/20240906100326.624445-1-make24@iscas.ac.cn Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/stm32/pinctrl-stm32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 346a31f31bba..5e91def60784 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1387,10 +1387,15 @@ static int stm32_gpiolib_register_bank(struct stm32_pinctrl *pctl, struct fwnode for (i = 0; i < npins; i++) { stm32_pin = stm32_pctrl_get_desc_pin_from_gpio(pctl, bank, i); - if (stm32_pin && stm32_pin->pin.name) + if (stm32_pin && stm32_pin->pin.name) { names[i] = devm_kasprintf(dev, GFP_KERNEL, "%s", stm32_pin->pin.name); - else + if (!names[i]) { + err = -ENOMEM; + goto err_clk; + } + } else { names[i] = NULL; + } } bank->gpio_chip.names = (const char * const *)names; From fad940e2dd789155f99ecafa71a7baf6f96530bc Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Thu, 5 Sep 2024 10:09:17 +0800 Subject: [PATCH 112/125] pinctrl: apple: check devm_kasprintf() returned value commit 665a58fe663ac7a9ea618dc0b29881649324b116 upstream. devm_kasprintf() can return a NULL pointer on failure but this returned value is not checked. Fix this lack and check the returned value. Found by code review. Cc: stable@vger.kernel.org Fixes: a0f160ffcb83 ("pinctrl: add pinctrl/GPIO driver for Apple SoCs") Signed-off-by: Ma Ke Reviewed-by: Christophe JAILLET Link: https://lore.kernel.org/20240905020917.356534-1-make24@iscas.ac.cn Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-apple-gpio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pinctrl/pinctrl-apple-gpio.c b/drivers/pinctrl/pinctrl-apple-gpio.c index 3751c7de37aa..f861e63f4115 100644 --- a/drivers/pinctrl/pinctrl-apple-gpio.c +++ b/drivers/pinctrl/pinctrl-apple-gpio.c @@ -474,6 +474,9 @@ static int apple_gpio_pinctrl_probe(struct platform_device *pdev) for (i = 0; i < npins; i++) { pins[i].number = i; pins[i].name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "PIN%u", i); + if (!pins[i].name) + return -ENOMEM; + pins[i].drv_data = pctl; pin_names[i] = pins[i].name; pin_nums[i] = i; From 01282ab5182f85e42234df2ff42f0ce790f465ff Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Oct 2024 21:49:59 +0100 Subject: [PATCH 113/125] irqchip/gic-v4: Don't allow a VMOVP on a dying VPE commit 1442ee0011983f0c5c4b92380e6853afb513841a upstream. Kunkun Jiang reported that there is a small window of opportunity for userspace to force a change of affinity for a VPE while the VPE has already been unmapped, but the corresponding doorbell interrupt still visible in /proc/irq/. Plug the race by checking the value of vmapp_count, which tracks whether the VPE is mapped ot not, and returning an error in this case. This involves making vmapp_count common to both GICv4.1 and its v4.0 ancestor. Fixes: 64edfaa9a234 ("irqchip/gic-v4.1: Implement the v4.1 flavour of VMAPP") Reported-by: Kunkun Jiang Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/c182ece6-2ba0-ce4f-3404-dba7a3ab6c52@huawei.com Link: https://lore.kernel.org/all/20241002204959.2051709-1-maz@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-gic-v3-its.c | 18 ++++++++++++------ include/linux/irqchip/arm-gic-v4.h | 4 +++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index f04a2c3a45e1..b1e60c13c1e1 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -796,8 +796,8 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, its_encode_valid(cmd, desc->its_vmapp_cmd.valid); if (!desc->its_vmapp_cmd.valid) { + alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); if (is_v4_1(its)) { - alloc = !atomic_dec_return(&desc->its_vmapp_cmd.vpe->vmapp_count); its_encode_alloc(cmd, alloc); /* * Unmapping a VPE is self-synchronizing on GICv4.1, @@ -816,13 +816,13 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, its_encode_vpt_addr(cmd, vpt_addr); its_encode_vpt_size(cmd, LPI_NRBITS - 1); + alloc = !atomic_fetch_inc(&desc->its_vmapp_cmd.vpe->vmapp_count); + if (!is_v4_1(its)) goto out; vconf_addr = virt_to_phys(page_address(desc->its_vmapp_cmd.vpe->its_vm->vprop_page)); - alloc = !atomic_fetch_inc(&desc->its_vmapp_cmd.vpe->vmapp_count); - its_encode_alloc(cmd, alloc); /* @@ -3816,6 +3816,13 @@ static int its_vpe_set_affinity(struct irq_data *d, unsigned long flags; int from, cpu; + /* + * Check if we're racing against a VPE being destroyed, for + * which we don't want to allow a VMOVP. + */ + if (!atomic_read(&vpe->vmapp_count)) + return -EINVAL; + /* * Changing affinity is mega expensive, so let's be as lazy as * we can and only do it if we really have to. Also, if mapped @@ -4456,9 +4463,8 @@ static int its_vpe_init(struct its_vpe *vpe) raw_spin_lock_init(&vpe->vpe_lock); vpe->vpe_id = vpe_id; vpe->vpt_page = vpt_page; - if (gic_rdists->has_rvpeid) - atomic_set(&vpe->vmapp_count, 0); - else + atomic_set(&vpe->vmapp_count, 0); + if (!gic_rdists->has_rvpeid) vpe->vpe_proxy_event = -1; return 0; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 2c63375bbd43..bf9e0640288d 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -58,10 +58,12 @@ struct its_vpe { bool enabled; bool group; } sgi_config[16]; - atomic_t vmapp_count; }; }; + /* Track the VPE being mapped */ + atomic_t vmapp_count; + /* * Ensures mutual exclusion between affinity setting of the * vPE and vLPI operations using vpe->col_idx. From 6acd19ad3aaf270b19e704b5cff14808766fca84 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 3 Oct 2024 10:41:52 +0200 Subject: [PATCH 114/125] irqchip/sifive-plic: Unmask interrupt in plic_irq_enable() commit 6b1e0651e9ce8ce418ad4ff360e7b9925dc5da79 upstream. It is possible that an interrupt is disabled and masked at the same time. When the interrupt is enabled again by enable_irq(), only plic_irq_enable() is called, not plic_irq_unmask(). The interrupt remains masked and never raises. An example where interrupt is both disabled and masked is when handle_fasteoi_irq() is the handler, and IRQS_ONESHOT is set. The interrupt handler: 1. Mask the interrupt 2. Handle the interrupt 3. Check if interrupt is still enabled, and unmask it (see cond_unmask_eoi_irq()) If another task disables the interrupt in the middle of the above steps, the interrupt will not get unmasked, and will remain masked when it is enabled in the future. The problem is occasionally observed when PREEMPT_RT is enabled, because PREEMPT_RT adds the IRQS_ONESHOT flag. But PREEMPT_RT only makes the problem more likely to appear, the bug has been around since commit a1706a1c5062 ("irqchip/sifive-plic: Separate the enable and mask operations"). Fix it by unmasking interrupt in plic_irq_enable(). Fixes: a1706a1c5062 ("irqchip/sifive-plic: Separate the enable and mask operations") Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241003084152.2422969-1-namcao@linutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-sifive-plic.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index bf0b40b0fad4..572899669154 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -120,16 +120,6 @@ static inline void plic_irq_toggle(const struct cpumask *mask, } } -static void plic_irq_enable(struct irq_data *d) -{ - plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 1); -} - -static void plic_irq_disable(struct irq_data *d) -{ - plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 0); -} - static void plic_irq_unmask(struct irq_data *d) { struct plic_priv *priv = irq_data_get_irq_chip_data(d); @@ -144,6 +134,17 @@ static void plic_irq_mask(struct irq_data *d) writel(0, priv->regs + PRIORITY_BASE + d->hwirq * PRIORITY_PER_ID); } +static void plic_irq_enable(struct irq_data *d) +{ + plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 1); + plic_irq_unmask(d); +} + +static void plic_irq_disable(struct irq_data *d) +{ + plic_irq_toggle(irq_data_get_effective_affinity_mask(d), d, 0); +} + static void plic_irq_eoi(struct irq_data *d) { struct plic_handler *handler = this_cpu_ptr(&plic_handlers); From 90e3f819e6aacfcf5b297d4769744f42b5a749c8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2024 16:51:02 +0200 Subject: [PATCH 115/125] serial: qcom-geni: fix polled console initialisation commit 4bef7c6f299910f19876ad8e7f5897514855f1d2 upstream. The polled console (KGDB/KDB) implementation must not call port setup unconditionally as the port may already be in use by the console or a getty. Only make sure that the receiver is enabled, but do not enable any device interrupts. Fixes: d8851a96ba25 ("tty: serial: qcom-geni-serial: Add a poll_init() function") Cc: stable@vger.kernel.org # 6.4 Cc: Douglas Anderson Signed-off-by: Johan Hovold Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20241009145110.16847-2-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/qcom_geni_serial.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index f798ef3c4130..6ebe83db0bc4 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -144,6 +144,8 @@ static const struct uart_ops qcom_geni_uart_pops; static struct uart_driver qcom_geni_console_driver; static struct uart_driver qcom_geni_uart_driver; +static int qcom_geni_serial_port_setup(struct uart_port *uport); + static inline struct qcom_geni_serial_port *to_dev_port(struct uart_port *uport) { return container_of(uport, struct qcom_geni_serial_port, uport); @@ -385,6 +387,23 @@ static void qcom_geni_serial_poll_put_char(struct uart_port *uport, writel(M_TX_FIFO_WATERMARK_EN, uport->membase + SE_GENI_M_IRQ_CLEAR); qcom_geni_serial_poll_tx_done(uport); } + +static int qcom_geni_serial_poll_init(struct uart_port *uport) +{ + struct qcom_geni_serial_port *port = to_dev_port(uport); + int ret; + + if (!port->setup) { + ret = qcom_geni_serial_port_setup(uport); + if (ret) + return ret; + } + + if (!qcom_geni_serial_secondary_active(uport)) + geni_se_setup_s_cmd(&port->se, UART_START_READ, 0); + + return 0; +} #endif #ifdef CONFIG_SERIAL_QCOM_GENI_CONSOLE @@ -1544,7 +1563,7 @@ static const struct uart_ops qcom_geni_console_pops = { #ifdef CONFIG_CONSOLE_POLL .poll_get_char = qcom_geni_serial_get_char, .poll_put_char = qcom_geni_serial_poll_put_char, - .poll_init = qcom_geni_serial_port_setup, + .poll_init = qcom_geni_serial_poll_init, #endif .pm = qcom_geni_serial_pm, }; From 7176aee5a1b20ae4da1082d6fc59a3b58383f079 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2024 16:51:03 +0200 Subject: [PATCH 116/125] serial: qcom-geni: revert broken hibernation support commit 19df76662a33d2f2fc41a66607cb8285fc02d6ec upstream. This reverts commit 35781d8356a2eecaa6074ceeb80ee22e252fcdae. Hibernation is not supported on Qualcomm platforms with mainline kernels yet a broken vendor implementation for the GENI serial driver made it upstream. This is effectively dead code that cannot be tested and should just be removed, but if these paths were ever hit for an open non-console port they would crash the machine as the driver would fail to enable clocks during restore() (i.e. all ports would have to be closed by drivers and user space before hibernating the system to avoid this as a comment in the code hinted at). The broken implementation also added a random call to enable the receiver in the port setup code where it does not belong and which enables the receiver prematurely for console ports. Fixes: 35781d8356a2 ("tty: serial: qcom-geni-serial: Add support for Hibernation feature") Cc: stable@vger.kernel.org # 6.2 Cc: Aniket Randive Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20241009145110.16847-3-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/qcom_geni_serial.c | 41 ++------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index 6ebe83db0bc4..bac50f31ac67 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -1134,7 +1134,6 @@ static int qcom_geni_serial_port_setup(struct uart_port *uport) false, true, true); geni_se_init(&port->se, UART_RX_WM, port->rx_fifo_depth - 2); geni_se_select_mode(&port->se, port->dev_data->mode); - qcom_geni_serial_start_rx(uport); port->setup = true; return 0; @@ -1764,38 +1763,6 @@ static int qcom_geni_serial_sys_resume(struct device *dev) return ret; } -static int qcom_geni_serial_sys_hib_resume(struct device *dev) -{ - int ret = 0; - struct uart_port *uport; - struct qcom_geni_private_data *private_data; - struct qcom_geni_serial_port *port = dev_get_drvdata(dev); - - uport = &port->uport; - private_data = uport->private_data; - - if (uart_console(uport)) { - geni_icc_set_tag(&port->se, QCOM_ICC_TAG_ALWAYS); - geni_icc_set_bw(&port->se); - ret = uart_resume_port(private_data->drv, uport); - /* - * For hibernation usecase clients for - * console UART won't call port setup during restore, - * hence call port setup for console uart. - */ - qcom_geni_serial_port_setup(uport); - } else { - /* - * Peripheral register settings are lost during hibernation. - * Update setup flag such that port setup happens again - * during next session. Clients of HS-UART will close and - * open the port during hibernation. - */ - port->setup = false; - } - return ret; -} - static const struct qcom_geni_device_data qcom_geni_console_data = { .console = true, .mode = GENI_SE_FIFO, @@ -1807,12 +1774,8 @@ static const struct qcom_geni_device_data qcom_geni_uart_data = { }; static const struct dev_pm_ops qcom_geni_serial_pm_ops = { - .suspend = pm_sleep_ptr(qcom_geni_serial_sys_suspend), - .resume = pm_sleep_ptr(qcom_geni_serial_sys_resume), - .freeze = pm_sleep_ptr(qcom_geni_serial_sys_suspend), - .poweroff = pm_sleep_ptr(qcom_geni_serial_sys_suspend), - .restore = pm_sleep_ptr(qcom_geni_serial_sys_hib_resume), - .thaw = pm_sleep_ptr(qcom_geni_serial_sys_hib_resume), + SYSTEM_SLEEP_PM_OPS(qcom_geni_serial_sys_suspend, + qcom_geni_serial_sys_resume) }; static const struct of_device_id qcom_geni_serial_match_table[] = { From 61c8f746ed831c4e8753bfbf0ccd08c20b13f83a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2024 16:51:05 +0200 Subject: [PATCH 117/125] serial: qcom-geni: fix dma rx cancellation commit 23ee4a25661c33e6381d41e848a9060ed6d72845 upstream. Make sure to wait for the DMA transfer to complete when cancelling the rx command on stop_rx(). This specifically prevents the DMA completion interrupt from firing after rx has been restarted, something which can lead to an IOMMU fault and hosed rx when the interrupt handler unmaps the DMA buffer for the new command: qcom_geni_serial 988000.serial: serial engine reports 0 RX bytes in! arm-smmu 15000000.iommu: FSR = 00000402 [Format=2 TF], SID=0x563 arm-smmu 15000000.iommu: FSYNR0 = 00210013 [S1CBNDX=33 WNR PLVL=3] Bluetooth: hci0: command 0xfc00 tx timeout Bluetooth: hci0: Reading QCA version information failed (-110) Also add the missing state machine reset which is needed in case cancellation fails. Fixes: 2aaa43c70778 ("tty: serial: qcom-geni-serial: add support for serial engine DMA") Cc: stable@vger.kernel.org # 6.3 Cc: Bartosz Golaszewski Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20241009145110.16847-5-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/qcom_geni_serial.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index bac50f31ac67..11c3fe8cc462 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -775,17 +775,27 @@ static void qcom_geni_serial_start_rx_fifo(struct uart_port *uport) static void qcom_geni_serial_stop_rx_dma(struct uart_port *uport) { struct qcom_geni_serial_port *port = to_dev_port(uport); + bool done; if (!qcom_geni_serial_secondary_active(uport)) return; geni_se_cancel_s_cmd(&port->se); - qcom_geni_serial_poll_bit(uport, SE_GENI_S_IRQ_STATUS, - S_CMD_CANCEL_EN, true); - - if (qcom_geni_serial_secondary_active(uport)) + done = qcom_geni_serial_poll_bit(uport, SE_DMA_RX_IRQ_STAT, + RX_EOT, true); + if (done) { + writel(RX_EOT | RX_DMA_DONE, + uport->membase + SE_DMA_RX_IRQ_CLR); + } else { qcom_geni_serial_abort_rx(uport); + writel(1, uport->membase + SE_DMA_RX_FSM_RST); + qcom_geni_serial_poll_bit(uport, SE_DMA_RX_IRQ_STAT, + RX_RESET_DONE, true); + writel(RX_RESET_DONE | RX_DMA_DONE, + uport->membase + SE_DMA_RX_IRQ_CLR); + } + if (port->rx_dma_addr) { geni_se_rx_dma_unprep(&port->se, port->rx_dma_addr, DMA_RX_BUF_SIZE); From 647cd4494cc301f019267b901ab5cf71f3e54d57 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 9 Oct 2024 16:51:06 +0200 Subject: [PATCH 118/125] serial: qcom-geni: fix receiver enable commit fa103d2599e11e802c818684cff821baefe7f206 upstream. The receiver is supposed to be enabled in the startup() callback and not in set_termios() which is called also during console setup. This specifically avoids accepting input before the port has been opened (and interrupts enabled), something which can also break the GENI firmware (cancel fails and after abort, the "stale" counter handling appears to be broken so that later input is not processed until twelve chars have been received). There also does not appear to be any need to keep the receiver disabled while updating the port settings. Since commit 6f3c3cafb115 ("serial: qcom-geni: disable interrupts during console writes") the calls to manipulate the secondary interrupts, which were done without holding the port lock, can also lead to the receiver being left disabled when set_termios() races with the console code (e.g. when init opens the tty during boot). This can manifest itself as a serial getty not accepting input. The calls to stop and start rx in set_termios() can similarly race with DMA completion and, for example, cause the DMA buffer to be unmapped twice or the mapping to be leaked. Fix this by only enabling the receiver during startup and while holding the port lock to avoid racing with the console code. Fixes: 6f3c3cafb115 ("serial: qcom-geni: disable interrupts during console writes") Fixes: 2aaa43c70778 ("tty: serial: qcom-geni-serial: add support for serial engine DMA") Fixes: c4f528795d1a ("tty: serial: msm_geni_serial: Add serial driver support for GENI based QUP") Cc: stable@vger.kernel.org # 6.3 Cc: Bartosz Golaszewski Signed-off-by: Johan Hovold Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20241009145110.16847-6-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/qcom_geni_serial.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index 11c3fe8cc462..f820a09cb5c3 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -1159,6 +1159,11 @@ static int qcom_geni_serial_startup(struct uart_port *uport) if (ret) return ret; } + + uart_port_lock_irq(uport); + qcom_geni_serial_start_rx(uport); + uart_port_unlock_irq(uport); + enable_irq(uport->irq); return 0; @@ -1244,7 +1249,6 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport, unsigned int avg_bw_core; unsigned long timeout; - qcom_geni_serial_stop_rx(uport); /* baud rate */ baud = uart_get_baud_rate(uport, termios, old, 300, 4000000); @@ -1260,7 +1264,7 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport, dev_err(port->se.dev, "Couldn't find suitable clock rate for %u\n", baud * sampling_rate); - goto out_restart_rx; + return; } dev_dbg(port->se.dev, "desired_rate = %u, clk_rate = %lu, clk_div = %u\n", @@ -1351,8 +1355,6 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport, writel(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN); writel(ser_clk_cfg, uport->membase + GENI_SER_M_CLK_CFG); writel(ser_clk_cfg, uport->membase + GENI_SER_S_CLK_CFG); -out_restart_rx: - qcom_geni_serial_start_rx(uport); } #ifdef CONFIG_SERIAL_QCOM_GENI_CONSOLE From 229dfdc36f31a8d47433438bc0e6e1662c4ab404 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Oct 2024 17:57:36 +0200 Subject: [PATCH 119/125] tcp: fix mptcp DSS corruption due to large pmtu xmit commit 4dabcdf581217e60690467a37c956a5b8dbc6bd9 upstream. Syzkaller was able to trigger a DSS corruption: TCP: request_sock_subflow_v4: Possible SYN flooding on port [::]:20002. Sending cookies. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 5227 at net/mptcp/protocol.c:695 __mptcp_move_skbs_from_subflow+0x20a9/0x21f0 net/mptcp/protocol.c:695 Modules linked in: CPU: 0 UID: 0 PID: 5227 Comm: syz-executor350 Not tainted 6.11.0-syzkaller-08829-gaf9c191ac2a0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 RIP: 0010:__mptcp_move_skbs_from_subflow+0x20a9/0x21f0 net/mptcp/protocol.c:695 Code: 0f b6 dc 31 ff 89 de e8 b5 dd ea f5 89 d8 48 81 c4 50 01 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc e8 98 da ea f5 90 <0f> 0b 90 e9 47 ff ff ff e8 8a da ea f5 90 0f 0b 90 e9 99 e0 ff ff RSP: 0018:ffffc90000006db8 EFLAGS: 00010246 RAX: ffffffff8ba9df18 RBX: 00000000000055f0 RCX: ffff888030023c00 RDX: 0000000000000100 RSI: 00000000000081e5 RDI: 00000000000055f0 RBP: 1ffff110062bf1ae R08: ffffffff8ba9cf12 R09: 1ffff110062bf1b8 R10: dffffc0000000000 R11: ffffed10062bf1b9 R12: 0000000000000000 R13: dffffc0000000000 R14: 00000000700cec61 R15: 00000000000081e5 FS: 000055556679c380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020287000 CR3: 0000000077892000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: move_skbs_to_msk net/mptcp/protocol.c:811 [inline] mptcp_data_ready+0x29c/0xa90 net/mptcp/protocol.c:854 subflow_data_ready+0x34a/0x920 net/mptcp/subflow.c:1490 tcp_data_queue+0x20fd/0x76c0 net/ipv4/tcp_input.c:5283 tcp_rcv_established+0xfba/0x2020 net/ipv4/tcp_input.c:6237 tcp_v4_do_rcv+0x96d/0xc70 net/ipv4/tcp_ipv4.c:1915 tcp_v4_rcv+0x2dc0/0x37f0 net/ipv4/tcp_ipv4.c:2350 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5662 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5775 process_backlog+0x662/0x15b0 net/core/dev.c:6107 __napi_poll+0xcb/0x490 net/core/dev.c:6771 napi_poll net/core/dev.c:6840 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:6962 handle_softirqs+0x2c5/0x980 kernel/softirq.c:554 do_softirq+0x11b/0x1e0 kernel/softirq.c:455 __local_bh_enable_ip+0x1bb/0x200 kernel/softirq.c:382 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:919 [inline] __dev_queue_xmit+0x1764/0x3e80 net/core/dev.c:4451 dev_queue_xmit include/linux/netdevice.h:3094 [inline] neigh_hh_output include/net/neighbour.h:526 [inline] neigh_output include/net/neighbour.h:540 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:236 ip_local_out net/ipv4/ip_output.c:130 [inline] __ip_queue_xmit+0x118c/0x1b80 net/ipv4/ip_output.c:536 __tcp_transmit_skb+0x2544/0x3b30 net/ipv4/tcp_output.c:1466 tcp_transmit_skb net/ipv4/tcp_output.c:1484 [inline] tcp_mtu_probe net/ipv4/tcp_output.c:2547 [inline] tcp_write_xmit+0x641d/0x6bf0 net/ipv4/tcp_output.c:2752 __tcp_push_pending_frames+0x9b/0x360 net/ipv4/tcp_output.c:3015 tcp_push_pending_frames include/net/tcp.h:2107 [inline] tcp_data_snd_check net/ipv4/tcp_input.c:5714 [inline] tcp_rcv_established+0x1026/0x2020 net/ipv4/tcp_input.c:6239 tcp_v4_do_rcv+0x96d/0xc70 net/ipv4/tcp_ipv4.c:1915 sk_backlog_rcv include/net/sock.h:1113 [inline] __release_sock+0x214/0x350 net/core/sock.c:3072 release_sock+0x61/0x1f0 net/core/sock.c:3626 mptcp_push_release net/mptcp/protocol.c:1486 [inline] __mptcp_push_pending+0x6b5/0x9f0 net/mptcp/protocol.c:1625 mptcp_sendmsg+0x10bb/0x1b10 net/mptcp/protocol.c:1903 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2603 ___sys_sendmsg net/socket.c:2657 [inline] __sys_sendmsg+0x2aa/0x390 net/socket.c:2686 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fb06e9317f9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffe2cfd4f98 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007fb06e97f468 RCX: 00007fb06e9317f9 RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000005 RBP: 00007fb06e97f446 R08: 0000555500000000 R09: 0000555500000000 R10: 0000555500000000 R11: 0000000000000246 R12: 00007fb06e97f406 R13: 0000000000000001 R14: 00007ffe2cfd4fe0 R15: 0000000000000003 Additionally syzkaller provided a nice reproducer. The repro enables pmtu on the loopback device, leading to tcp_mtu_probe() generating very large probe packets. tcp_can_coalesce_send_queue_head() currently does not check for mptcp-level invariants, and allowed the creation of cross-DSS probes, leading to the mentioned corruption. Address the issue teaching tcp_can_coalesce_send_queue_head() about mptcp using the tcp_skb_can_collapse(), also reducing the code duplication. Fixes: 85712484110d ("tcp: coalesce/collapse must respect MPTCP extensions") Cc: stable@vger.kernel.org Reported-by: syzbot+d1bff73460e33101f0e7@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/513 Signed-off-by: Paolo Abeni Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241008-net-mptcp-fallback-fixes-v1-2-c6fb8e93e551@kernel.org Signed-off-by: Jakub Kicinski [ Conflict in tcp_output.c, because the commit 65249feb6b3d ("net: add support for skbs with unreadable frags") is not in this version. This commit is linked to a new feature (Devmem TCP) and introduces a new condition which causes the conflicts. Resolving this is easy: we can ignore the missing new condition, and use tcp_skb_can_collapse() like in the original patch. ] Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 15c49d559db5..328640d9b607 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2314,9 +2314,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || - tcp_has_tx_tstamp(skb) || - !skb_pure_zcopy_same(skb, next)) + if (tcp_has_tx_tstamp(skb) || !tcp_skb_can_collapse(skb, next)) return false; len -= skb->len; From d46b96f0a4892580757d3bbb546d2971dbe83361 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 18 Oct 2024 17:57:37 +0200 Subject: [PATCH 120/125] selftests: mptcp: join: change capture/checksum as bool commit 8c6f6b4bb53a904f922dfb90d566391d3feee32c upstream. To maintain consistency with other scripts, this patch changes vars 'capture' and 'checksum' as bool vars in mptcp_join. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240223-upstream-net-next-20240223-misc-improvements-v1-7-b6c8a10396bd@kernel.org Signed-off-by: Jakub Kicinski Stable-dep-of: 5afca7e996c4 ("selftests: mptcp: join: test for prohibited MPC to port-based endp") Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- .../testing/selftests/net/mptcp/mptcp_join.sh | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 3c286fba8d5d..a21376b0f61d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -30,11 +30,11 @@ iptables="iptables" ip6tables="ip6tables" timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) -capture=0 -checksum=0 +capture=false +checksum=false ip_mptcp=0 check_invert=0 -validate_checksum=0 +validate_checksum=false init=0 evts_ns1="" evts_ns2="" @@ -99,7 +99,7 @@ init_partial() ip netns exec $netns sysctl -q net.mptcp.pm_type=0 2>/dev/null || true ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 - if [ $checksum -eq 1 ]; then + if $checksum; then ip netns exec $netns sysctl -q net.mptcp.checksum_enabled=1 fi done @@ -386,7 +386,7 @@ reset_with_checksum() ip netns exec $ns1 sysctl -q net.mptcp.checksum_enabled=$ns1_enable ip netns exec $ns2 sysctl -q net.mptcp.checksum_enabled=$ns2_enable - validate_checksum=1 + validate_checksum=true } reset_with_allow_join_id0() @@ -419,7 +419,7 @@ reset_with_allow_join_id0() setup_fail_rules() { check_invert=1 - validate_checksum=1 + validate_checksum=true local i="$1" local ip="${2:-4}" local tables @@ -1024,7 +1024,7 @@ do_transfer() :> "$sout" :> "$capout" - if [ $capture -eq 1 ]; then + if $capture; then local capuser if [ -z $SUDO_USER ] ; then capuser="" @@ -1125,7 +1125,7 @@ do_transfer() wait $spid local rets=$? - if [ $capture -eq 1 ]; then + if $capture; then sleep 1 kill $cappid fi @@ -1514,7 +1514,7 @@ chk_join_nr() else print_ok fi - if [ $validate_checksum -eq 1 ]; then + if $validate_checksum; then chk_csum_nr $csum_ns1 $csum_ns2 chk_fail_nr $fail_nr $fail_nr chk_rst_nr $rst_nr $rst_nr @@ -3960,10 +3960,10 @@ while getopts "${all_tests_args}cCih" opt; do tests+=("${all_tests[${opt}]}") ;; c) - capture=1 + capture=true ;; C) - checksum=1 + checksum=true ;; i) ip_mptcp=1 From ec0d0fcbd5d8645fdfb75717d70270f59254d04d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Oct 2024 17:57:38 +0200 Subject: [PATCH 121/125] selftests: mptcp: join: test for prohibited MPC to port-based endp commit 5afca7e996c42aed1b4a42d4712817601ba42aff upstream. Explicitly verify that MPC connection attempts towards a port-based signal endpoint fail with a reset. Note that this new test is a bit different from the other ones, not using 'run_tests'. It is then needed to add the capture capability, and the picking the right port which have been extracted into three new helpers. The info about the capture can also be printed from a single point, which simplifies the exit paths in do_transfer(). The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Cc: stable@vger.kernel.org Co-developed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-2-7faea8e6b6ae@kernel.org Signed-off-by: Jakub Kicinski [ Conflicts in mptcp_join.sh, because commit 0bd962dd86b2 ("selftests: mptcp: join: check CURRESTAB counters"), and commit 9e6a39ecb9a1 ("selftests: mptcp: export TEST_COUNTER variable") are linked to new features, not available in this version. Resolving the conflicts is easy, simply adding the new helpers before do_transfer(), and rename MPTCP_LIB_TEST_COUNTER to TEST_COUNT that was used before. ] Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- .../testing/selftests/net/mptcp/mptcp_join.sh | 117 +++++++++++++----- 1 file changed, 86 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a21376b0f61d..17ace5627ce3 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -23,6 +23,7 @@ tmpfile="" cout="" err="" capout="" +cappid="" ns1="" ns2="" ksft_skip=4 @@ -1006,6 +1007,44 @@ pm_nl_set_endpoint() fi } +cond_start_capture() +{ + local ns="$1" + + :> "$capout" + + if $capture; then + local capuser capfile + if [ -z $SUDO_USER ]; then + capuser="" + else + capuser="-Z $SUDO_USER" + fi + + capfile=$(printf "mp_join-%02u-%s.pcap" "$TEST_COUNT" "$ns") + + echo "Capturing traffic for test $TEST_COUNT into $capfile" + ip netns exec "$ns" tcpdump -i any -s 65535 -B 32768 $capuser -w "$capfile" > "$capout" 2>&1 & + cappid=$! + + sleep 1 + fi +} + +cond_stop_capture() +{ + if $capture; then + sleep 1 + kill $cappid + cat "$capout" + fi +} + +get_port() +{ + echo "$((10000 + TEST_COUNT - 1))" +} + do_transfer() { local listener_ns="$1" @@ -1013,33 +1052,17 @@ do_transfer() local cl_proto="$3" local srv_proto="$4" local connect_addr="$5" + local port - local port=$((10000 + TEST_COUNT - 1)) - local cappid local FAILING_LINKS=${FAILING_LINKS:-""} local fastclose=${fastclose:-""} local speed=${speed:-"fast"} + port=$(get_port) :> "$cout" :> "$sout" - :> "$capout" - if $capture; then - local capuser - if [ -z $SUDO_USER ] ; then - capuser="" - else - capuser="-Z $SUDO_USER" - fi - - capfile=$(printf "mp_join-%02u-%s.pcap" "$TEST_COUNT" "${listener_ns}") - - echo "Capturing traffic for test $TEST_COUNT into $capfile" - ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 & - cappid=$! - - sleep 1 - fi + cond_start_capture ${listener_ns} NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ nstat -n @@ -1125,10 +1148,7 @@ do_transfer() wait $spid local rets=$? - if $capture; then - sleep 1 - kill $cappid - fi + cond_stop_capture NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ nstat | grep Tcp > /tmp/${listener_ns}.out @@ -1144,7 +1164,6 @@ do_transfer() ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" cat /tmp/${connector_ns}.out - cat "$capout" return 1 fi @@ -1161,13 +1180,7 @@ do_transfer() fi rets=$? - if [ $retc -eq 0 ] && [ $rets -eq 0 ];then - cat "$capout" - return 0 - fi - - cat "$capout" - return 1 + [ $retc -eq 0 ] && [ $rets -eq 0 ] } make_file() @@ -2944,6 +2957,32 @@ verify_listener_events() fail_test "$e_type:$type $e_family:$family $e_saddr:$saddr $e_sport:$sport" } +chk_mpc_endp_attempt() +{ + local retl=$1 + local attempts=$2 + + print_check "Connect" + + if [ ${retl} = 124 ]; then + fail_test "timeout on connect" + elif [ ${retl} = 0 ]; then + fail_test "unexpected successful connect" + else + print_ok + + print_check "Attempts" + count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPCapableEndpAttempt") + if [ -z "$count" ]; then + print_skip + elif [ "$count" != "$attempts" ]; then + fail_test "got ${count} MPC attempt[s] on port-based endpoint, expected ${attempts}" + else + print_ok + fi + fi +} + add_addr_ports_tests() { # signal address with port @@ -3034,6 +3073,22 @@ add_addr_ports_tests() chk_join_nr 2 2 2 chk_add_nr 2 2 2 fi + + if reset "port-based signal endpoint must not accept mpc"; then + local port retl count + port=$(get_port) + + cond_start_capture ${ns1} + pm_nl_add_endpoint ${ns1} 10.0.2.1 flags signal port ${port} + mptcp_lib_wait_local_port_listen ${ns1} ${port} + + timeout 1 ip netns exec ${ns2} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s MPTCP 10.0.2.1 >/dev/null 2>&1 + retl=$? + cond_stop_capture + + chk_mpc_endp_attempt ${retl} 1 + fi } syncookies_tests() From 87cb3f6e0c047f9320889e68140f46b1f4ca9eca Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 18 Oct 2024 17:57:39 +0200 Subject: [PATCH 122/125] selftests: mptcp: remove duplicated variables A few week ago, there were some backport issues in MPTCP selftests, because some patches have been applied twice, but with versions handling conflicts differently [1]. Patches fixing these issues have been sent [2] and applied, but it looks like quilt was still confused with the removal of some patches, and commit a417ef47a665 ("selftests: mptcp: join: validate event numbers") duplicated some variables, not present in the original patch [3]. Anyway, Bash was complaining, but that was not causing any tests to fail. Also, that's easy to fix by simply removing the duplicated ones. Link: https://lore.kernel.org/fc21db4a-508d-41db-aa45-e3bc06d18ce7@kernel.org [1] Link: https://lore.kernel.org/20240905144306.1192409-5-matttbe@kernel.org [2] Link: https://lore.kernel.org/20240905144306.1192409-7-matttbe@kernel.org [3] Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 869c8eda4bc3..d98c89f31afe 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -8,17 +8,6 @@ readonly KSFT_SKIP=4 # shellcheck disable=SC2155 # declare and assign separately readonly KSFT_TEST="${MPTCP_LIB_KSFT_TEST:-$(basename "${0}" .sh)}" -# These variables are used in some selftests, read-only -declare -rx MPTCP_LIB_EVENT_ANNOUNCED=6 # MPTCP_EVENT_ANNOUNCED -declare -rx MPTCP_LIB_EVENT_REMOVED=7 # MPTCP_EVENT_REMOVED -declare -rx MPTCP_LIB_EVENT_SUB_ESTABLISHED=10 # MPTCP_EVENT_SUB_ESTABLISHED -declare -rx MPTCP_LIB_EVENT_SUB_CLOSED=11 # MPTCP_EVENT_SUB_CLOSED -declare -rx MPTCP_LIB_EVENT_LISTENER_CREATED=15 # MPTCP_EVENT_LISTENER_CREATED -declare -rx MPTCP_LIB_EVENT_LISTENER_CLOSED=16 # MPTCP_EVENT_LISTENER_CLOSED - -declare -rx MPTCP_LIB_AF_INET=2 -declare -rx MPTCP_LIB_AF_INET6=10 - # These variables are used in some selftests, read-only declare -rx MPTCP_LIB_EVENT_CREATED=1 # MPTCP_EVENT_CREATED declare -rx MPTCP_LIB_EVENT_ESTABLISHED=2 # MPTCP_EVENT_ESTABLISHED From 9698088ac7704e260f492d9c254e29ed7dd8729a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 4 Oct 2024 12:35:31 +0900 Subject: [PATCH 123/125] nilfs2: propagate directory read errors from nilfs_find_entry() commit 08cfa12adf888db98879dbd735bc741360a34168 upstream. Syzbot reported that a task hang occurs in vcs_open() during a fuzzing test for nilfs2. The root cause of this problem is that in nilfs_find_entry(), which searches for directory entries, ignores errors when loading a directory page/folio via nilfs_get_folio() fails. If the filesystem images is corrupted, and the i_size of the directory inode is large, and the directory page/folio is successfully read but fails the sanity check, for example when it is zero-filled, nilfs_check_folio() may continue to spit out error messages in bursts. Fix this issue by propagating the error to the callers when loading a page/folio fails in nilfs_find_entry(). The current interface of nilfs_find_entry() and its callers is outdated and cannot propagate error codes such as -EIO and -ENOMEM returned via nilfs_find_entry(), so fix it together. Link: https://lkml.kernel.org/r/20241004033640.6841-1-konishi.ryusuke@gmail.com Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Signed-off-by: Ryusuke Konishi Reported-by: Lizhi Xu Closes: https://lkml.kernel.org/r/20240927013806.3577931-1-lizhi.xu@windriver.com Reported-by: syzbot+8a192e8d090fa9a31135@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8a192e8d090fa9a31135 Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/dir.c | 50 +++++++++++++++++++++++++---------------------- fs/nilfs2/namei.c | 39 ++++++++++++++++++++++++------------ fs/nilfs2/nilfs.h | 2 +- 3 files changed, 54 insertions(+), 37 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 53e4e63c607e..ddf8e575e489 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -331,6 +331,8 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) * returns the page in which the entry was found, and the entry itself * (as a parameter - res_dir). Page is returned mapped and unlocked. * Entry is guaranteed to be valid. + * + * On failure, returns an error pointer and the caller should ignore res_page. */ struct nilfs_dir_entry * nilfs_find_entry(struct inode *dir, const struct qstr *qstr, @@ -358,22 +360,24 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, do { char *kaddr = nilfs_get_page(dir, n, &page); - if (!IS_ERR(kaddr)) { - de = (struct nilfs_dir_entry *)kaddr; - kaddr += nilfs_last_byte(dir, n) - reclen; - while ((char *) de <= kaddr) { - if (de->rec_len == 0) { - nilfs_error(dir->i_sb, - "zero-length directory entry"); - nilfs_put_page(page); - goto out; - } - if (nilfs_match(namelen, name, de)) - goto found; - de = nilfs_next_entry(de); + if (IS_ERR(kaddr)) + return ERR_CAST(kaddr); + + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; } - nilfs_put_page(page); + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); } + nilfs_put_page(page); + if (++n >= npages) n = 0; /* next page is past the blocks we've got */ @@ -386,7 +390,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, } } while (n != start); out: - return NULL; + return ERR_PTR(-ENOENT); found: *res_page = page; @@ -431,19 +435,19 @@ fail: return NULL; } -ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino) { - ino_t res = 0; struct nilfs_dir_entry *de; struct page *page; de = nilfs_find_entry(dir, qstr, &page); - if (de) { - res = le64_to_cpu(de->inode); - kunmap(page); - put_page(page); - } - return res; + if (IS_ERR(de)) + return PTR_ERR(de); + + *ino = le64_to_cpu(de->inode); + kunmap(page); + put_page(page); + return 0; } /* Releases the page */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 2a4e7f4a8102..9f9b0762ff69 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -55,12 +55,20 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; ino_t ino; + int res; if (dentry->d_name.len > NILFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ino = nilfs_inode_by_name(dir, &dentry->d_name); - inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL; + res = nilfs_inode_by_name(dir, &dentry->d_name, &ino); + if (res) { + if (res != -ENOENT) + return ERR_PTR(res); + inode = NULL; + } else { + inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); + } + return d_splice_alias(inode, dentry); } @@ -263,10 +271,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err; - err = -ENOENT; de = nilfs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (IS_ERR(de)) { + err = PTR_ERR(de); goto out; + } inode = d_inode(dentry); err = -EIO; @@ -361,10 +370,11 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (unlikely(err)) return err; - err = -ENOENT; old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_de) + if (IS_ERR(old_de)) { + err = PTR_ERR(old_de); goto out; + } if (S_ISDIR(old_inode->i_mode)) { err = -EIO; @@ -381,10 +391,12 @@ static int nilfs_rename(struct mnt_idmap *idmap, if (dir_de && !nilfs_empty_dir(new_inode)) goto out_dir; - err = -ENOENT; - new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_de) + new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, + &new_page); + if (IS_ERR(new_de)) { + err = PTR_ERR(new_de); goto out_dir; + } nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); inode_set_ctime_current(new_inode); @@ -438,13 +450,14 @@ out: */ static struct dentry *nilfs_get_parent(struct dentry *child) { - unsigned long ino; + ino_t ino; + int res; struct inode *inode; struct nilfs_root *root; - ino = nilfs_inode_by_name(d_inode(child), &dotdot_name); - if (!ino) - return ERR_PTR(-ENOENT); + res = nilfs_inode_by_name(d_inode(child), &dotdot_name, &ino); + if (res) + return ERR_PTR(res); root = NILFS_I(d_inode(child))->i_root; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 9a157e5051d0..ad13e74af65f 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -233,7 +233,7 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags) /* dir.c */ extern int nilfs_add_link(struct dentry *, struct inode *); -extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); +int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino); extern int nilfs_make_empty(struct inode *, struct inode *); extern struct nilfs_dir_entry * nilfs_find_entry(struct inode *, const struct qstr *, struct page **); From fd6e2af79a947e909c72b09f1ce3175f381067ef Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Wed, 16 Oct 2024 11:07:13 +0300 Subject: [PATCH 124/125] ALSA: hda/conexant - Use cached pin control for Node 0x1d on HP EliteOne 1000 G2 commit 164cd0e077a18d6208523c82b102c98c77fdd51f upstream. The cached version avoids redundant commands to the codec, improving stability and reducing unnecessary operations. This change ensures better power management and reliable restoration of pin configurations, especially after hibernation (S4) and other power transitions. Fixes: 9988844c457f ("ALSA: hda/conexant - Fix audio routing for HP EliteOne 1000 G2") Suggested-by: Kai-Heng Feng Suggested-by: Takashi Iwai Signed-off-by: Vasiliy Kovalev Link: https://patch.msgid.link/20241016080713.46801-1-kovalev@altlinux.org Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_conexant.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 01a2c22fef94..5833623f6ffa 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -334,7 +334,7 @@ static void cxt_fixup_update_pinctl(struct hda_codec *codec, * This is the value stored in the codec register after * the correct initialization of the previous windows boot. */ - snd_hda_set_pin_ctl(codec, 0x1d, AC_PINCTL_HP_EN); + snd_hda_set_pin_ctl_cache(codec, 0x1d, AC_PINCTL_HP_EN); } } From 18916a684a8b836957df88438f9bca590799d04c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 22 Oct 2024 15:46:36 +0200 Subject: [PATCH 125/125] Linux 6.6.58 Link: https://lore.kernel.org/r/20241021102256.706334758@linuxfoundation.org Tested-by: SeongJae Park Tested-by: Florian Fainelli Tested-by: Linux Kernel Functional Testing Tested-by: Harshit Mogalapalli Tested-by: Shuah Khan Tested-by: Mark Brown Tested-by: Takeshi Ogasawara Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5b5e40c4088c..f80e78c7cf20 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 6 -SUBLEVEL = 57 +SUBLEVEL = 58 EXTRAVERSION = NAME = Pinguïn Aangedreven