diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 2ba31b6d68c0..c2389f66d766 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -63,6 +63,62 @@ static int seq_open_net(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_NETNS_MBUF +/* Taken from seq_open_net(); the only difference is that the private + * area comes from vzalloc(), so it must be released with vfree(). + * + * A big private area would waste physically contiguous memory; callers + * that need one use this helper. Returns the area or an ERR_PTR(). + * + * The pde->write check is dropped: users of this open helper should + * supply their own write fops (like proc_simple_write). + */ +void *seq_open_net_large_private(struct inode *inode, struct file *file) +{ + struct net *net; + struct seq_file *seq; + struct seq_net_private *p; + int ret; + + unsigned int state_size = PDE(inode)->state_size; + + WARN_ON_ONCE(state_size < sizeof(struct seq_net_private)); + + net = get_proc_net(inode); + if (!net) { + ret = -ENXIO; + goto out; + } + + /* zero-filled: the opener relies on unset fields reading as 0 */ + p = vzalloc(state_size); + if (!p) { + ret = -ENOMEM; + goto put_out; + } + + ret = seq_open(file, PDE(inode)->seq_ops); + if (ret < 0) + goto free_out; + + seq = file->private_data; + seq->private = p; + +#ifdef CONFIG_NET_NS + p->net = net; +#endif + return p; + +free_out: + vfree(p); +put_out: + put_net(net); +out: + return ERR_PTR(ret); +} +EXPORT_SYMBOL(seq_open_net_large_private); +#endif + static void seq_file_net_put_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS @@ -83,6 +139,31 @@ static int seq_release_net(struct inode *ino, struct file *f) return 0; } +#ifdef CONFIG_NETNS_MBUF +/* Extended variant of proc_create_net_data() that lets the caller supply + * its own proc_ops; everything else is the same. + */ +struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct proc_ops *proc_ops) +{ + struct
proc_dir_entry *p; + + p = proc_create_reg(name, mode, &parent, data); + if (!p) + return NULL; + + pde_force_lookup(p); + p->proc_ops = proc_ops; + p->seq_ops = seq_ops; + p->state_size = state_size; + return proc_register(parent, p); +} +EXPORT_SYMBOL_GPL(proc_create_net_data_ops); +#endif + static const struct proc_ops proc_net_seq_ops = { .proc_open = seq_open_net, .proc_read = seq_read, diff --git a/include/linux/mbuf.h b/include/linux/mbuf.h index 3ea188f2865b..34f8ae04633d 100644 --- a/include/linux/mbuf.h +++ b/include/linux/mbuf.h @@ -49,7 +49,7 @@ struct mbuf_slot { seqlock_t slot_lock; /* rate limit */ struct ratelimit_state ratelimit; - struct cgroup *owner; + void *owner; const struct mbuf_operations *ops; struct mbuf_ring *mring; }; @@ -62,7 +62,7 @@ struct mbuf_operations { u32 (*next)(struct mbuf_ring *mring, u32 idx); /* write message */ - ssize_t (*write)(struct cgroup *cg, const char *fmt, va_list); + ssize_t (*write)(struct mbuf_slot *mbuf, const char *fmt, va_list args); } ____cacheline_aligned; @@ -70,9 +70,13 @@ void __init mbuf_bmap_init(void); void __init setup_mbuf(void); struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg); +struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops); void mbuf_free(struct cgroup *cg); + ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...); void snapshot_mbuf(struct mbuf_slot *, struct mbuf_slot*, seqlock_t *); u32 get_mbuf_slot_len(void); +void mbuf_free_slot(struct mbuf_slot *slot); +void mbuf_reset(struct mbuf_slot *mbuf); #endif #endif diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index de407e7c3b55..f68e43ee29f3 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -161,6 +161,15 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, void arch_report_meminfo(struct seq_file *m); void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task); +#ifdef CONFIG_NETNS_MBUF +void 
*seq_open_net_large_private(struct inode *inode, struct file *file); +struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct seq_operations *seq_ops, + unsigned int state_size, void *data, + const struct proc_ops *proc_ops); +#endif + #else /* CONFIG_PROC_FS */ static inline void proc_root_init(void) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index eb6cd43b1746..5eabd1c0ef78 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -43,7 +43,9 @@ #include #include #include - +#ifdef CONFIG_NETNS_MBUF +#include +#endif struct user_namespace; struct proc_dir_entry; struct net_device; @@ -190,6 +192,9 @@ struct net { #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif +#ifdef CONFIG_NETNS_MBUF + struct net_mbuf mbuf; +#endif } __randomize_layout; #include diff --git a/include/net/netns_mbuf.h b/include/net/netns_mbuf.h new file mode 100644 index 000000000000..6a272949c4a4 --- /dev/null +++ b/include/net/netns_mbuf.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * make mbuf can be used by net namespace + * + * Author: mengensun + * Copyright (C) 2024 Tencent, Inc + */ +#ifndef __NETNS_MBUF +#define __NETNS_MBUF + +#include +#include + +#ifdef CONFIG_NETNS_MBUF +struct net_mbuf { + struct proc_dir_entry *twatcher; + struct proc_dir_entry *log; + struct mbuf_slot *slot; +}; + +int inet_mbuf_init(void); +void inet_mbuf_exit(void); +ssize_t net_mbuf_print(struct net *net, const char *fmt, ...); +#else +static __always_inline int inet_mbuf_init(void) {return 0; } +static __always_inline void inet_mbuf_exit(void) {} +static __always_inline ssize_t net_mbuf_print(struct net *net, const char *fmt, ...) 
{return 0; } +#endif +#endif diff --git a/kernel/cgroup/mbuf.c b/kernel/cgroup/mbuf.c index 1b7c30659ab0..b9f82fb505ae 100644 --- a/kernel/cgroup/mbuf.c +++ b/kernel/cgroup/mbuf.c @@ -252,7 +252,7 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size) } /* Write monitor buffer message */ -static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) +static ssize_t do_mbuf_write(struct mbuf_slot *mbuf, char *buffer, size_t size) { struct mbuf_ring *mring; struct mbuf_ring_desc *desc; @@ -265,13 +265,13 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) return 0; } - mring = cg->mbuf->mring; + mring = mbuf->mring; len = sizeof(struct mbuf_ring_desc) + size; - write_seqlock_irqsave(&cg->mbuf->slot_lock, flags); + write_seqlock_irqsave(&mbuf->slot_lock, flags); if (mbuf_prepare(mring, len)) { - write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags); + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); pr_err("mbuf: Can not find enough space.\n"); return 0; } @@ -290,20 +290,28 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size) mring->next_idx += desc->len; mring->next_seq++; - write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags); + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); return size; } -void mbuf_reset(struct mbuf_ring *mring) +/* Empty the ring of @mbuf. do_mbuf_write() takes slot_lock with irqs + * disabled, so do the same here rather than holding it with irqs on. + */ +void mbuf_reset(struct mbuf_slot *mbuf) { - mring->first_idx = mring->base_idx; - mring->first_seq = 0; - mring->next_idx = mring->base_idx; - mring->next_seq = 0; + unsigned long flags; + + write_seqlock_irqsave(&mbuf->slot_lock, flags); + mbuf->mring->first_idx = mbuf->mring->base_idx; + mbuf->mring->first_seq = 0; + mbuf->mring->next_idx = mbuf->mring->base_idx; + mbuf->mring->next_seq = 0; + write_sequnlock_irqrestore(&mbuf->slot_lock, flags); } +EXPORT_SYMBOL(mbuf_reset); -static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) +static ssize_t mbuf_write(struct mbuf_slot *mbuf, const char *fmt, va_list args) { static char buf[MBUF_MSG_LEN_MAX]; char *text = buf; @@ -313,7 +321,7
@@ static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args) t_len = vscnprintf(text, sizeof(buf), fmt, args); /* Write string to mbuf */ - ret = do_mbuf_write(cg, text, t_len); + ret = do_mbuf_write(mbuf, text, t_len); return ret; } @@ -335,11 +338,17 @@ static int get_next_mbuf_id(unsigned long *addr, u32 start) return index; } -static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index) +static void mbuf_slot_init(struct mbuf_slot *mb, + void *owner, u32 index, struct mbuf_operations *ops) { - mb->owner = cg; + mb->owner = owner; mb->idx = index; - mb->ops = &mbuf_ops; + + if (!ops) + mb->ops = &mbuf_ops; + else + mb->ops = ops; + seqlock_init(&mb->slot_lock); ratelimit_state_init(&mb->ratelimit, 5 * HZ, 50); @@ -349,10 +358,10 @@ static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index) + sizeof(struct mbuf_ring); mb->mring->end_idx = (index + 1) * g_mbuf.mbuf_size_per_cg - 1; - mbuf_reset(mb->mring); + mbuf_reset(mb); } -struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops) { struct mbuf_slot *mb; u32 index = 0; @@ -401,26 +410,38 @@ again: g_mbuf.mbuf_next_id = index; mb = (struct mbuf_slot *)(g_mbuf.mbuf + index * g_mbuf.mbuf_size_per_cg); - mbuf_slot_init(mb, cg, index); + mbuf_slot_init(mb, owner, index, ops); g_mbuf.mbuf_frees--; spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); return mb; } +EXPORT_SYMBOL(mbuf_slot_alloc_v2); -void mbuf_free(struct cgroup *cg) +struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg) +{ + return mbuf_slot_alloc_v2((void *)cg, NULL); +} +EXPORT_SYMBOL(mbuf_slot_alloc); + +void mbuf_free_slot(struct mbuf_slot *slot) { unsigned long flags; spin_lock_irqsave(&g_mbuf.mbuf_lock, flags); - /* Make current idx the next available buffer */ - g_mbuf.mbuf_next_id = cg->mbuf->idx; + g_mbuf.mbuf_next_id = slot->idx; __clear_bit(g_mbuf.mbuf_next_id, g_mbuf.mbuf_bitmap); - g_mbuf.mbuf_frees++; 
spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags); + +} +EXPORT_SYMBOL(mbuf_free_slot); + +void mbuf_free(struct cgroup *cg) +{ + mbuf_free_slot(cg->mbuf); } static u32 rd_mbuf_next(struct mbuf_ring *mring, u32 curr_idx) diff --git a/net/Kconfig b/net/Kconfig index 29e46de3a306..6fa566507e60 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -124,6 +124,16 @@ source "net/mptcp/Kconfig" endif # if INET +config NETNS_MBUF + bool "attach a mbuf to net namespace" + default y + depends on RQM && INET && PROC_FS + help + This allows attaching an mbuf to each net namespace. An mbuf is a + ring buffer for log messages that logs can be printed to. + + If you are unsure how to answer this question, answer N. + config NETWORK_SECMARK bool "Security Marking" help diff --git a/net/core/Makefile b/net/core/Makefile index 731db2eaa610..a029c9a8d410 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o +obj-$(CONFIG_NETNS_MBUF) += netns_mbuf.o diff --git a/net/core/netns_mbuf.c b/net/core/netns_mbuf.c new file mode 100644 index 000000000000..1e6bf7e74f18 --- /dev/null +++ b/net/core/netns_mbuf.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Make mbuf usable by net namespaces + * + * Author: mengensun + * Author: yuehongwu + * Copyright (C) 2024 Tencent, Inc + */ +#include +#include +#include + +#include +#include + +struct mbuf_seq_data { + struct seq_net_private snp; + struct mbuf_user_desc udesc; + struct mbuf_slot snapshot[]; +}; + +static inline struct mbuf_slot *get_net_mbuf(struct net *net) +{ + return net->mbuf.slot; +} + +/* Not controlled by sysctl_qos_mbuf_enable, because a later patch + * will add a /proc/net/ipv4/netlat/enable switch + */ +ssize_t net_mbuf_print(struct net *net, const char *fmt, ...)
+{ + va_list args; + struct mbuf_slot *slot; + + slot = net->mbuf.slot; + if (!slot || !__ratelimit(&slot->ratelimit)) + goto out; + + va_start(args, fmt); + slot->ops->write(slot, fmt, args); + va_end(args); +out: + return 0; +} +EXPORT_SYMBOL(net_mbuf_print); + +/* udesc is the user-side descriptor used to read data out of an mbuf. + * It lives in the seq_file private data, so every opener gets its own + * instead of one being allocated and bound to the mbuf on access. + * + * NOTE(review): start() also rewinds to first_idx when *pos is non-zero + * but not next_idx - confirm restarting after a partial read is safe. + */ +static void *netns_mbuf_start(struct seq_file *s, loff_t *pos) +{ + u32 index; + struct mbuf_user_desc *udesc; + struct mbuf_seq_data *pd; + + pd = s->private; + udesc = &pd->udesc; + index = *pos; + + /* no snapshot was taken at open: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return NULL; + + /* Already at the end of the snapshot, nothing more to show */ + if (index && index == pd->snapshot->mring->next_idx) + return NULL; + + udesc->user_idx = pd->snapshot->mring->first_idx; + udesc->user_seq = pd->snapshot->mring->first_seq; + + /* first_idx == next_idx: there is nothing to read */ + if (udesc->user_idx == pd->snapshot->mring->next_idx) + return NULL; + return udesc; +} + +static void *netns_mbuf_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct mbuf_seq_data *pd; + struct mbuf_user_desc *udesc = v; + + pd = s->private; + + /* no snapshot was taken at open: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return NULL; + + udesc->user_idx = pd->snapshot->ops->next(pd->snapshot->mring, + udesc->user_idx); + *pos = udesc->user_idx; + if (udesc->user_idx == pd->snapshot->mring->next_idx) + return NULL; + + return udesc; +} + +static void netns_mbuf_stop(struct seq_file *s, void *v) { } + +static int netns_mbuf_show(struct seq_file *s, void *v) +{ + ssize_t ret; + struct mbuf_seq_data *pd; + struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v; + + pd = s->private; + + /* no snapshot was taken at open: see seq_mbuf_open */ + if (!pd->snapshot->mring) + return 0; + + memset(udesc->buf, 0,
sizeof(udesc->buf)); + ret = pd->snapshot->ops->read(pd->snapshot, udesc); + if (ret > 0) + seq_puts(s, udesc->buf); + return 0; +} + +static int seq_mbuf_open(struct inode *inode, struct file *file) +{ + struct mbuf_seq_data *p; + struct mbuf_slot *mbuf; + + p = seq_open_net_large_private(inode, file); + + if (IS_ERR(p)) + return PTR_ERR(p); + + mbuf = get_net_mbuf(seq_file_net(file->private_data)); + /* The netns may have no mbuf attached because the mbuf + * pool is limited in size. The open still succeeds and + * the private area stays zeroed, so the seq_ops must + * check the snapshot's mring pointer. + * + * The net comes from seq_file_net() because ->snp.net + * only exists under CONFIG_NET_NS. + */ + if (!mbuf) + return 0; + + snapshot_mbuf(p->snapshot, mbuf, &mbuf->slot_lock); + return 0; +} + +/* This function is taken from seq_release_net(); it is the + * same except that **vfree** is used to free the private area. + */ +static int seq_mbuf_release(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + + put_net(seq_file_net(seq)); + vfree(seq->private); + seq->private = NULL; + seq_release(ino, f); + return 0; +} + +/* Any write clears the log; the written data itself is ignored */ +static ssize_t seq_mbuf_write(struct file *f, const char __user *ubuf, + size_t size, loff_t *_pos) +{ + struct seq_file *seq = f->private_data; + struct mbuf_slot *mb; + + /* use seq_file_net(): ->snp.net only exists under + * CONFIG_NET_NS + */ + mb = get_net_mbuf(seq_file_net(seq)); + /* the netns has no mbuf attached */ + if (!mb) + return size; + + mbuf_reset(mb); + return size; +} + +/* seq_read holds a mutex while calling those functions, but + * the mutex is bound to the struct file, not to the inode. + * It therefore only serializes access to the mbuf among tasks + * that share the same file object (e.g. the threads of one + * process). + * + * Separate processes accessing the mbuf are not serialized + * against each other.
+ */ +static const struct seq_operations mbuf_seq_ops = { + .show = netns_mbuf_show, + .start = netns_mbuf_start, + .next = netns_mbuf_next, + .stop = netns_mbuf_stop, +}; + +static const struct proc_ops mbuf_seq_fops = { + .proc_open = seq_mbuf_open, + .proc_read = seq_read, + .proc_write = seq_mbuf_write, + .proc_lseek = seq_lseek, + .proc_release = seq_mbuf_release, +}; + +static int __net_init net_mbuf_init(struct net *net) +{ + int ret = 0; + + /* A failed mbuf allocation must not fail netns creation: + * + * returning an error here would cap the number of netns + * that can be created on the system. + * + * btw: there are at most 1024 mbuf_slots for now; once they + * are all in use further allocations fail. In that case the + * user interface is kept unchanged and netlat simply + * `says nothing`. + * cgroup is used for kabi + */ + net->mbuf.slot = mbuf_slot_alloc_v2((void *)net, NULL); + if (!net->mbuf.slot) + pr_err("fail alloc mbuf\n"); + + net->mbuf.twatcher = proc_net_mkdir(net, "twatcher", net->proc_net); + if (!net->mbuf.twatcher) { + ret = -ENOMEM; + goto free_mbuf; + } + + net->mbuf.log = proc_create_net_data_ops("log", S_IFREG | 0644, + net->mbuf.twatcher, + &mbuf_seq_ops, + sizeof(struct mbuf_seq_data) + get_mbuf_slot_len(), + NULL, &mbuf_seq_fops); + if (!net->mbuf.log) { + ret = -ENOMEM; + goto remove_watcher; + } + return ret; + +remove_watcher: + remove_proc_entry("twatcher", net->proc_net); + +free_mbuf: + if (net->mbuf.slot) + mbuf_free_slot(net->mbuf.slot); + return ret; +} + +static void __net_exit net_mbuf_exit(struct net *net) +{ + /* remove_proc_entry() takes the entry name and its parent dir */ + remove_proc_entry("log", net->mbuf.twatcher); + remove_proc_entry("twatcher", net->proc_net); + + /* nothing to free if the slot allocation failed at init */ + if (net->mbuf.slot) + mbuf_free_slot(net->mbuf.slot); +} + +static struct pernet_operations net_mbuf_ops = { + .init = net_mbuf_init, + .exit = net_mbuf_exit, +}; + +int inet_mbuf_init(void) +{ + return register_pernet_subsys(&net_mbuf_ops); +}
+EXPORT_SYMBOL(inet_mbuf_init); + +void inet_mbuf_exit(void) +{ + unregister_pernet_subsys(&net_mbuf_ops); +} +EXPORT_SYMBOL(inet_mbuf_exit); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index edc9c27cf8d8..c687c5859f57 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -114,6 +114,7 @@ #include #include #include +#include #ifdef CONFIG_IP_MROUTE #include #endif @@ -2102,11 +2103,15 @@ static int __init ipv4_proc_init(void) goto out_udp; if (ping_proc_init()) goto out_ping; + if (inet_mbuf_init()) + goto out_mbuf; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: + inet_mbuf_exit(); +out_mbuf: ping_proc_exit(); out_ping: udp4_proc_exit();