netns/mbuf: add a per net namespace ring buffer

Add a per-netns log ring buffer. Its read-side
interface is:
/proc/net/twatcher/log

This is a backport from tk3, with some code clean-up.

Reviewed-by: kernelxing <kernelxing@tencent.com>
Signed-off-by: MengEn Sun <mengensun@tencent.com>
This commit is contained in:
MengEn Sun 2024-05-11 16:00:24 +08:00 committed by mengensun
parent 713a7bc81c
commit f660cd0791
10 changed files with 465 additions and 25 deletions

View File

@ -63,6 +63,62 @@ static int seq_open_net(struct inode *inode, struct file *file)
return 0;
}
#ifdef CONFIG_NETNS_MBUF
/* token from seq_open_net, all is same except the private is
* alloc by vmalloc, why?
*
* sameone may need a big private, wasting continuous phy mem
* they can use this function to use vmalloc private
*
* from now if you using this open abi place write a write
* fops like proc_simple_write we delete the pde->write check
*/
void *seq_open_net_large_private(struct inode *inode, struct file *file)
{
struct net *net;
struct seq_file *seq;
struct seq_net_private *p;
int ret;
unsigned int state_size = PDE(inode)->state_size;
WARN_ON_ONCE(state_size < sizeof(struct seq_net_private));
net = get_proc_net(inode);
if (!net) {
ret = -ENXIO;
goto out;
}
p = vmalloc(state_size);
if (!p) {
ret = -ENOMEM;
goto put_out;
}
memset(p, 0, state_size);
ret = seq_open(file, PDE(inode)->seq_ops);
if (ret < 0)
goto free_out;
seq = file->private_data;
seq->private = (void *)p;
#ifdef CONFIG_NET_NS
p->net = net;
#endif
return p;
free_out:
vfree(p);
put_out:
put_net(net);
out:
return ERR_PTR(ret);
}
EXPORT_SYMBOL(seq_open_net_large_private);
#endif
static void seq_file_net_put_net(struct seq_file *seq)
{
#ifdef CONFIG_NET_NS
@ -83,6 +139,31 @@ static int seq_release_net(struct inode *ino, struct file *f)
return 0;
}
#ifdef CONFIG_NETNS_MBUF
/**
 * proc_create_net_data_ops - create a per-netns proc entry with caller
 *			      supplied proc_ops
 * @name: name of the entry
 * @mode: file mode of the entry
 * @parent: directory the entry is created in
 * @seq_ops: seq_file iterator stored in the entry
 * @state_size: size of the seq_file private area stored in the entry
 * @data: private data handed to proc_create_reg()
 * @proc_ops: file operations installed on the entry
 *
 * Extension of the proc_create_net_data() idea: the only addition is the
 * extra @proc_ops argument, which lets the caller define the file
 * operations itself.
 *
 * Return: the value of proc_register(), or NULL when proc_create_reg()
 * fails.
 */
struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode,
						struct proc_dir_entry *parent,
						const struct seq_operations *seq_ops,
						unsigned int state_size, void *data,
						const struct proc_ops *proc_ops)
{
	struct proc_dir_entry *pde = proc_create_reg(name, mode, &parent, data);

	if (pde) {
		pde_force_lookup(pde);
		pde->proc_ops = proc_ops;
		pde->seq_ops = seq_ops;
		pde->state_size = state_size;
		pde = proc_register(parent, pde);
	}

	return pde;
}
EXPORT_SYMBOL_GPL(proc_create_net_data_ops);
#endif
static const struct proc_ops proc_net_seq_ops = {
.proc_open = seq_open_net,
.proc_read = seq_read,

View File

@ -49,7 +49,7 @@ struct mbuf_slot {
seqlock_t slot_lock;
/* rate limit */
struct ratelimit_state ratelimit;
struct cgroup *owner;
void *owner;
const struct mbuf_operations *ops;
struct mbuf_ring *mring;
};
@ -62,7 +62,7 @@ struct mbuf_operations {
u32 (*next)(struct mbuf_ring *mring, u32 idx);
/* write message */
ssize_t (*write)(struct cgroup *cg, const char *fmt, va_list);
ssize_t (*write)(struct mbuf_slot *mbuf, const char *fmt, va_list args);
} ____cacheline_aligned;
@ -70,9 +70,13 @@ void __init mbuf_bmap_init(void);
void __init setup_mbuf(void);
struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg);
struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops);
void mbuf_free(struct cgroup *cg);
ssize_t mbuf_print(struct cgroup *cgrp, const char *fmt, ...);
void snapshot_mbuf(struct mbuf_slot *, struct mbuf_slot*, seqlock_t *);
u32 get_mbuf_slot_len(void);
void mbuf_free_slot(struct mbuf_slot *slot);
void mbuf_reset(struct mbuf_slot *mbuf);
#endif
#endif

View File

@ -161,6 +161,15 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
void arch_report_meminfo(struct seq_file *m);
void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task);
#ifdef CONFIG_NETNS_MBUF
void *seq_open_net_large_private(struct inode *inode, struct file *file);
struct proc_dir_entry *proc_create_net_data_ops(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct seq_operations *seq_ops,
unsigned int state_size, void *data,
const struct proc_ops *proc_ops);
#endif
#else /* CONFIG_PROC_FS */
static inline void proc_root_init(void)

View File

@ -43,7 +43,9 @@
#include <linux/skbuff.h>
#include <linux/notifier.h>
#include <linux/xarray.h>
#ifdef CONFIG_NETNS_MBUF
#include <net/netns_mbuf.h>
#endif
struct user_namespace;
struct proc_dir_entry;
struct net_device;
@ -190,6 +192,9 @@ struct net {
#if IS_ENABLED(CONFIG_SMC)
struct netns_smc smc;
#endif
#ifdef CONFIG_NETNS_MBUF
struct net_mbuf mbuf;
#endif
} __randomize_layout;
#include <linux/seq_file_net.h>

29
include/net/netns_mbuf.h Normal file
View File

@ -0,0 +1,29 @@
/* SPDX-License-Identifier: GPL-2.0-only
*
* make mbuf can be used by net namespace
*
* Author: mengensun <mengensun@tencent.com>
* Copyright (C) 2024 Tencent, Inc
*/
#ifndef __NETNS_MBUF
#define __NETNS_MBUF
#include<linux/proc_fs.h>
#include<linux/mbuf.h>
#ifdef CONFIG_NETNS_MBUF
struct net_mbuf {
struct proc_dir_entry *twatcher;
struct proc_dir_entry *log;
struct mbuf_slot *slot;
};
int inet_mbuf_init(void);
void inet_mbuf_exit(void);
ssize_t net_mbuf_print(struct net *net, const char *fmt, ...);
#else
/* Stubs used when the per-netns mbuf is not configured in */

/* Nothing to register; always succeeds */
static inline int inet_mbuf_init(void)
{
	return 0;
}

/* Nothing to unregister */
static inline void inet_mbuf_exit(void)
{
}

/* Drops the message and reports that nothing was written */
static inline ssize_t net_mbuf_print(struct net *net, const char *fmt, ...)
{
	return 0;
}
#endif
#endif

View File

@ -252,7 +252,7 @@ static int mbuf_prepare(struct mbuf_ring *mring, u32 msg_size)
}
/* Write monitor buffer message */
static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size)
static ssize_t do_mbuf_write(struct mbuf_slot *mbuf, char *buffer, size_t size)
{
struct mbuf_ring *mring;
struct mbuf_ring_desc *desc;
@ -265,13 +265,13 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size)
return 0;
}
mring = cg->mbuf->mring;
mring = mbuf->mring;
len = sizeof(struct mbuf_ring_desc) + size;
write_seqlock_irqsave(&cg->mbuf->slot_lock, flags);
write_seqlock_irqsave(&mbuf->slot_lock, flags);
if (mbuf_prepare(mring, len)) {
write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags);
write_sequnlock_irqrestore(&mbuf->slot_lock, flags);
pr_err("mbuf: Can not find enough space.\n");
return 0;
}
@ -290,20 +290,23 @@ static ssize_t do_mbuf_write(struct cgroup *cg, char *buffer, size_t size)
mring->next_idx += desc->len;
mring->next_seq++;
write_sequnlock_irqrestore(&cg->mbuf->slot_lock, flags);
write_sequnlock_irqrestore(&mbuf->slot_lock, flags);
return size;
}
void mbuf_reset(struct mbuf_ring *mring)
void mbuf_reset(struct mbuf_slot *mbuf)
{
mring->first_idx = mring->base_idx;
mring->first_seq = 0;
mring->next_idx = mring->base_idx;
mring->next_seq = 0;
write_seqlock(&mbuf->slot_lock);
mbuf->mring->first_idx = mbuf->mring->base_idx;
mbuf->mring->first_seq = 0;
mbuf->mring->next_idx = mbuf->mring->base_idx;
mbuf->mring->next_seq = 0;
write_sequnlock(&mbuf->slot_lock);
}
EXPORT_SYMBOL(mbuf_reset);
static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args)
static ssize_t mbuf_write(struct mbuf_slot *mbuf, const char *fmt, va_list args)
{
static char buf[MBUF_MSG_LEN_MAX];
char *text = buf;
@ -313,7 +316,7 @@ static ssize_t mbuf_write(struct cgroup *cg, const char *fmt, va_list args)
t_len = vscnprintf(text, sizeof(buf), fmt, args);
/* Write string to mbuf */
ret = do_mbuf_write(cg, text, t_len);
ret = do_mbuf_write(mbuf, text, t_len);
return ret;
}
@ -335,11 +338,17 @@ static int get_next_mbuf_id(unsigned long *addr, u32 start)
return index;
}
static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index)
static void mbuf_slot_init(struct mbuf_slot *mb,
void *owner, u32 index, struct mbuf_operations *ops)
{
mb->owner = cg;
mb->owner = owner;
mb->idx = index;
if (!ops)
mb->ops = &mbuf_ops;
else
mb->ops = ops;
seqlock_init(&mb->slot_lock);
ratelimit_state_init(&mb->ratelimit, 5 * HZ, 50);
@ -349,10 +358,10 @@ static void mbuf_slot_init(struct mbuf_slot *mb, struct cgroup *cg, u32 index)
+ sizeof(struct mbuf_ring);
mb->mring->end_idx = (index + 1) * g_mbuf.mbuf_size_per_cg - 1;
mbuf_reset(mb->mring);
mbuf_reset(mb);
}
struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg)
struct mbuf_slot *mbuf_slot_alloc_v2(void *owner, struct mbuf_operations *ops)
{
struct mbuf_slot *mb;
u32 index = 0;
@ -401,26 +410,38 @@ again:
g_mbuf.mbuf_next_id = index;
mb = (struct mbuf_slot *)(g_mbuf.mbuf + index * g_mbuf.mbuf_size_per_cg);
mbuf_slot_init(mb, cg, index);
mbuf_slot_init(mb, owner, index, ops);
g_mbuf.mbuf_frees--;
spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags);
return mb;
}
EXPORT_SYMBOL(mbuf_slot_alloc_v2);
void mbuf_free(struct cgroup *cg)
/* cgroup flavoured entry point: the cgroup becomes the slot owner and a
 * NULL ops pointer is passed on to mbuf_slot_alloc_v2()
 */
struct mbuf_slot *mbuf_slot_alloc(struct cgroup *cg)
{
	void *owner = cg;

	return mbuf_slot_alloc_v2(owner, NULL);
}
EXPORT_SYMBOL(mbuf_slot_alloc);
void mbuf_free_slot(struct mbuf_slot *slot)
{
unsigned long flags;
spin_lock_irqsave(&g_mbuf.mbuf_lock, flags);
/* Make current idx the next available buffer */
g_mbuf.mbuf_next_id = cg->mbuf->idx;
g_mbuf.mbuf_next_id = slot->idx;
__clear_bit(g_mbuf.mbuf_next_id, g_mbuf.mbuf_bitmap);
g_mbuf.mbuf_frees++;
spin_unlock_irqrestore(&g_mbuf.mbuf_lock, flags);
}
EXPORT_SYMBOL(mbuf_free_slot);
/* cgroup flavoured wrapper: releases the slot referenced by cg->mbuf */
void mbuf_free(struct cgroup *cg)
{
mbuf_free_slot(cg->mbuf);
}
static u32 rd_mbuf_next(struct mbuf_ring *mring, u32 curr_idx)

View File

@ -124,6 +124,16 @@ source "net/mptcp/Kconfig"
endif # if INET
config NETNS_MBUF
bool "attach a mbuf to net namespace"
default y
depends on RQM && INET && PROC_FS
help
This attaches an mbuf to each net namespace. An mbuf is a log ring
buffer that kernel code can print log messages into.
if you are unsure how to answer this question, answer N.
config NETWORK_SECMARK
bool "Security Marking"
help

View File

@ -40,3 +40,4 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
obj-$(CONFIG_OF) += of_net.o
obj-$(CONFIG_NETNS_MBUF) += netns_mbuf.o

275
net/core/netns_mbuf.c Normal file
View File

@ -0,0 +1,275 @@
// SPDX-License-Identifier: GPL-2.0-only
/* make mbuf can be used by net namespace
*
* Author: mengensun <mengensun@tencent.com>
* Author: yuehongwu <yuehongwu@tencent.com>
* Copyright (C) 2024 Tencent, Inc
*/
#include<linux/cgroup.h>
#include<linux/mbuf.h>
#include<linux/proc_fs.h>
#include<net/net_namespace.h>
#include<net/netns/generic.h>
/* seq_file private area of one open "log" file */
struct mbuf_seq_data {
/* kept first: seq_open_net_large_private() fills the area in through a
 * struct seq_net_private pointer
 */
struct seq_net_private snp;
/* reader cursor handed to the seq_file callbacks */
struct mbuf_user_desc udesc;
/* copy of the namespace's mbuf taken at open time; the room for it is
 * get_mbuf_slot_len() bytes added to the state size at registration
 */
struct mbuf_slot snapshot[];
};
static inline struct mbuf_slot *get_net_mbuf(struct net *net)
{
return net->mbuf.slot;
}
/**
 * net_mbuf_print - printf-style logging into the mbuf of a net namespace
 * @net: namespace whose mbuf receives the message
 * @fmt: printf-style format string, followed by its arguments
 *
 * The message is dropped when the namespace has no slot or when the slot's
 * rate limit is exceeded.
 *
 * Not controlled by sysctl_qos_mbuf_enable, because a later patch is
 * expected to add /proc/net/ipv4/netlat/enable.
 *
 * Return: the value reported by the slot's write operation, or 0 when the
 * message was dropped.
 */
ssize_t net_mbuf_print(struct net *net, const char *fmt, ...)
{
	struct mbuf_slot *slot = net->mbuf.slot;
	ssize_t written;
	va_list args;

	if (!slot || !__ratelimit(&slot->ratelimit))
		return 0;

	va_start(args, fmt);
	written = slot->ops->write(slot, fmt, args);
	va_end(args);

	return written;
}
EXPORT_SYMBOL(net_mbuf_print);
/* The udesc is the reader-side handle used to pull data out of an mbuf.
 * It is kept in the seq_file private data, so every opener owns a udesc
 * and none has to be allocated and bound to the mbuf on each access.
 */

/* seq_file ->start: position the cursor on the oldest record of the
 * snapshot, or return NULL when there is nothing (more) to show.
 *
 * NOTE(review): the cursor is rewound to first_idx on every call no matter
 * what *pos holds; confirm that output spanning several read() calls is
 * not repeated.
 */
static void *netns_mbuf_start(struct seq_file *s, loff_t *pos)
{
	struct mbuf_seq_data *pd = s->private;
	struct mbuf_ring *ring = pd->snapshot->mring;
	struct mbuf_user_desc *cursor = &pd->udesc;
	u32 index = *pos;

	/* seq_mbuf_open() leaves the zeroed snapshot untouched when the
	 * namespace has no mbuf, hence the NULL ring
	 */
	if (!ring)
		return NULL;

	/* The previous pass already reached the end */
	if (index && index == ring->next_idx)
		return NULL;

	cursor->user_idx = ring->first_idx;
	cursor->user_seq = ring->first_seq;

	/* End reached, or the ring is empty */
	if (cursor->user_idx == ring->next_idx)
		return NULL;

	return cursor;
}
/* seq_file ->next: advance the cursor with the snapshot's next() op and
 * publish the new index through *pos; NULL once next_idx is reached.
 */
static void *netns_mbuf_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct mbuf_seq_data *pd = s->private;
	struct mbuf_slot *snap = pd->snapshot;
	struct mbuf_user_desc *cursor = v;

	/* No mbuf was attached when the file was opened, see seq_mbuf_open() */
	if (!snap->mring)
		return NULL;

	cursor->user_idx = snap->ops->next(snap->mring, cursor->user_idx);
	*pos = cursor->user_idx;

	return cursor->user_idx == snap->mring->next_idx ? NULL : cursor;
}
/* seq_file ->stop: nothing to release */
static void netns_mbuf_stop(struct seq_file *s, void *v)
{
}
static int netns_mbuf_show(struct seq_file *s, void *v)
{
ssize_t ret;
struct mbuf_seq_data *pd;
struct mbuf_user_desc *udesc = (struct mbuf_user_desc *)v;
pd = s->private;
/* why: see seq_mbuf_open */
if (!pd->snapshot->mring)
return 0;
memset(udesc->buf, 0, sizeof(udesc->buf));
ret = pd->snapshot->ops->read(pd->snapshot, udesc);
if (ret > 0)
seq_printf(s, "%s", udesc->buf);
return 0;
}
/* ->proc_open: allocate the private area and copy the namespace's mbuf
 * into its snapshot.
 *
 * A namespace may have no mbuf because the mbuf pool is limited. The open
 * still succeeds in that case; the snapshot stays as zeroed by
 * seq_open_net_large_private(), which is why every seq_ops callback checks
 * the snapshot's mring pointer first.
 */
static int seq_mbuf_open(struct inode *inode, struct file *file)
{
	struct mbuf_seq_data *priv = seq_open_net_large_private(inode, file);
	struct mbuf_slot *live;

	if (IS_ERR(priv))
		return PTR_ERR(priv);

	live = get_net_mbuf(priv->snp.net);
	if (live)
		snapshot_mbuf(priv->snapshot, live, &live->slot_lock);

	return 0;
}
/* ->proc_release: modelled on seq_release_net(); the difference is that
 * the private area is released with vfree(), matching its vmalloc
 * allocation at open time.
 */
static int seq_mbuf_release(struct inode *ino, struct file *f)
{
	struct seq_file *seq = f->private_data;
	void *priv = seq->private;

	/* Drop the namespace reference while seq->private is still set */
	put_net(seq_file_net(seq));

	seq->private = NULL;
	vfree(priv);

	seq_release(ino, f);
	return 0;
}
/* ->proc_write: any write clears the log of the namespace.
 *
 * The written bytes are never looked at. The live mbuf of the namespace is
 * reset; the snapshot held by this open file is left alone. When the
 * namespace has no mbuf the write is a no-op.
 *
 * NOTE(review): mbuf_reset() takes slot_lock with plain write_seqlock()
 * while do_mbuf_write() uses the irqsave variant on the same lock; confirm
 * that no writer can run in interrupt context on this CPU during a reset.
 *
 * Return: @size, i.e. the whole buffer is reported as consumed.
 */
static ssize_t seq_mbuf_write(struct file *f, const char __user *ubuf,
			      size_t size, loff_t *_pos)
{
	struct seq_file *seq = f->private_data;
	struct mbuf_seq_data *p = seq->private;
	struct mbuf_slot *mb = get_net_mbuf(p->snp.net);

	if (mb)
		mbuf_reset(mb);

	return size;
}
/* seq_read() holds a mutex while it calls these callbacks. That mutex is
 * bound to the struct file, not to the inode, so it serialises only tasks
 * sharing one file object (e.g. the threads of a process).
 *
 * Several processes that each opened the file are not serialised against
 * each other.
 */
static const struct seq_operations mbuf_seq_ops = {
	.start	= netns_mbuf_start,
	.next	= netns_mbuf_next,
	.stop	= netns_mbuf_stop,
	.show	= netns_mbuf_show,
};
/* File operations of the "log" entry: seq_file based reads, custom
 * open/release for the vmalloc'ed private area, write clears the log
 */
static const struct proc_ops mbuf_seq_fops = {
	.proc_open	= seq_mbuf_open,
	.proc_release	= seq_mbuf_release,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_write	= seq_mbuf_write,
};
/* Per-netns init: allocate the namespace's mbuf slot and create
 * twatcher/log under the namespace's proc_net directory.
 *
 * Return: 0 on success, -ENOMEM when a proc entry cannot be created.
 */
static int __net_init net_mbuf_init(struct net *net)
{
	/* A failed slot allocation does not fail namespace creation:
	 * returning an error here would cap the number of namespaces at the
	 * number of mbuf slots (the original author notes a maximum of 1024
	 * for now). The user interface is still created and the log simply
	 * stays empty.
	 */
	net->mbuf.slot = mbuf_slot_alloc_v2(net, NULL);
	if (!net->mbuf.slot)
		pr_err("fail alloc mbuf\n");

	net->mbuf.twatcher = proc_net_mkdir(net, "twatcher", net->proc_net);
	if (!net->mbuf.twatcher)
		goto free_mbuf;

	/* The private area holds the seq data followed by a slot snapshot */
	net->mbuf.log = proc_create_net_data_ops("log", S_IFREG | 0644,
			net->mbuf.twatcher, &mbuf_seq_ops,
			sizeof(struct mbuf_seq_data) + get_mbuf_slot_len(),
			NULL, &mbuf_seq_fops);
	if (!net->mbuf.log)
		goto remove_watcher;

	return 0;

remove_watcher:
	remove_proc_entry("twatcher", net->proc_net);
	net->mbuf.twatcher = NULL;
free_mbuf:
	if (net->mbuf.slot) {
		mbuf_free_slot(net->mbuf.slot);
		/* do not leave a pointer to a slot that may be handed out again */
		net->mbuf.slot = NULL;
	}
	return -ENOMEM;
}
/* Per-netns exit: remove twatcher/log and give the mbuf slot back.
 *
 * remove_proc_entry() takes the parent directory as its second argument:
 * "log" lives in the twatcher directory and "twatcher" lives in the
 * namespace's proc_net, matching how net_mbuf_init() created them.
 */
static void __net_exit net_mbuf_exit(struct net *net)
{
	struct mbuf_slot *slot = net->mbuf.slot;

	remove_proc_entry("log", net->mbuf.twatcher);
	remove_proc_entry("twatcher", net->proc_net);
	net->mbuf.log = NULL;
	net->mbuf.twatcher = NULL;

	/* The slot allocation may have failed at init; nothing to free then */
	if (!slot)
		return;

	net->mbuf.slot = NULL;
	mbuf_free_slot(slot);
}
/* Per-netns hooks, registered by inet_mbuf_init() */
static struct pernet_operations net_mbuf_ops = {
.init = net_mbuf_init,
.exit = net_mbuf_exit,
};
/* Register net_mbuf_ops as a pernet subsystem.
 *
 * Return: the result of register_pernet_subsys().
 */
int inet_mbuf_init(void)
{
return register_pernet_subsys(&net_mbuf_ops);
}
EXPORT_SYMBOL(inet_mbuf_init);
/* Undo inet_mbuf_init(): unregister net_mbuf_ops */
void inet_mbuf_exit(void)
{
unregister_pernet_subsys(&net_mbuf_ops);
}
EXPORT_SYMBOL(inet_mbuf_exit);

View File

@ -114,6 +114,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/secure_seq.h>
#include <net/netns_mbuf.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
@ -2102,11 +2103,15 @@ static int __init ipv4_proc_init(void)
goto out_udp;
if (ping_proc_init())
goto out_ping;
if (inet_mbuf_init())
goto out_mbuf;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
inet_mbuf_exit();
out_mbuf:
ping_proc_exit();
out_ping:
udp4_proc_exit();